From d2c97a9a5b13536a993dc6b3c271cbebf50032d0 Mon Sep 17 00:00:00 2001
From: Tor Andersson
Date: Tue, 25 May 2010 16:02:17 +0200
Subject: Expand ligature presentation forms into their individual characters when extracting text.

---
 fitz/dev_text.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 69 insertions(+), 6 deletions(-)

(limited to 'fitz')

diff --git a/fitz/dev_text.c b/fitz/dev_text.c
index 02ee9687..0753fe31 100644
--- a/fitz/dev_text.c
+++ b/fitz/dev_text.c
@@ -29,6 +29,7 @@ typedef struct fz_textdevice_s fz_textdevice;
 struct fz_textdevice_s
 {
 	fz_point point;
+	fz_textspan *head;
 	fz_textspan *span;
 };
 
@@ -58,6 +59,37 @@ fz_freetextspan(fz_textspan *span)
 	fz_free(span);
 }
 
+/*
+ * Append character c with bounding box bbox to span, growing the text
+ * array by half (or to 80 entries when cap is 0) once it is nearly full.
+ */
+static void
+fz_addtextcharimp(fz_textspan *span, int c, fz_bbox bbox)
+{
+	if (span->len + 1 >= span->cap)
+	{
+		span->cap = span->cap ? (span->cap * 3) / 2 : 80;
+		span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap);
+	}
+	span->text[span->len].c = c;
+	span->text[span->len].bbox = bbox;
+	span->len ++;
+}
+
+/*
+ * Return strip i (0-based) of the n side-by-side strips that bbox is cut
+ * into along x, using integer arithmetic; n must be non-zero. Only x0 and
+ * x1 are changed. Gives each letter of an expanded ligature its own box.
+ */
+static fz_bbox
+fz_splitbbox(fz_bbox bbox, int i, int n)
+{
+	int w = bbox.x1 - bbox.x0;
+	bbox.x1 = bbox.x0 + w * (i + 1) / n; /* must use x0 before it is moved */
+	bbox.x0 = bbox.x0 + w * i / n;
+	return bbox;
+}
+
 static void
 fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbox)
 {
@@ -78,14 +110,39 @@ fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbo
 		*last = span;
 	}
 
-	if (span->len + 1 >= span->cap)
+	switch (c)
 	{
-		span->cap = span->cap ? (span->cap * 3) / 2 : 80;
-		span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap);
+	case 0xFB00: /* ff */
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 2));
+		break;
+	case 0xFB01: /* fi */
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+		fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 1, 2));
+		break;
+	case 0xFB02: /* fl */
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+		fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 1, 2));
+		break;
+	case 0xFB03: /* ffi */
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3));
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3));
+		fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 2, 3));
+		break;
+	case 0xFB04: /* ffl */
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3));
+		fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3));
+		fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 2, 3));
+		break;
+	case 0xFB05: /* long st */
+	case 0xFB06: /* st */
+		fz_addtextcharimp(span, 's', fz_splitbbox(bbox, 0, 2));
+		fz_addtextcharimp(span, 't', fz_splitbbox(bbox, 1, 2));
+		break;
+	default:
+		fz_addtextcharimp(span, c, bbox);
+		break;
 	}
-	span->text[span->len].c = c;
-	span->text[span->len].bbox = bbox;
-	span->len ++;
 }
 
 static void
@@ -286,7 +343,12 @@ static void
 fz_textfreeuser(void *user)
 {
 	fz_textdevice *tdev = user;
+
 	tdev->span->eol = 1;
+
+	/* TODO: unicode NFC normalization */
+	/* TODO: bidi logical reordering */
+
 	fz_free(tdev);
 }
 
@@ -295,6 +357,7 @@ fz_newtextdevice(fz_textspan *root)
 {
 	fz_device *dev;
 	fz_textdevice *tdev = fz_malloc(sizeof(fz_textdevice));
+	tdev->head = root;
 	tdev->span = root;
 	tdev->point.x = -1;
 	tdev->point.y = -1;
-- 
cgit v1.2.3