diff options
author | Tor Andersson <tor@ghostscript.com> | 2010-05-25 16:02:17 +0200 |
---|---|---|
committer | Tor Andersson <tor@ghostscript.com> | 2010-05-25 16:02:17 +0200 |
commit | d2c97a9a5b13536a993dc6b3c271cbebf50032d0 (patch) | |
tree | cdd191127cb6316607203e6de02681cec19655d5 /fitz | |
parent | 2475220e88397e0c810d191b24f581be01cefb6a (diff) | |
download | mupdf-d2c97a9a5b13536a993dc6b3c271cbebf50032d0.tar.xz |
Expand ligature presentation forms into their individual characters when extracting text.
Diffstat (limited to 'fitz')
-rw-r--r-- | fitz/dev_text.c | 66 |
1 files changed, 60 insertions, 6 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c index 02ee9687..0753fe31 100644 --- a/fitz/dev_text.c +++ b/fitz/dev_text.c @@ -29,6 +29,7 @@ typedef struct fz_textdevice_s fz_textdevice; struct fz_textdevice_s { fz_point point; + fz_textspan *head; /* set to root in fz_newtextdevice; not read in the hunks shown here */ fz_textspan *span; }; @@ -59,6 +60,28 @@ fz_freetextspan(fz_textspan *span) } /* fz_addtextcharimp: append character c with bounding box bbox to span->text. The array is grown (to 80 entries first, then by a factor of 3/2) whenever len + 1 reaches cap. NOTE(review): the fz_realloc result is stored without a check; presumably fz_realloc handles failure itself - confirm. */ static void +fz_addtextcharimp(fz_textspan *span, int c, fz_bbox bbox) +{ + if (span->len + 1 >= span->cap) + { + span->cap = span->cap ? (span->cap * 3) / 2 : 80; + span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap); + } + span->text[span->len].c = c; + span->text[span->len].bbox = bbox; + span->len ++; +} + /* fz_splitbbox: return piece i (0-based) of n of bbox along the x axis; only x0 and x1 are changed. NOTE(review): x1 is computed from the already-updated x0, so for i > 0 the right edge lands w * i / n too far right and can extend past the original bbox.x1. Computing both edges from the original x0 would avoid this. */ +static fz_bbox +fz_splitbbox(fz_bbox bbox, int i, int n) +{ + int w = bbox.x1 - bbox.x0; + bbox.x0 = bbox.x0 + w * i / n; + bbox.x1 = bbox.x0 + w * (i + 1) / n; + return bbox; +} + +static void fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbox) { fz_textspan *span = *last; @@ -78,14 +101,39 @@ fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbo *last = span; } - if (span->len + 1 >= span->cap) + switch (c) { - span->cap = span->cap ? 
(span->cap * 3) / 2 : 80; - span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap); /* Code points 0xFB00 to 0xFB06 (Latin ligature presentation forms) are emitted as their component letters, each with the piece of the ligature bbox returned by fz_splitbbox; every other code point is appended unchanged with the full bbox. */ + case 0xFB00: /* ff */ + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2)); + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 2)); + break; + case 0xFB01: /* fi */ + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2)); + fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 1, 2)); + break; + case 0xFB02: /* fl */ + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2)); + fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 1, 2)); + break; + case 0xFB03: /* ffi */ + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3)); + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3)); + fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 2, 3)); + break; + case 0xFB04: /* ffl */ + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3)); + fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3)); + fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 2, 3)); + break; + case 0xFB05: /* long st */ + case 0xFB06: /* st */ + fz_addtextcharimp(span, 's', fz_splitbbox(bbox, 0, 2)); + fz_addtextcharimp(span, 't', fz_splitbbox(bbox, 1, 2)); + break; + default: + fz_addtextcharimp(span, c, bbox); + break; } - span->text[span->len].c = c; - span->text[span->len].bbox = bbox; - span->len ++; } static void @@ -286,7 +334,12 @@ static void fz_textfreeuser(void *user) { fz_textdevice *tdev = user; /* set the eol field on the span the device currently points at, then free the device struct itself */ + tdev->span->eol = 1; + + /* TODO: unicode NFC normalization */ + /* TODO: bidi logical reordering */ + fz_free(tdev); } @@ -295,6 +348,7 @@ fz_newtextdevice(fz_textspan *root) { fz_device *dev; fz_textdevice *tdev = fz_malloc(sizeof(fz_textdevice)); + tdev->head = root; tdev->span = root; tdev->point.x = -1; tdev->point.y = -1; |