summaryrefslogtreecommitdiff
path: root/fitz/dev_text.c
diff options
context:
space:
mode:
author: Tor Andersson <tor@ghostscript.com> 2010-05-25 16:02:17 +0200
committer: Tor Andersson <tor@ghostscript.com> 2010-05-25 16:02:17 +0200
commit: d2c97a9a5b13536a993dc6b3c271cbebf50032d0 (patch)
tree: cdd191127cb6316607203e6de02681cec19655d5 /fitz/dev_text.c
parent: 2475220e88397e0c810d191b24f581be01cefb6a (diff)
download: mupdf-d2c97a9a5b13536a993dc6b3c271cbebf50032d0.tar.xz
Expand ligature presentation forms into their individual characters when extracting text.
Diffstat (limited to 'fitz/dev_text.c')
-rw-r--r-- fitz/dev_text.c 66
1 files changed, 60 insertions, 6 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c
index 02ee9687..0753fe31 100644
--- a/fitz/dev_text.c
+++ b/fitz/dev_text.c
@@ -29,6 +29,7 @@ typedef struct fz_textdevice_s fz_textdevice;
struct fz_textdevice_s
{
fz_point point; /* fz_newtextdevice initializes this to (-1, -1) */
+ fz_textspan *head; /* the root span that was passed to fz_newtextdevice */
fz_textspan *span; /* also starts out as the root span; fz_textfreeuser sets eol on it at teardown */
};
@@ -59,6 +60,28 @@ fz_freetextspan(fz_textspan *span)
}
static void
+/* Append one character code and its bounding box to the end of span,
+ * enlarging the span's text array first when it is (nearly) full. */
+fz_addtextcharimp(fz_textspan *span, int c, fz_bbox bbox)
+{
+	if (span->len + 1 >= span->cap)
+	{
+		/* First allocation gets 80 entries; afterwards grow by 1.5x. */
+		if (span->cap)
+			span->cap = (span->cap * 3) / 2;
+		else
+			span->cap = 80;
+		span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap);
+	}
+
+	/* Fill the next free slot, then account for it. */
+	span->text[span->len].bbox = bbox;
+	span->text[span->len].c = c;
+	span->len += 1;
+}
+
+/*
+ * Return slice i (0-based) of n equal-width horizontal subdivisions of bbox.
+ * Only x0 and x1 are changed; the other fields of bbox are returned as passed in.
+ * fz_addtextchar uses this to give each character of an expanded ligature
+ * its own share of the ligature's bounding box.
+ */
+static fz_bbox
+fz_splitbbox(fz_bbox bbox, int i, int n)
+{
+	int w = bbox.x1 - bbox.x0;
+	/* Compute the right edge first, while bbox.x0 still holds the original
+	 * left edge. Deriving x1 from the already-shifted x0 would push every
+	 * slice with i > 0 past the right edge of the input box. */
+	bbox.x1 = bbox.x0 + w * (i + 1) / n;
+	bbox.x0 = bbox.x0 + w * i / n;
+	return bbox;
+}
+
+static void
fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbox)
{
fz_textspan *span = *last;
@@ -78,14 +101,39 @@ fz_addtextchar(fz_textspan **last, fz_font *font, float size, int c, fz_bbox bbo
*last = span;
}
- if (span->len + 1 >= span->cap)
+ switch (c)
{
- span->cap = span->cap ? (span->cap * 3) / 2 : 80;
- span->text = fz_realloc(span->text, sizeof(fz_textchar) * span->cap);
+ case 0xFB00: /* ff */
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 2));
+ break;
+ case 0xFB01: /* fi */
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+ fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 1, 2));
+ break;
+ case 0xFB02: /* fl */
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 2));
+ fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 1, 2));
+ break;
+ case 0xFB03: /* ffi */
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3));
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3));
+ fz_addtextcharimp(span, 'i', fz_splitbbox(bbox, 2, 3));
+ break;
+ case 0xFB04: /* ffl */
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 0, 3));
+ fz_addtextcharimp(span, 'f', fz_splitbbox(bbox, 1, 3));
+ fz_addtextcharimp(span, 'l', fz_splitbbox(bbox, 2, 3));
+ break;
+ case 0xFB05: /* long st */
+ case 0xFB06: /* st */
+ fz_addtextcharimp(span, 's', fz_splitbbox(bbox, 0, 2));
+ fz_addtextcharimp(span, 't', fz_splitbbox(bbox, 1, 2));
+ break;
+ default:
+ fz_addtextcharimp(span, c, bbox);
+ break;
}
- span->text[span->len].c = c;
- span->text[span->len].bbox = bbox;
- span->len ++;
}
static void
@@ -286,7 +334,12 @@ static void
fz_textfreeuser(void *user)
{
fz_textdevice *tdev = user; /* user is treated as the fz_textdevice state */
+
tdev->span->eol = 1; /* set the eol flag on the device's current span before the device state goes away */
+
+ /* TODO: unicode NFC normalization */
+ /* TODO: bidi logical reordering */
+
fz_free(tdev); /* releases only the device struct; no spans are freed here */
}
@@ -295,6 +348,7 @@ fz_newtextdevice(fz_textspan *root)
{
fz_device *dev;
fz_textdevice *tdev = fz_malloc(sizeof(fz_textdevice));
+ tdev->head = root;
tdev->span = root;
tdev->point.x = -1;
tdev->point.y = -1;