diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2012-03-06 17:41:04 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2012-03-12 18:43:02 +0100 |
commit | 0b0e2af392428b5dbc88d6fbd2f6b5181e85165f (patch) | |
tree | 9668c284b68b912c103f0778ff298aafb4d6453b /fitz/dev_text.c | |
parent | bc1c06ddd335f899025470dde7e839a82d792972 (diff) | |
download | mupdf-0b0e2af392428b5dbc88d6fbd2f6b5181e85165f.tar.xz |
Create style sheet and group extracted text into blocks, lines and spans.
Diffstat (limited to 'fitz/dev_text.c')
-rw-r--r-- | fitz/dev_text.c | 689 |
1 files changed, 503 insertions, 186 deletions
diff --git a/fitz/dev_text.c b/fitz/dev_text.c index 79d8c137..7a56b56c 100644 --- a/fitz/dev_text.c +++ b/fitz/dev_text.c @@ -2,6 +2,7 @@ #define LINE_DIST 0.9f #define SPACE_DIST 0.2f +#define PARAGRAPH_DIST 0.5f #include <ft2build.h> #include FT_FREETYPE_H @@ -11,60 +12,211 @@ typedef struct fz_text_device_s fz_text_device; struct fz_text_device_s { + fz_text_sheet *sheet; + fz_text_page *page; + fz_text_line cur_line; + fz_text_span cur_span; fz_point point; - fz_text_span *head; - fz_text_span *span; }; -fz_text_span * -fz_new_text_span(fz_context *ctx) +fz_text_sheet * +fz_new_text_sheet(fz_context *ctx) { - fz_text_span *span; - span = fz_malloc_struct(ctx, fz_text_span); - span->font = NULL; - span->wmode = 0; - span->size = 0; - span->len = 0; - span->cap = 0; - span->text = NULL; - span->next = NULL; - span->eol = 0; - return span; + fz_text_sheet *sheet = fz_malloc(ctx, sizeof *sheet); + sheet->maxid = 0; + sheet->style = NULL; + return sheet; } void -fz_free_text_span(fz_context *ctx, fz_text_span *span) +fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet) +{ + fz_text_style *style = sheet->style; + while (style) + { + fz_text_style *next = style->next; + fz_drop_font(ctx, style->font); + fz_free(ctx, style); + style = next; + } +} + +static fz_text_style * +fz_find_text_style_imp(fz_context *ctx, fz_text_sheet *sheet, + float size, fz_font *font, int wmode, int script) +{ + fz_text_style *style; + + for (style = sheet->style; style; style = style->next) + { + if (style->font == font && + style->size == size && + style->wmode == wmode && + style->script == script) /* FIXME: others */ + { + return style; + } + } + + /* Better make a new one and add it to our list */ + style = fz_malloc(ctx, sizeof *style); + style->id = sheet->maxid++; + style->font = fz_keep_font(ctx, font); + style->size = size; + style->wmode = wmode; + style->script = script; + style->next = sheet->style; + sheet->style = style; + return style; +} + +static fz_text_style * +fz_find_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, fz_matrix *ctm, + fz_colorspace *colorspace, float *color, float alpha, fz_stroke_state *stroke) { - fz_text_span *next; + float size = 1.0f; + fz_font *font = text ? text->font : NULL; + int wmode = text ? text->wmode : 0; + if (ctm && text) + { + fz_matrix tm = text->trm; + fz_matrix trm; + tm.e = 0; + tm.f = 0; + trm = fz_concat(tm, *ctm); + size = fz_matrix_expansion(trm); + } + return fz_find_text_style_imp(ctx, sheet, size, font, wmode, 0); +} - while (span) +fz_text_page * +fz_new_text_page(fz_context *ctx, fz_rect mediabox) +{ + fz_text_page *page = fz_malloc(ctx, sizeof(*page)); + page->mediabox = mediabox; + page->len = 0; + page->cap = 0; + page->blocks = NULL; + return page; +} + +void +fz_free_text_page(fz_context *ctx, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + for (block = page->blocks; block < page->blocks + page->len; block++) { - if (span->font) - fz_drop_font(ctx, span->font); - next = span->next; - fz_free(ctx, span->text); - fz_free(ctx, span); - span = next; + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + { + fz_free(ctx, span->text); + } + fz_free(ctx, line->spans); + } + fz_free(ctx, block->lines); } + fz_free(ctx, page->blocks); + fz_free(ctx, page); } static void -fz_add_text_char_imp(fz_context *ctx, fz_text_span *span, int c, fz_bbox bbox) +append_char(fz_context *ctx, fz_text_span *span, int c, fz_rect bbox) { - if (span->len + 1 >= span->cap) + if (span->len == span->cap) { - span->cap = span->cap > 1 ? (span->cap * 3) / 2 : 80; - span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(fz_text_char)); + span->cap = MAX(64, span->cap * 2); + span->text = fz_resize_array(ctx, span->text, span->cap, sizeof(*span->text)); } + span->bbox = fz_union_rect(span->bbox, bbox); span->text[span->len].c = c; span->text[span->len].bbox = bbox; - span->len ++; + span->len++; +} + +static void +init_span(fz_context *ctx, fz_text_span *span, fz_text_style *style) +{ + span->style = style; + span->bbox = fz_empty_rect; + span->len = span->cap = 0; + span->text = NULL; +} + +static void +append_span(fz_context *ctx, fz_text_line *line, fz_text_span *span) +{ + if (line->len == line->cap) + { + line->cap = MAX(8, line->cap * 2); + line->spans = fz_resize_array(ctx, line->spans, line->cap, sizeof(*line->spans)); + } + line->bbox = fz_union_rect(line->bbox, span->bbox); + line->spans[line->len++] = *span; } -static fz_bbox -fz_split_bbox(fz_bbox bbox, int i, int n) +static void +init_line(fz_context *ctx, fz_text_line *line) +{ + line->bbox = fz_empty_rect; + line->len = line->cap = 0; + line->spans = NULL; +} + +static void +append_line(fz_context *ctx, fz_text_block *block, fz_text_line *line) { - float w = (float)(bbox.x1 - bbox.x0) / n; + if (block->len == block->cap) + { + block->cap = MAX(16, block->cap * 2); + block->lines = fz_resize_array(ctx, block->lines, block->cap, sizeof *block->lines); + } + block->bbox = fz_union_rect(block->bbox, line->bbox); + block->lines[block->len++] = *line; +} + +static fz_text_block * +find_block_for_line(fz_context *ctx, fz_text_page *page, fz_text_line *line) +{ + float size = line->len > 0 && line->spans[0].len > 0 ? line->spans[0].style->size : 1; + int i; + + for (i = 0; i < page->len; i++) + { + fz_text_block *block = page->blocks + i; + int w = block->bbox.x1 - block->bbox.x0; + if (block->bbox.y0 - line->bbox.y1 < size * PARAGRAPH_DIST) + if (line->bbox.x0 < block->bbox.x1 && line->bbox.x1 > block->bbox.x0) + if (ABS(line->bbox.x0 - block->bbox.x0) < w / 4) + return block; + } + + if (page->len == page->cap) + { + page->cap = MAX(16, page->cap * 2); + page->blocks = fz_resize_array(ctx, page->blocks, page->cap, sizeof(*page->blocks)); + } + + page->blocks[page->len].bbox = fz_empty_rect; + page->blocks[page->len].len = 0; + page->blocks[page->len].cap = 0; + page->blocks[page->len].lines = NULL; + + return &page->blocks[page->len++]; +} + +static void +insert_line(fz_context *ctx, fz_text_page *page, fz_text_line *line) +{ + append_line(ctx, find_block_for_line(ctx, page, line), line); +} + +static fz_rect +fz_split_bbox(fz_rect bbox, int i, int n) +{ + float w = (bbox.x1 - bbox.x0) / n; float x0 = bbox.x0; bbox.x0 = x0 + i * w; bbox.x1 = x0 + (i + 1) * w; @@ -72,154 +224,71 @@ fz_split_bbox(fz_bbox bbox, int i, int n) } static void -fz_add_text_char(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode, int c, fz_bbox bbox) +fz_flush_text_line(fz_context *ctx, fz_text_device *dev, fz_text_style *style) { - fz_text_span *span = *last; - - if (!span->font) - { - span->font = fz_keep_font(ctx, font); - span->size = size; - } + append_span(ctx, &dev->cur_line, &dev->cur_span); + insert_line(ctx, dev->page, &dev->cur_line); + init_span(ctx, &dev->cur_span, style); + init_line(ctx, &dev->cur_line); +} - if ((span->font != font || span->size != size || span->wmode != wmode) && c != 32) +static void +fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox) +{ + if (!dev->cur_span.style) + dev->cur_span.style = style; + if (style != dev->cur_span.style) { - span = fz_new_text_span(ctx); - span->font = fz_keep_font(ctx, font); - span->size = size; - span->wmode = wmode; - (*last)->next = span; - *last = span; + append_span(ctx, &dev->cur_line, &dev->cur_span); + init_span(ctx, &dev->cur_span, style); } + append_char(ctx, &dev->cur_span, c, bbox); +} +static void +fz_add_text_char(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox) +{ switch (c) { case -1: /* ignore when one unicode character maps to multiple glyphs */ break; case 0xFB00: /* ff */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 2)); break; case 0xFB01: /* fi */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 1, 2)); break; case 0xFB02: /* fl */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 1, 2)); break; case 0xFB03: /* ffi */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3)); - fz_add_text_char_imp(ctx, span, 'i', fz_split_bbox(bbox, 2, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3)); + fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 2, 3)); break; case 0xFB04: /* ffl */ - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 0, 3)); - fz_add_text_char_imp(ctx, span, 'f', fz_split_bbox(bbox, 1, 3)); - fz_add_text_char_imp(ctx, span, 'l', fz_split_bbox(bbox, 2, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3)); + fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3)); + fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 2, 3)); break; case 0xFB05: /* long st */ case 0xFB06: /* st */ - fz_add_text_char_imp(ctx, span, 's', fz_split_bbox(bbox, 0, 2)); - fz_add_text_char_imp(ctx, span, 't', fz_split_bbox(bbox, 1, 2)); + fz_add_text_char_imp(ctx, dev, style, 's', fz_split_bbox(bbox, 0, 2)); + fz_add_text_char_imp(ctx, dev, style, 't', fz_split_bbox(bbox, 1, 2)); break; default: - fz_add_text_char_imp(ctx, span, c, bbox); + fz_add_text_char_imp(ctx, dev, style, c, bbox); break; } } static void -fz_divide_text_chars(fz_text_span **last, int n, fz_bbox bbox) -{ - fz_text_span *span = *last; - int i, x; - x = span->len - n; - if (x >= 0) - for (i = 0; i < n; i++) - span->text[x + i].bbox = fz_split_bbox(bbox, i, n); -} - -static void -fz_add_text_newline(fz_context *ctx, fz_text_span **last, fz_font *font, float size, int wmode) -{ - fz_text_span *span; - span = fz_new_text_span(ctx); - span->font = fz_keep_font(ctx, font); - span->size = size; - span->wmode = wmode; - (*last)->eol = 1; - (*last)->next = span; - *last = span; -} - -void -fz_debug_text_span_xml(fz_text_span *span) -{ - char buf[10]; - int c, n, k, i; - - while (span) - { - printf("<span font=\"%s\" size=\"%g\" wmode=\"%d\" eol=\"%d\">\n", - span->font ? span->font->name : "NULL", span->size, span->wmode, span->eol); - - for (i = 0; i < span->len; i++) - { - printf("\t<char ucs=\""); - c = span->text[i].c; - if (c < 128) - putchar(c); - else - { - n = fz_runetochar(buf, c); - for (k = 0; k < n; k++) - putchar(buf[k]); - } - printf("\" bbox=\"%d %d %d %d\" />\n", - span->text[i].bbox.x0, - span->text[i].bbox.y0, - span->text[i].bbox.x1, - span->text[i].bbox.y1); - } - - printf("</span>\n"); - - span = span->next; - } -} - -void -fz_debug_text_span(fz_text_span *span) -{ - char buf[10]; - int c, n, k, i; - - while (span) - { - for (i = 0; i < span->len; i++) - { - c = span->text[i].c; - if (c < 128) - putchar(c); - else - { - n = fz_runetochar(buf, c); - for (k = 0; k < n; k++) - putchar(buf[k]); - } - } - - if (span->eol) - putchar('\n'); - - span = span->next; - } -} - -static void -fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_matrix ctm, fz_point *pen) +fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, fz_matrix ctm, fz_text_style *style) { + fz_point *pen = &dev->point; fz_font *font = text->font; FT_Face face = font->ft_face; fz_matrix tm = text->trm; @@ -233,19 +302,20 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat float ascender = 1; float descender = 0; int multi; - int i, err; + int i, j, err; if (text->len == 0) return; - fz_lock(ctx, FZ_LOCK_FREETYPE); if (font->ft_face) { + fz_lock(ctx, FZ_LOCK_FREETYPE); err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72); if (err) fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); ascender = (float)face->ascender / face->units_per_EM; descender = (float)face->descender / face->units_per_EM; + fz_unlock(ctx, FZ_LOCK_FREETYPE); } rect = fz_empty_rect; @@ -264,6 +334,7 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat tm.e = 0; tm.f = 0; trm = fz_concat(tm, ctm); + dir = fz_transform_vector(trm, dir); dist = sqrtf(dir.x * dir.x + dir.y * dir.y); ndir.x = dir.x / dist; @@ -271,19 +342,10 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat size = fz_matrix_expansion(trm); - multi = 1; + int lastchar = ' '; for (i = 0; i < text->len; i++) { - if (text->items[i].gid < 0) - { - fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect)); - multi ++; - fz_divide_text_chars(last, multi, fz_round_rect(rect)); - continue; - } - multi = 1; - /* Calculate new pen location and delta */ tm.e = text->items[i].x; tm.f = text->items[i].y; @@ -305,20 +367,19 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat if (dist > size * LINE_DIST) { - fz_add_text_newline(ctx, last, font, size, text->wmode); + fz_flush_text_line(ctx, dev, style); + lastchar = ' '; } - else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST) + else if (fabsf(dot) > 0.95f && dist > size * SPACE_DIST && lastchar != ' ') { - if ((*last)->len > 0 && (*last)->text[(*last)->len - 1].c != ' ') - { - fz_rect spacerect; - spacerect.x0 = -0.2f; - spacerect.y0 = 0; - spacerect.x1 = 0; - spacerect.y1 = 1; - spacerect = fz_transform_rect(trm, spacerect); - fz_add_text_char(ctx, last, font, size, text->wmode, ' ', fz_round_rect(spacerect)); - } + fz_rect spacerect; + spacerect.x0 = -0.2f; + spacerect.y0 = 0; + spacerect.x1 = 0; + spacerect.y1 = 1; + spacerect = fz_transform_rect(trm, spacerect); + fz_add_text_char(ctx, dev, style, ' ', spacerect); + lastchar = ' '; } } @@ -331,8 +392,13 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat /* TODO: freetype returns broken vertical metrics */ /* if (text->wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; */ + fz_lock(ctx, FZ_LOCK_FREETYPE); + err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72); + if (err) + fz_warn(ctx, "freetype set character size: %s", ft_error_string(err)); FT_Get_Advance(font->ft_face, text->items[i].gid, mask, &ftadv); adv = ftadv / 65536.0f; + fz_unlock(ctx, FZ_LOCK_FREETYPE); rect.x0 = 0; rect.y0 = descender; @@ -352,9 +418,27 @@ fz_text_extract_span(fz_context *ctx, fz_text_span **last, fz_text *text, fz_mat pen->x = trm.e + dir.x * adv; pen->y = trm.f + dir.y * adv; - fz_add_text_char(ctx, last, font, size, text->wmode, text->items[i].ucs, fz_round_rect(rect)); + /* Check for one glyph to many char mapping */ + for (j = i + 1; j < text->len; j++) + if (text->items[j].gid >= 0) + break; + multi = j - i; + + if (multi == 1) + { + fz_add_text_char(ctx, dev, style, text->items[i].ucs, rect); + } + else + { + for (j = 0; j < multi; j++) + { + fz_rect part = fz_split_bbox(rect, j, multi); + fz_add_text_char(ctx, dev, style, text->items[i].ucs, part); + } + } + + lastchar = text->items[i].ucs; } - fz_unlock(ctx, FZ_LOCK_FREETYPE); } static void @@ -362,7 +446,9 @@ fz_text_fill_text(fz_device *dev, fz_text *text, fz_matrix ctm, fz_colorspace *colorspace, float *color, float alpha) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, NULL); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void @@ -370,36 +456,57 @@ fz_text_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_m fz_colorspace *colorspace, float *color, float alpha) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, colorspace, color, alpha, stroke); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_clip_text(fz_device *dev, fz_text *text, fz_matrix ctm, int accumulate) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_clip_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, fz_matrix ctm) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, stroke); + fz_text_extract(dev->ctx, tdev, text, ctm, style); } static void fz_text_ignore_text(fz_device *dev, fz_text *text, fz_matrix ctm) { fz_text_device *tdev = dev->user; - fz_text_extract_span(dev->ctx, &tdev->span, text, ctm, &tdev->point); + fz_text_style *style; + style = fz_find_text_style(dev->ctx, tdev->sheet, text, &ctm, NULL, NULL, 0, NULL); + fz_text_extract(dev->ctx, tdev, text, ctm, style); +} + +static int cmp_block(const void *av, const void *bv) +{ + const fz_text_block *a = av; + const fz_text_block *b = bv; + int x = a->bbox.x0 - b->bbox.x0; + if (x) return x; + return -(a->bbox.y0 - b->bbox.y0); } static void fz_text_free_user(fz_device *dev) { + fz_context *ctx = dev->ctx; fz_text_device *tdev = dev->user; - tdev->span->eol = 1; + append_span(ctx, &tdev->cur_line, &tdev->cur_span); + insert_line(ctx, tdev->page, &tdev->cur_line); + + qsort(tdev->page->blocks, tdev->page->len, sizeof *tdev->page->blocks, cmp_block); /* TODO: unicode NFC normalization */ /* TODO: bidi logical reordering */ @@ -408,15 +515,19 @@ fz_text_free_user(fz_device *dev) } fz_device * -fz_new_text_device(fz_context *ctx, fz_text_span *root) +fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page) { fz_device *dev; + fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device); - tdev->head = root; - tdev->span = root; + tdev->sheet = sheet; + tdev->page = page; tdev->point.x = -1; tdev->point.y = -1; + init_line(ctx, &tdev->cur_line); + init_span(ctx, &tdev->cur_span, NULL); + dev = fz_new_device(ctx, tdev); dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE; dev->free_user = fz_text_free_user; @@ -427,3 +538,209 @@ fz_new_text_device(fz_context *ctx, fz_text_span *root) dev->ignore_text = fz_text_ignore_text; return dev; } + +/* XML, HTML and plain-text output */ + +static int font_is_bold(fz_font *font) +{ + FT_Face face = font->ft_face; + if (face && (face->style_flags & FT_STYLE_FLAG_BOLD)) + return 1; + if (strstr(font->name, "Bold")) + return 1; + return 0; +} + +static int font_is_italic(fz_font *font) +{ + FT_Face face = font->ft_face; + if (face && (face->style_flags & FT_STYLE_FLAG_ITALIC)) + return 1; + if (strstr(font->name, "Italic") || strstr(font->name, "Oblique")) + return 1; + return 0; +} + +static void +fz_print_style_begin(FILE *out, fz_text_style *style) +{ + int script = style->script; + fprintf(out, "<span class=\"s%d\">", style->id); + while (script-- > 0) + fprintf(out, "<sup>"); + while (++script < 0) + fprintf(out, "<sub>"); +} + +static void +fz_print_style_end(FILE *out, fz_text_style *style) +{ + int script = style->script; + while (script-- > 0) + fprintf(out, "</sup>"); + while (++script < 0) + fprintf(out, "</sub>"); + fprintf(out, "</span>"); +} + +static void +fz_print_style(FILE *out, fz_text_style *style) +{ + char *s = strchr(style->font->name, '+'); + s = s ? s + 1 : style->font->name; + fprintf(out, "span.s%d{font-family:\"%s\";font-size:%gpt;", + style->id, s, style->size); + if (font_is_italic(style->font)) + fprintf(out, "font-style:italic;"); + if (font_is_bold(style->font)) + fprintf(out, "font-weight:bold;"); + fprintf(out, "}\n"); +} + +void +fz_print_text_sheet(FILE *out, fz_text_sheet *sheet) +{ + fz_text_style *style; + for (style = sheet->style; style; style = style->next) + fz_print_style(out, style); +} + +void +fz_print_text_page_html(FILE *out, fz_text_page *page) +{ + int block_n, line_n, span_n, ch_n; + fz_text_style *style = NULL; + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + + fprintf(out, "<div class=\"page\">\n"); + + for (block_n = 0; block_n < page->len; block_n++) + { + block = &page->blocks[block_n]; + fprintf(out, "<div class=\"block\">\n"); + for (line_n = 0; line_n < block->len; line_n++) + { + line = &block->lines[line_n]; + fprintf(out, "<p>"); + style = NULL; + + for (span_n = 0; span_n < line->len; span_n++) + { + span = &line->spans[span_n]; + if (style != span->style) + { + if (style != NULL) + fz_print_style_end(out, style); + fz_print_style_begin(out, span->style); + style = span->style; + } + + for (ch_n = 0; ch_n < span->len; ch_n++) + { + fz_text_char *ch = &span->text[ch_n]; + if (ch->c == '<') + fprintf(out, "<"); + else if (ch->c == '>') + fprintf(out, ">"); + else if (ch->c == '&') + fprintf(out, "&"); + else if (ch->c >= 32 && ch->c <= 127) + fprintf(out, "%c", ch->c); + else + fprintf(out, "&#x%x;", ch->c); + } + } + fz_print_style_end(out, style); + fprintf(out, "</p>\n"); + } + fprintf(out, "</div>\n"); + } + + fprintf(out, "</div>\n"); +} + +void +fz_print_text_page_xml(FILE *out, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + fz_text_char *ch; + char *s; + + fprintf(out, "<page>\n"); + for (block = page->blocks; block < page->blocks + page->len; block++) + { + fprintf(out, "<block bbox=\"%g %g %g %g\">\n", + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); + for (line = block->lines; line < block->lines + block->len; line++) + { + fprintf(out, "<line bbox=\"%g %g %g %g\">\n", + line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); + for (span = line->spans; span < line->spans + line->len; span++) + { + fz_text_style *style = span->style; + s = strchr(style->font->name, '+'); + s = s ? s + 1 : style->font->name; + fprintf(out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n", + span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1, + s, style->size); + for (ch = span->text; ch < span->text + span->len; ch++) + { + fprintf(out, "<char bbox=\"%g %g %g %g\" c=\"", + ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1); + switch (ch->c) + { + case '<': fprintf(out, "<"); break; + case '>': fprintf(out, ">"); break; + case '&': fprintf(out, "&"); break; + case '"': fprintf(out, """); break; + case '\'': fprintf(out, "'"); break; + default: + if (ch->c >= 32 && ch->c <= 127) + fprintf(out, "%c", ch->c); + else + fprintf(out, "&#x%x;", ch->c); + break; + } + fprintf(out, "\"/>\n"); + } + fprintf(out, "</span>\n"); + } + fprintf(out, "</line>\n"); + } + fprintf(out, "</block>\n"); + } + fprintf(out, "</page>\n"); +} + +void +fz_print_text_page(FILE *out, fz_text_page *page) +{ + fz_text_block *block; + fz_text_line *line; + fz_text_span *span; + fz_text_char *ch; + char utf[10]; + int i, n; + + for (block = page->blocks; block < page->blocks + page->len; block++) + { + for (line = block->lines; line < block->lines + block->len; line++) + { + for (span = line->spans; span < line->spans + line->len; span++) + { + for (ch = span->text; ch < span->text + span->len; ch++) + { + n = fz_runetochar(utf, ch->c); + for (i = 0; i < n; i++) + putc(utf[i], out); + } + } + fprintf(out, "\n"); + } + fprintf(out, "\n"); + } +} |