From 626ea2ea771735492c9a4350ae02b26ea09d1423 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Tue, 1 Aug 2017 18:15:23 +0200 Subject: Simplify stext structure and device. * Use pool allocator and linked lists for all levels. * Remove separate fz_stext_sheet struct. * Remove unused 'script' style. * Remove 'span' level items. * Detect visual/logical RTL layouts. * Detect indented paragraphs. --- source/fitz/font.c | 22 + source/fitz/stext-device.c | 1047 +++++++++------------------- source/fitz/stext-output.c | 386 +++++------ source/fitz/stext-paragraph.c | 1538 ----------------------------------------- source/fitz/stext-search.c | 137 ++-- source/fitz/util.c | 109 +-- source/tools/mudraw.c | 10 +- source/tools/murun.c | 16 +- 8 files changed, 613 insertions(+), 2652 deletions(-) delete mode 100644 source/fitz/stext-paragraph.c (limited to 'source') diff --git a/source/fitz/font.c b/source/fitz/font.c index eb7c8c35..dfe4ab24 100644 --- a/source/fitz/font.c +++ b/source/fitz/font.c @@ -193,6 +193,28 @@ fz_set_font_bbox(fz_context *ctx, fz_font *font, float xmin, float ymin, float x } } +float fz_font_ascender(fz_context *ctx, fz_font *font) +{ + if (font->t3procs) + return font->bbox.y1; + else + { + FT_Face face = font->ft_face; + return (float)face->ascender / face->units_per_EM; + } +} + +float fz_font_descender(fz_context *ctx, fz_font *font) +{ + if (font->t3procs) + return font->bbox.y0; + else + { + FT_Face face = font->ft_face; + return (float)face->descender / face->units_per_EM; + } +} + /* * Freetype hooks */ diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c index 73fa309e..166f5aa0 100644 --- a/source/fitz/stext-device.c +++ b/source/fitz/stext-device.c @@ -4,36 +4,25 @@ #include #include -/* Extract text into an unsorted span soup. */ +#include /* for debug printing */ + +/* Extract text into blocks and lines. 
*/ #define LINE_DIST 0.9f #define SPACE_DIST 0.15f #define SPACE_MAX_DIST 0.8f #define PARAGRAPH_DIST 0.5f -#include /* for debug printing */ -#undef DEBUG_SPANS -#undef DEBUG_INTERNALS -#undef DEBUG_LINE_HEIGHTS -#undef DEBUG_MASKS -#undef DEBUG_ALIGN -#undef DEBUG_INDENTS - -#include -#include FT_FREETYPE_H -#include FT_ADVANCES_H - typedef struct fz_stext_device_s fz_stext_device; -typedef struct span_soup_s span_soup; - struct fz_stext_device_s { fz_device super; - fz_stext_sheet *sheet; fz_stext_page *page; - span_soup *spans; - fz_stext_span *cur_span; + fz_point pen, start; + fz_matrix trm; + int new_obj; + int curdir; int lastchar; int flags; }; @@ -42,553 +31,235 @@ const char *fz_stext_options_usage = "Structured text output options:\n" "\tpreserve-ligatures: do not expand all ligatures into constituent characters\n" "\tpreserve-whitespace: do not convert all whitespace characters into spaces\n" + "\tpreserve-images: keep images in output\n" "\n"; -static fz_rect * -add_point_to_rect(fz_rect *a, const fz_point *p) +fz_rect * +fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch) { - if (p->x < a->x0) - a->x0 = p->x; - if (p->x > a->x1) - a->x1 = p->x; - if (p->y < a->y0) - a->y0 = p->y; - if (p->y > a->y1) - a->y1 = p->y; - return a; + *bbox = ch->bbox; + return bbox; } -fz_rect * -fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int i) +fz_stext_page * +fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox) { - fz_point a, d; - const fz_point *max; - fz_stext_char *ch; - - if (!span || i >= span->len) - { - *bbox = fz_empty_rect; - return bbox; - } - ch = &span->text[i]; - if (i == span->len-1) - max = &span->max; - else - max = &span->text[i+1].p; - if (span->wmode == 0) + fz_pool *pool = fz_new_pool(ctx); + fz_stext_page *page; + fz_try(ctx) { - a.x = 0; - a.y = span->ascender_max; - d.x = 0; - d.y = span->descender_min; + page = fz_pool_alloc(ctx, pool, sizeof(*page)); + page->pool = 
pool; + page->mediabox = *mediabox; + page->first_block = NULL; + page->last_block = NULL; } - else + fz_catch(ctx) { - a.x = span->ascender_max; - a.y = 0; - d.x = span->descender_min; - d.y = 0; + fz_drop_pool(ctx, pool); + fz_rethrow(ctx); } - fz_transform_vector(&a, &span->transform); - fz_transform_vector(&d, &span->transform); - bbox->x0 = bbox->x1 = ch->p.x + a.x; - bbox->y0 = bbox->y1 = ch->p.y + a.y; - a.x += max->x; - a.y += max->y; - add_point_to_rect(bbox, &a); - a.x = ch->p.x + d.x; - a.y = ch->p.y + d.y; - add_point_to_rect(bbox, &a); - a.x = max->x + d.x; - a.y = max->y + d.y; - add_point_to_rect(bbox, &a); - return bbox; + return page; } -static void -add_bbox_to_span(fz_stext_span *span) +void +fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) { - fz_point a, d; - fz_rect *bbox = &span->bbox; - - if (!span) - return; - if (span->wmode == 0) - { - a.x = 0; - a.y = span->ascender_max; - d.x = 0; - d.y = span->descender_min; - } - else + if (page) { - a.x = span->ascender_max; - a.y = 0; - d.x = span->descender_min; - d.y = 0; + fz_stext_block *block; + for (block = page->first_block; block; block = block->next) + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_drop_image(ctx, block->u.i.image); + fz_drop_pool(ctx, page->pool); } - fz_transform_vector(&a, &span->transform); - fz_transform_vector(&d, &span->transform); - bbox->x0 = bbox->x1 = span->min.x + a.x; - bbox->y0 = bbox->y1 = span->min.y + a.y; - a.x += span->max.x; - a.y += span->max.y; - add_point_to_rect(bbox, &a); - a.x = span->min.x + d.x; - a.y = span->min.y + d.y; - add_point_to_rect(bbox, &a); - a.x = span->max.x + d.x; - a.y = span->max.y + d.y; - add_point_to_rect(bbox, &a); } -struct span_soup_s -{ - int len, cap; - fz_stext_span **spans; -}; - -static span_soup * -new_span_soup(fz_context *ctx) +static fz_stext_block * +add_block_to_page(fz_context *ctx, fz_stext_page *page) { - span_soup *soup = fz_malloc_struct(ctx, span_soup); - soup->len = 0; - soup->cap = 0; - soup->spans 
= NULL; - return soup; + fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); + if (!page->first_block) + page->first_block = page->last_block = block; + else + { + page->last_block->next = block; + page->last_block = block; + } + return block; } -static void -free_span_soup(fz_context *ctx, span_soup *soup) +static fz_stext_block * +add_text_block_to_page(fz_context *ctx, fz_stext_page *page) { - int i; - - if (soup == NULL) - return; - for (i = 0; i < soup->len; i++) - { - fz_free(ctx, soup->spans[i]); - } - fz_free(ctx, soup->spans); - fz_free(ctx, soup); + fz_stext_block *block = add_block_to_page(ctx, page); + block->type = FZ_STEXT_BLOCK_TEXT; + return block; } -static void -add_span_to_soup(fz_context *ctx, span_soup *soup, fz_stext_span *span) +static fz_stext_block * +add_image_block_to_page(fz_context *ctx, fz_stext_page *page, const fz_matrix *ctm, fz_image *image) { - if (span == NULL) - return; - if (soup->len == soup->cap) - { - int newcap = (soup->cap ? 
soup->cap * 2 : 16); - soup->spans = fz_resize_array(ctx, soup->spans, newcap, sizeof(*soup->spans)); - soup->cap = newcap; - } - add_bbox_to_span(span); - soup->spans[soup->len++] = span; + fz_stext_block *block = add_block_to_page(ctx, page); + block->type = FZ_STEXT_BLOCK_IMAGE; + block->u.i.transform = *ctm; + block->u.i.image = fz_keep_image(ctx, image); + block->bbox.x0 = 0; + block->bbox.y0 = 0; + block->bbox.x1 = 1; + block->bbox.y1 = 1; + fz_transform_rect(&block->bbox, ctm); + return block; } static fz_stext_line * -push_span(fz_context *ctx, fz_stext_device *tdev, fz_stext_span *span, int new_line, float distance) +add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, int wmode) { - fz_stext_line *line; - fz_stext_block *block; - fz_stext_page *page = tdev->page; - int prev_not_text = 0; - - if (page->len == 0 || page->blocks[page->len-1].type != FZ_PAGE_BLOCK_TEXT) - prev_not_text = 1; - - if (new_line || prev_not_text) - { - float size = fz_matrix_expansion(&span->transform); - /* So, a new line. Part of the same block or not? */ - if (distance == 0 || distance > size * 1.5f || distance < -size * PARAGRAPH_DIST || page->len == 0 || prev_not_text) - { - /* New block */ - if (page->len == page->cap) - { - int newcap = (page->cap ? page->cap*2 : 4); - page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); - page->cap = newcap; - } - block = fz_malloc_struct(ctx, fz_stext_block); - page->blocks[page->len].type = FZ_PAGE_BLOCK_TEXT; - page->blocks[page->len].u.text = block; - block->cap = 0; - block->len = 0; - block->lines = 0; - block->bbox = fz_empty_rect; - page->len++; - distance = 0; - } - - /* New line */ - block = page->blocks[page->len-1].u.text; - if (block->len == block->cap) - { - int newcap = (block->cap ? 
block->cap*2 : 4); - block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines)); - block->cap = newcap; - } - block->lines[block->len].first_span = NULL; - block->lines[block->len].last_span = NULL; - block->lines[block->len].distance = distance; - block->lines[block->len].bbox = fz_empty_rect; - block->len++; - } - - /* Find last line and append to it */ - block = page->blocks[page->len-1].u.text; - line = &block->lines[block->len-1]; - - fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox); - fz_union_rect(&block->bbox, &span->bbox); - span->base_offset = (new_line ? 0 : distance); - - if (!line->first_span) - { - line->first_span = line->last_span = span; - span->next = NULL; - } + fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); + if (!block->u.t.first_line) + block->u.t.first_line = block->u.t.last_line = line; else { - line->last_span->next = span; - line->last_span = span; + block->u.t.last_line->next = line; + block->u.t.last_line = line; } + line->wmode = wmode; + return line; } -#if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS) -static void -dump_span(fz_stext_span *s) +static float min4(float a, float b, float c, float d) { - int i; - for (i=0; i < s->len; i++) - { - printf("%c", s->text[i].c); - } + return fz_min(fz_min(a, b), fz_min(c, d)); } -#endif -#ifdef DEBUG_ALIGN -static void -dump_line(fz_stext_line *line) +static float max4(float a, float b, float c, float d) { - int i; - for (i=0; i < line->len; i++) - { - fz_stext_span *s = line->spans[i]; - if (s->spacing > 1) - printf(" "); - dump_span(s); - } - printf("\n"); + return fz_max(fz_max(a, b), fz_max(c, d)); } -#endif -static void -strain_soup(fz_context *ctx, fz_stext_device *tdev) +static fz_stext_char * +add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, const fz_matrix *trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int rtl) { - span_soup *soup = tdev->spans; 
- fz_stext_line *last_line = NULL; - fz_stext_span *last_span = NULL; - int span_num; - - if (soup == NULL) - return; + fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); + fz_point a, d; - /* Really dumb implementation to match what we had before */ - for (span_num=0; span_num < soup->len; span_num++) + if (!line->first_char) + line->first_char = line->last_char = ch; + else { - fz_stext_span *span = soup->spans[span_num]; - int new_line = 1; - float distance = 0; - float spacing = 0; - soup->spans[span_num] = NULL; - if (last_span) - { - /* If we have a last_span, we must have a last_line */ - /* Do span and last_line share the same baseline? */ - fz_point p, q, perp_r; - float dot; - float size = fz_matrix_expansion(&span->transform); - -#ifdef DEBUG_SPANS - { - printf("Comparing: \""); - dump_span(last_span); - printf("\" and \""); - dump_span(span); - printf("\"\n"); - } -#endif - - p.x = last_line->first_span->max.x - last_line->first_span->min.x; - p.y = last_line->first_span->max.y - last_line->first_span->min.y; - fz_normalize_vector(&p); - q.x = span->max.x - span->min.x; - q.y = span->max.y - span->min.y; - fz_normalize_vector(&q); -#ifdef DEBUG_SPANS - printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y); - printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y); -#endif - perp_r.y = last_line->first_span->min.x - span->min.x; - perp_r.x = -(last_line->first_span->min.y - span->min.y); - /* Check if p and q are parallel. If so, then this - * line is parallel with the last one. */ - dot = p.x * q.x + p.y * q.y; - if (fabsf(dot) > 0.9995f) - { - /* If we take the dot product of normalised(p) and - * perp(r), we get the perpendicular distance from - * one line to the next (assuming they are parallel). 
*/ - distance = p.x * perp_r.x + p.y * perp_r.y; - /* We allow 'small' distances of baseline changes - * to cope with super/subscript. FIXME: We should - * gather subscript/superscript information here. */ - new_line = (fabsf(distance) > size * LINE_DIST); - } - else - { - new_line = 1; - distance = 0; - } - if (!new_line) - { - fz_point delta; - - delta.x = span->min.x - last_span->max.x; - delta.y = span->min.y - last_span->max.y; - - spacing = (p.x * delta.x + p.y * delta.y); - spacing = fabsf(spacing); - /* Only allow changes in baseline (subscript/superscript etc) - * when the spacing is small. */ - if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f) - { - new_line = 1; - distance = 0; - spacing = 0; - } - else - { - spacing /= size * SPACE_DIST; - /* Apply the same logic here as when we're adding chars to build spans. */ - if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) - spacing = 1; - } - } -#ifdef DEBUG_SPANS - printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing); -#endif - } - span->spacing = spacing; - last_line = push_span(ctx, tdev, span, new_line, distance); - last_span = span; + line->last_char->next = ch; + line->last_char = ch; } -} - -fz_stext_sheet * -fz_new_stext_sheet(fz_context *ctx) -{ - fz_stext_sheet *sheet = fz_malloc(ctx, sizeof *sheet); - sheet->maxid = 0; - sheet->style = NULL; - return sheet; -} -void -fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet) -{ - fz_stext_style *style; + ch->c = c; + ch->rtl = rtl; + ch->origin = *p; + ch->size = size; + ch->font = font; /* TODO: keep and drop */ - if (sheet == NULL) - return; - - style = sheet->style; - while (style) + if (line->wmode == 0) { - fz_stext_style *next = style->next; - fz_drop_font(ctx, style->font); - fz_free(ctx, style); - style = next; + a.x = 0; + d.x = 0; + a.y = fz_font_ascender(ctx, font); + d.y = fz_font_descender(ctx, font); } - fz_free(ctx, sheet); -} - -static 
fz_stext_style * -fz_lookup_stext_style_imp(fz_context *ctx, fz_stext_sheet *sheet, - float size, fz_font *font, int wmode, int script) -{ - fz_stext_style *style; - - for (style = sheet->style; style; style = style->next) + else { - if (style->font == font && - style->size == size && - style->wmode == wmode && - style->script == script) /* FIXME: others */ - { - return style; - } + fz_rect *bbox = fz_font_bbox(ctx, font); + a.x = bbox->x1; + d.x = bbox->x0; + a.y = 0; + d.y = 0; } + fz_transform_vector(&a, trm); + fz_transform_vector(&d, trm); - /* Better make a new one and add it to our list */ - style = fz_malloc(ctx, sizeof *style); - style->id = sheet->maxid++; - style->font = fz_keep_font(ctx, font); - style->size = size; - style->wmode = wmode; - style->script = script; - style->next = sheet->style; - sheet->style = style; - return style; -} + ch->bbox.x0 = min4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x); + ch->bbox.x1 = max4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x); + ch->bbox.y0 = min4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y); + ch->bbox.y1 = max4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y); -static fz_stext_style * -fz_lookup_stext_style(fz_context *ctx, fz_stext_sheet *sheet, fz_text_span *span, const fz_matrix *ctm, - fz_colorspace *colorspace, const float *color, float alpha, const fz_stroke_state *stroke) -{ - float size = 1.0f; - fz_font *font = span ? span->font : NULL; - int wmode = span ? 
span->wmode : 0; - if (ctm && span) + if (fz_is_empty_rect(&line->bbox)) + line->bbox = ch->bbox; + else { - fz_matrix tm = span->trm; - fz_matrix trm; - tm.e = 0; - tm.f = 0; - fz_concat(&trm, &tm, ctm); - size = fz_matrix_expansion(&trm); + line->bbox.x0 = fz_min(line->bbox.x0, ch->bbox.x0); + line->bbox.y0 = fz_min(line->bbox.y0, ch->bbox.y0); + line->bbox.x1 = fz_max(line->bbox.x1, ch->bbox.x1); + line->bbox.y1 = fz_max(line->bbox.y1, ch->bbox.y1); } - return fz_lookup_stext_style_imp(ctx, sheet, size, font, wmode, 0); -} -fz_stext_page * -fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox) -{ - fz_stext_page *page = fz_malloc(ctx, sizeof(*page)); - page->mediabox = *mediabox; - page->len = 0; - page->cap = 0; - page->blocks = NULL; - page->next = NULL; - return page; + return ch; } -static void -fz_drop_stext_line_contents(fz_context *ctx, fz_stext_line *line) +static int +direction_from_bidi_class(int bidiclass, int curdir) { - fz_stext_span *span, *next; - for (span = line->first_span; span; span=next) + switch (bidiclass) { - next = span->next; - fz_free(ctx, span->text); - fz_free(ctx, span); - } -} + /* strong */ + case UCDN_BIDI_CLASS_L: return 1; + case UCDN_BIDI_CLASS_R: return -1; + case UCDN_BIDI_CLASS_AL: return -1; -static void -fz_drop_stext_block(fz_context *ctx, fz_stext_block *block) -{ - fz_stext_line *line; - if (block == NULL) - return; - for (line = block->lines; line < block->lines + block->len; line++) - fz_drop_stext_line_contents(ctx, line); - fz_free(ctx, block->lines); - fz_free(ctx, block); -} + /* weak */ + case UCDN_BIDI_CLASS_EN: + case UCDN_BIDI_CLASS_ES: + case UCDN_BIDI_CLASS_ET: + case UCDN_BIDI_CLASS_AN: + case UCDN_BIDI_CLASS_CS: + case UCDN_BIDI_CLASS_NSM: + case UCDN_BIDI_CLASS_BN: + return curdir; -static void -fz_drop_image_block(fz_context *ctx, fz_image_block *block) -{ - if (block == NULL) - return; - fz_drop_image(ctx, block->image); - fz_drop_colorspace(ctx, block->cspace); - fz_free(ctx, block); -} + /* 
neutral */ + case UCDN_BIDI_CLASS_B: + case UCDN_BIDI_CLASS_S: + case UCDN_BIDI_CLASS_WS: + case UCDN_BIDI_CLASS_ON: + return curdir; -void -fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) -{ - fz_page_block *block; - if (page == NULL) - return; - for (block = page->blocks; block < page->blocks + page->len; block++) - { - switch (block->type) - { - case FZ_PAGE_BLOCK_TEXT: - fz_drop_stext_block(ctx, block->u.text); - break; - case FZ_PAGE_BLOCK_IMAGE: - fz_drop_image_block(ctx, block->u.image); - break; - } + /* embedding, override, pop ... we don't support them */ + default: + return 0; } - fz_free(ctx, page->blocks); - fz_free(ctx, page); } -static fz_stext_span * -fz_new_stext_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm) +static int +sign_eq(float x, float y) { - fz_stext_span *span = fz_malloc_struct(ctx, fz_stext_span); - span->ascender_max = 0; - span->descender_min = 0; - span->cap = 0; - span->len = 0; - span->min = *p; - span->max = *p; - span->wmode = wmode; - span->transform.a = trm->a; - span->transform.b = trm->b; - span->transform.c = trm->c; - span->transform.d = trm->d; - span->transform.e = 0; - span->transform.f = 0; - span->text = NULL; - span->next = NULL; - return span; + return (x < 0 && y < 0) || (x > 0 && y > 0) || (x == 0 && y == 0); } -static void -add_char_to_span(fz_context *ctx, fz_stext_span *span, int c, fz_point *p, fz_point *max, fz_stext_style *style) +static int +mat_sign_eq(const fz_matrix *x, const fz_matrix *y) { - if (span->len == span->cap) - { - int newcap = (span->cap ? 
span->cap * 2 : 16); - span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_stext_char)); - span->cap = newcap; - span->bbox = fz_empty_rect; - } - span->max = *max; - if (style->ascender > span->ascender_max) - span->ascender_max = style->ascender; - if (style->descender < span->descender_min) - span->descender_min = style->descender; - span->text[span->len].c = c; - span->text[span->len].p = *p; - span->text[span->len].style = style; - span->len++; + return sign_eq(x->a, y->a) && sign_eq(x->b, y->b) && sign_eq(x->c, y->c) && sign_eq(x->d, y->d); } static void -fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) +fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode) { - int can_append = 1; + fz_stext_page *page = dev->page; + fz_stext_block *cur_block; + fz_stext_line *cur_line; + + int new_para = 0; + int new_line = 1; int add_space = 0; - fz_point dir, ndir, p, q, r; + fz_point dir, ndir, p, q; float size; fz_point delta; float spacing = 0; float base_offset = 0; + int rtl = 0; + + dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir); + /* dir = direction vector for motion. ndir = normalised(dir) */ if (wmode == 0) { dir.x = 1; @@ -602,17 +273,16 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty fz_transform_vector(&dir, trm); ndir = dir; fz_normalize_vector(&ndir); - /* dir = direction vector for motion. ndir = normalised(dir) */ size = fz_matrix_expansion(trm); /* We need to identify where glyphs 'start' (p) and 'stop' (q). - * Each glyph holds it's 'start' position, and the next glyph in the - * span (or span->max if there is no next glyph) holds it's 'end' + * Each glyph holds its 'start' position, and the next glyph in the + * span (or span->max if there is no next glyph) holds its 'end' * position. 
* * For both horizontal and vertical motion, trm->{e,f} gives the - * bottom left corner of the glyph. + * origin (usually the bottom left) of the glyph. * * In horizontal mode: * + p is bottom left. @@ -636,37 +306,38 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty q.y = trm->f; } - if (glyph < 0) + /* Find current position to enter new text. */ + cur_block = page->last_block; + if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) + cur_block = NULL; + cur_line = cur_block ? cur_block->u.t.last_line : NULL; + + if (cur_line && glyph < 0) { - /* Don't reset 'pen' to start of no-glyph characters in cluster */ - if (dev->cur_span) - q = dev->cur_span->max; - goto no_glyph; + /* Don't advance pen or break lines for no-glyph characters in a cluster */ + add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, 0); + dev->lastchar = c; + return; } - if (dev->cur_span == NULL || - trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b || - trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d || - dev->cur_span->wmode != wmode) + if (cur_line == NULL || !mat_sign_eq(trm, &dev->trm) || cur_line->wmode != wmode) { - /* If the matrix has changed, or the wmode is different (or - * if we don't have a span at all), then we can't append. */ -#ifdef DEBUG_SPANS - printf("Transform/WMode changed\n"); -#endif - can_append = 0; + /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), + * then we can't append to the current block/line. */ + new_para = 1; + new_line = 1; } else { - delta.x = q.x - dev->cur_span->max.x; - delta.y = q.y - dev->cur_span->max.y; + /* Detect fake bold where text is printed twice in the same place. 
*/ + delta.x = q.x - dev->pen.x; + delta.y = q.y - dev->pen.y; if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) return; - /* Calculate how far we've moved since the end of the current - * span. */ - delta.x = p.x - dev->cur_span->max.x; - delta.y = p.y - dev->cur_span->max.y; + /* Calculate how far we've moved since the last character. */ + delta.x = p.x - dev->pen.x; + delta.y = p.y - dev->pen.y; /* The transform has not changed, so we know we're in the same * direction. Calculate 2 distances; how far off the previous @@ -675,102 +346,129 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty spacing = ndir.x * delta.x + ndir.y * delta.y; base_offset = -ndir.y * delta.x + ndir.x * delta.y; - spacing /= size * SPACE_DIST; - if (fabsf(base_offset) < size * 0.1f) + /* Only a small amount off the baseline - we'll take this */ + if (fabsf(base_offset) < size * 0.8f) { - /* Only a small amount off the baseline - we'll take this */ - if (fabsf(spacing) < 1.0f) + /* LTR or neutral character */ + if (dev->curdir >= 0) { - /* Motion is in line, and small. */ - } - else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST)) - { - /* Motion is in line, but large enough - * to warrant us adding a space */ - if (dev->lastchar != ' ' && wmode == 0) - add_space = 1; + if (fabs(spacing) < size * SPACE_DIST) + { + /* Motion is in line, and small. */ + new_line = 0; + } + else if (spacing >= size * SPACE_DIST && spacing < size * SPACE_MAX_DIST) + { + /* Motion is in line, but large enough to warrant us adding a space. 
*/ + if (dev->lastchar != ' ' && wmode == 0) + add_space = 1; + new_line = 0; + } + else + { + /* Motion is in line, but large enough to warrant splitting to a new line */ + new_line = 1; + } } + + /* RTL character -- disable space character and column detection heuristics */ else { - /* Motion is in line, but too large - split to a new span */ - can_append = 0; + new_line = 0; + if (spacing > size * SPACE_DIST || spacing < 0) + rtl = 0; /* backward (or big jump to 'right' side) means logical order */ + else + rtl = 1; /* visual order, we need to reverse in a post process pass */ } } + + /* Enough for a new line, but not enough for a new paragraph */ + else if (fabsf(base_offset) < size * 1.3f) + { + /* Check indent to spot text-indent style paragraphs */ + if (wmode == 0 && cur_line && dev->new_obj) + if (fabsf(p.x - dev->start.x) > size * 0.5f) + new_para = 1; + new_line = 1; + } + + /* Way off the baseline - open a new paragraph */ else { - can_append = 0; -#ifdef DEBUG_SPANS - spacing = 0; -#endif + new_para = 1; + new_line = 1; } } -#ifdef DEBUG_SPANS - printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset); -#endif + /* Start a new block (but only at the beginning of a text object) */ + if (new_para || !cur_block) + { + cur_block = add_text_block_to_page(ctx, page); + cur_line = cur_block->u.t.last_line; + } - /* Start a new span */ - if (!can_append) + /* Start a new line */ + if (new_line || !cur_line) { - add_span_to_soup(ctx, dev->spans, dev->cur_span); - dev->cur_span = NULL; - dev->cur_span = fz_new_stext_span(ctx, &p, wmode, trm); - dev->cur_span->spacing = 0; + cur_line = add_line_to_block(ctx, page, cur_block, wmode); + dev->start = p; } /* Add synthetic space */ if (add_space) - { - /* We know we always have a cur_span here */ - r = dev->cur_span->max; - add_char_to_span(ctx, dev->cur_span, ' ', &r, &p, style); - } + add_char_to_line(ctx, page, cur_line, trm, font, 
size, ' ', &dev->pen, &p, rtl); -no_glyph: - add_char_to_span(ctx, dev->cur_span, c, &p, &q, style); + add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, rtl); dev->lastchar = c; + dev->pen = q; + + dev->new_obj = 0; + dev->trm = *trm; } static void -fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode) +fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode) { /* ignore when one unicode character maps to multiple glyphs */ if (c == -1) return; if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) + { switch (c) { case 0xFB00: /* ff */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); return; case 0xFB01: /* fi */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); return; case 0xFB02: /* fl */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); return; case 0xFB03: /* ffi */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'i', -1, 
trm, 0, wmode); return; case 0xFB04: /* ffl */ - fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode); - fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); return; case 0xFB05: /* long st */ case 0xFB06: /* st */ - fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode); - fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode); + fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); return; } + } if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) + { switch (c) { case 0x0009: /* tab */ @@ -794,56 +492,23 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, case 0x3000: /* ideographic space */ c = ' '; } + } - fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode); + fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); } static void -fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm, fz_stext_style *style) +fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm) { fz_font *font = span->font; - FT_Face face = fz_font_ft_face(ctx, font); - fz_buffer **t3procs = fz_font_t3_procs(ctx, font); - fz_rect *bbox = fz_font_bbox(ctx, font); fz_matrix tm = span->trm; fz_matrix trm; float adv; - float ascender = 1; - float descender = 0; - int i, err; + int i; if (span->len == 0) return; - if (dev->spans == NULL) - dev->spans = new_span_soup(ctx); - - if (style->wmode == 0) - { - if (face) - { - fz_lock(ctx, FZ_LOCK_FREETYPE); - err = FT_Set_Char_Size(face, 64, 64, 72, 72); - if (err) - fz_warn(ctx, "freetype set character size: %s", 
ft_error_string(err)); - ascender = (float)face->ascender / face->units_per_EM; - descender = (float)face->descender / face->units_per_EM; - fz_unlock(ctx, FZ_LOCK_FREETYPE); - } - else if (t3procs && !fz_is_empty_rect(bbox)) - { - ascender = bbox->y1; - descender = bbox->y0; - } - } - else - { - ascender = bbox->x1; - descender = bbox->x0; - } - style->ascender = ascender; - style->descender = descender; - tm.e = 0; tm.f = 0; fz_concat(&trm, &tm, ctm); @@ -857,11 +522,11 @@ fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, cons /* Calculate bounding box and new pen position based on font metrics */ if (span->items[i].gid >= 0) - adv = fz_advance_glyph(ctx, font, span->items[i].gid, style->wmode); + adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); else adv = 0; - fz_add_stext_char(ctx, dev, style, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); + fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode); } } @@ -870,13 +535,10 @@ fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, const f fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void @@ -884,94 +546,61 @@ fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, 
tdev->sheet, span, ctm, colorspace, color, alpha, stroke); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, stroke); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } static void fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm) { fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_style *style; fz_text_span *span; + tdev->new_obj = 1; for (span = text->head; span; span = span->next) - { - style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL); - fz_stext_extract(ctx, tdev, span, ctm, style); - } + fz_stext_extract(ctx, tdev, span, ctm); } +/* Images and shadings */ + static void -fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, - fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params) +fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params) { 
fz_stext_device *tdev = (fz_stext_device*)dev; - fz_stext_page *page = tdev->page; - fz_image_block *block; - /* If the alpha is less than 50% then it's probably a watermark or - * effect or something. Skip it */ + /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ if (alpha < 0.5f) return; - /* New block */ - if (page->len == page->cap) - { - int newcap = (page->cap ? page->cap*2 : 4); - page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks)); - page->cap = newcap; - } - block = fz_malloc_struct(ctx, fz_image_block); - page->blocks[page->len].type = FZ_PAGE_BLOCK_IMAGE; - page->blocks[page->len].u.image = block; - block->image = fz_keep_image(ctx, img); - block->cspace = fz_keep_colorspace(ctx, cspace); - if (cspace) - memcpy(block->colors, color, sizeof(block->colors[0])*fz_colorspace_n(ctx, cspace)); - block->mat = *ctm; - block->bbox.x0 = 0; - block->bbox.y0 = 0; - block->bbox.x1 = 1; - block->bbox.y1 = 1; - fz_transform_rect(&block->bbox, ctm); - page->len++; + add_image_block_to_page(ctx, tdev->page, ctm, img); } static void -fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params) +fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, + fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params) { - fz_stext_fill_image_mask(ctx, dev, img, ctm, NULL, NULL, alpha, color_params); + fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); } static fz_image * @@ -1025,103 +654,89 @@ fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, const fz_m fz_rethrow(ctx); } -static int -direction_from_bidi_class(int bidiclass, int curdir) -{ - switch (bidiclass) - { - /* strong */ - case UCDN_BIDI_CLASS_L: return 1; - case UCDN_BIDI_CLASS_R: return -1; - case UCDN_BIDI_CLASS_AL: return -1; - - /* weak */ - case 
UCDN_BIDI_CLASS_EN: - case UCDN_BIDI_CLASS_ES: - case UCDN_BIDI_CLASS_ET: - case UCDN_BIDI_CLASS_AN: - case UCDN_BIDI_CLASS_CS: - case UCDN_BIDI_CLASS_NSM: - case UCDN_BIDI_CLASS_BN: - return curdir; - - /* neutral */ - case UCDN_BIDI_CLASS_B: - case UCDN_BIDI_CLASS_S: - case UCDN_BIDI_CLASS_WS: - case UCDN_BIDI_CLASS_ON: - return curdir; - - /* embedding, override, pop ... we don't support them */ - default: - return 0; - } -} +/* RTL visual to logical order pass */ static void -fz_bidi_reorder_run(fz_stext_span *span, int a, int b, int dir) +fz_bidi_reorder_run(fz_stext_char *a, fz_stext_char *b, int dir) { if (a < b && dir == -1) { - fz_stext_char c; - int m = a + (b - a) / 2; + fz_stext_char tmp; + fz_stext_char *m = a + (b - a) / 2; while (a < m) { b--; - c = span->text[a]; - span->text[a] = span->text[b]; - span->text[b] = c; + + tmp.c = a->c; + tmp.origin = a->origin; + tmp.bbox = a->bbox; + tmp.size = a->size; + tmp.font = a->font; + + a->c = b->c; + a->origin = b->origin; + a->bbox = b->bbox; + a->size = b->size; + a->font = b->font; + + b->c = tmp.c; + b->origin = tmp.origin; + b->bbox = tmp.bbox; + b->size = tmp.size; + b->font = tmp.font; + a++; } } } static void -fz_bidi_reorder_span(fz_stext_span *span) +fz_bidi_reorder_line(fz_stext_line *line) { - int a, b, dir, curdir; + fz_stext_char *a, *b; + int dir, curdir; - a = 0; - curdir = 1; - for (b = 0; b < span->len; b++) + a = line->first_char; + curdir = 0; + for (b = line->first_char; b; b = b->next) { - dir = direction_from_bidi_class(ucdn_get_bidi_class(span->text[b].c), curdir); + dir = b->rtl; if (dir != curdir) { - fz_bidi_reorder_run(span, a, b, curdir); + fz_bidi_reorder_run(a, b, curdir); curdir = dir; a = b; } } - fz_bidi_reorder_run(span, a, b, curdir); + fz_bidi_reorder_run(a, b, curdir); } static void fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page) { - fz_page_block *pageblock; fz_stext_block *block; fz_stext_line *line; - fz_stext_span *span; - for (pageblock = 
page->blocks; pageblock < page->blocks + page->len; pageblock++) - if (pageblock->type == FZ_PAGE_BLOCK_TEXT) - for (block = pageblock->u.text, line = block->lines; line < block->lines + block->len; line++) - for (span = line->first_span; span; span = span->next) - fz_bidi_reorder_span(span); + for (block = page->first_block; block; block = block->next) + if (block->type == FZ_STEXT_BLOCK_TEXT) + for (line = block->u.t.first_line; line; line = line->next) + fz_bidi_reorder_line(line); } static void fz_stext_close_device(fz_context *ctx, fz_device *dev) { fz_stext_device *tdev = (fz_stext_device*)dev; + fz_stext_page *page = tdev->page; + fz_stext_block *block; + fz_stext_line *line; - add_span_to_soup(ctx, tdev->spans, tdev->cur_span); - tdev->cur_span = NULL; - - strain_soup(ctx, tdev); + for (block = page->first_block; block; block = block->next) + if (block->type == FZ_STEXT_BLOCK_TEXT) + for (line = block->u.t.first_line; line; line = line->next) + fz_union_rect(&block->bbox, &line->bbox); - /* TODO: smart sorting of blocks in reading order */ + /* TODO: smart sorting of blocks and lines in reading order */ /* TODO: unicode NFC normalization */ fz_bidi_reorder_stext_page(ctx, tdev->page); @@ -1130,9 +745,6 @@ fz_stext_close_device(fz_context *ctx, fz_device *dev) static void fz_stext_drop_device(fz_context *ctx, fz_device *dev) { - fz_stext_device *tdev = (fz_stext_device*)dev; - free_span_soup(ctx, tdev->spans); - tdev->spans = NULL; } fz_stext_options * @@ -1153,7 +765,7 @@ fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *stri } fz_device * -fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts) +fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) { fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); @@ -1174,11 +786,12 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, 
dev->super.fill_image_mask = fz_stext_fill_image_mask; } - dev->sheet = sheet; dev->page = page; - dev->spans = NULL; - dev->cur_span = NULL; + dev->pen.x = 0; + dev->pen.y = 0; + dev->trm = fz_identity; dev->lastchar = ' '; + dev->curdir = 1; return (fz_device*)dev; } diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c index 63124aa7..f5f72412 100644 --- a/source/fitz/stext-output.c +++ b/source/fitz/stext-output.c @@ -9,40 +9,28 @@ /* HTML output (visual formatting with preserved layout) */ static void -fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int is_serif = fz_font_is_serif(ctx, style->font); - int is_mono = fz_font_is_monospaced(ctx, style->font); - int script = style->script; + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); + int is_serif = fz_font_is_serif(ctx, font); + int is_mono = fz_font_is_monospaced(ctx, font); - fz_write_printf(ctx, out, "", is_serif ? "serif" : "sans-serif", style->size); + fz_write_printf(ctx, out, "", is_serif ? 
"serif" : "sans-serif", size); if (is_mono) fz_write_string(ctx, out, ""); if (is_bold) fz_write_string(ctx, out, ""); if (is_italic) fz_write_string(ctx, out, ""); - - while (script-- > 0) - fz_write_string(ctx, out, ""); - while (++script < 0) - fz_write_string(ctx, out, ""); } static void -fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int script = style->script; - - while (script-- > 0) - fz_write_string(ctx, out, ""); - while (++script < 0) - fz_write_string(ctx, out, ""); + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx,font); + int is_italic = fz_font_is_italic(ctx, font); if (is_italic) fz_write_string(ctx, out, ""); @@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style) } static void -fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block) +fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) { int x = block->bbox.x0; int y = block->bbox.y0; @@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl int h = block->bbox.y1 - block->bbox.y0; fz_write_printf(ctx, out, "image); + fz_write_image_as_data_uri(ctx, out, block->u.i.image); fz_write_string(ctx, out, "\">\n"); } void fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block) { - fz_stext_style *style = NULL; fz_stext_line *line; - fz_stext_span *span; fz_stext_char *ch; int x, y; - style = NULL; + fz_font *font = NULL; + float size = 0; - for (line = block->lines; line < block->lines + block->len; ++line) + for (line = block->u.t.first_line; line; line = line->next) { - for (span = line->first_span; span; span = 
span->next) + x = line->bbox.x0; + y = line->bbox.y0; + + fz_write_printf(ctx, out, "

", y, x); + font = NULL; + + for (ch = line->first_char; ch; ch = ch->next) { - if (span == line->first_span || span->spacing > 1) + if (ch->font != font || ch->size != size) { - if (style) - { - fz_print_style_end_html(ctx, out, style); - fz_write_string(ctx, out, "

\n"); - style = NULL; - } - x = span->bbox.x0; - y = span->bbox.y0; - fz_write_printf(ctx, out, "

", y, x); + if (font) + fz_print_style_end_html(ctx, out, font, size); + font = ch->font; + size = ch->size; + fz_print_style_begin_html(ctx, out, font, size); } - for (ch = span->text; ch < span->text + span->len; ++ch) + switch (ch->c) { - if (ch->style != style) - { - if (style) - fz_print_style_end_html(ctx, out, style); - style = ch->style; - fz_print_style_begin_html(ctx, out, style); - } - - switch (ch->c) - { - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_byte(ctx, out, ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - case '<': fz_write_string(ctx, out, "<"); break; - case '>': fz_write_string(ctx, out, ">"); break; - case '&': fz_write_string(ctx, out, "&"); break; - case '"': fz_write_string(ctx, out, """); break; - case '\'': fz_write_string(ctx, out, "'"); break; - } + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; } } - if (style) - { - fz_print_style_end_html(ctx, out, style); - fz_write_string(ctx, out, "

\n"); - style = NULL; - } + if (font) + fz_print_style_end_html(ctx, out, font, size); + + fz_write_string(ctx, out, "

\n"); } } void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *block; + fz_stext_block *block; int w = page->mediabox.x1 - page->mediabox.x0; int h = page->mediabox.y1 - page->mediabox.y0; fz_write_printf(ctx, out, "
\n", w, h); - for (block = page->blocks; block < page->blocks + page->len; ++block) + for (block = page->first_block; block; block = block->next) { - if (block->type == FZ_PAGE_BLOCK_IMAGE) - fz_print_stext_image_as_html(ctx, out, block->u.image); - else if (block->type == FZ_PAGE_BLOCK_TEXT) - fz_print_stext_block_as_html(ctx, out, block->u.text); + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_print_stext_image_as_html(ctx, out, block); + else if (block->type == FZ_STEXT_BLOCK_TEXT) + fz_print_stext_block_as_html(ctx, out, block); } fz_write_string(ctx, out, "
\n"); @@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out) /* XHTML output (semantic, little layout, suitable for reflow) */ static void -fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block) +fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) { int w = block->bbox.x1 - block->bbox.x0; int h = block->bbox.y1 - block->bbox.y0; fz_write_printf(ctx, out, "

image); + fz_write_image_as_data_uri(ctx, out, block->u.i.image); fz_write_string(ctx, out, "\"/>

\n"); } static void -fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int script = style->script; + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); if (is_mono) fz_write_string(ctx, out, ""); @@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl fz_write_string(ctx, out, ""); if (is_italic) fz_write_string(ctx, out, ""); - - while (script-- > 0) - fz_write_string(ctx, out, ""); - while (++script < 0) - fz_write_string(ctx, out, ""); } static void -fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) +fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size) { - int is_mono = fz_font_is_monospaced(ctx, style->font); - int is_bold = fz_font_is_bold(ctx, style->font); - int is_italic = fz_font_is_italic(ctx, style->font); - int script = style->script; - - while (script-- > 0) - fz_write_string(ctx, out, ""); - while (++script < 0) - fz_write_string(ctx, out, ""); + int is_mono = fz_font_is_monospaced(ctx, font); + int is_bold = fz_font_is_bold(ctx, font); + int is_italic = fz_font_is_italic(ctx, font); if (is_italic) fz_write_string(ctx, out, ""); @@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style) static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block) { fz_stext_line *line; - fz_stext_span *span; fz_stext_char *ch; - fz_stext_style *style; - style = NULL; - fz_write_string(ctx, out, "

\n"); + fz_font *font = NULL; + float size = 0; + + fz_write_string(ctx, out, "

"); - for (line = block->lines; line < block->lines + block->len; ++line) + for (line = block->u.t.first_line; line; line = line->next) { - if (line > block->lines) - fz_write_string(ctx, out, "
\n"); - for (span = line->first_span; span; span = span->next) + if (line != block->u.t.first_line) + fz_write_string(ctx, out, "\n"); + for (ch = line->first_char; ch; ch = ch->next) { - if (span->spacing > 1) - fz_write_byte(ctx, out, ' '); - - for (ch = span->text; ch < span->text + span->len; ++ch) + if (ch->font != font || ch->size != size) { - if (ch->style != style) - { - if (style) - fz_print_style_end_xhtml(ctx, out, style); - style = ch->style; - fz_print_style_begin_xhtml(ctx, out, style); - } + if (font) + fz_print_style_end_xhtml(ctx, out, font, size); + font = ch->font; + size = ch->size; + fz_print_style_begin_xhtml(ctx, out, font, size); + } - switch (ch->c) - { - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_byte(ctx, out, ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - case '<': fz_write_string(ctx, out, "<"); break; - case '>': fz_write_string(ctx, out, ">"); break; - case '&': fz_write_string(ctx, out, "&"); break; - case '"': fz_write_string(ctx, out, """); break; - case '\'': fz_write_string(ctx, out, "'"); break; - } + switch (ch->c) + { + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_byte(ctx, out, ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; } } } - if (style) - fz_print_style_end_xhtml(ctx, out, style); - fz_write_string(ctx, out, "\n

\n"); + if (font) + fz_print_style_end_xhtml(ctx, out, font, size); + fz_write_string(ctx, out, "

\n"); } void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *block; + fz_stext_block *block; fz_write_string(ctx, out, "
\n"); - for (block = page->blocks; block < page->blocks + page->len; ++block) + for (block = page->first_block; block; block = block->next) { - if (block->type == FZ_PAGE_BLOCK_IMAGE) - fz_print_stext_image_as_xhtml(ctx, out, block->u.image); - else if (block->type == FZ_PAGE_BLOCK_TEXT) - fz_print_stext_block_as_xhtml(ctx, out, block->u.text); + if (block->type == FZ_STEXT_BLOCK_IMAGE) + fz_print_stext_image_as_xhtml(ctx, out, block); + else if (block->type == FZ_STEXT_BLOCK_TEXT) + fz_print_stext_block_as_xhtml(ctx, out, block); } fz_write_string(ctx, out, "
\n"); @@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out) fz_write_string(ctx, out, "\n"); fz_write_string(ctx, out, "\n"); fz_write_string(ctx, out, "\n"); @@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out) void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) { - int block_n; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; fz_write_printf(ctx, out, "\n", page->mediabox.x1 - page->mediabox.x0, page->mediabox.y1 - page->mediabox.y0); - for (block_n = 0; block_n < page->len; block_n++) + for (block = page->first_block; block; block = block->next) { - switch (page->blocks[block_n].type) - { - case FZ_PAGE_BLOCK_TEXT: + switch (block->type) { - fz_stext_block *block = page->blocks[block_n].u.text; - fz_stext_line *line; - const char *s; - + case FZ_STEXT_BLOCK_TEXT: fz_write_printf(ctx, out, "\n", - block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); - for (line = block->lines; line < block->lines + block->len; line++) + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); + for (line = block->u.t.first_line; line; line = line->next) { - fz_stext_span *span; + fz_font *font = NULL; + float size = 0; + const char *name = NULL; + const char *s; + fz_rect rect; + fz_write_printf(ctx, out, "\n", - line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); - for (span = line->first_span; span; span = span->next) + line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1); + + for (ch = line->first_char; ch; ch = ch->next) { - fz_stext_style *style = NULL; - const char *name = NULL; - int char_num; - for (char_num = 0; char_num < span->len; char_num++) + if (ch->font != font || ch->size != size) + { + if (font) + fz_write_string(ctx, out, "\n"); + font = ch->font; + size = ch->size; + name = fz_font_name(ctx, font); + s = strchr(name, '+'); + s = s ? 
s + 1 : name; + fz_write_printf(ctx, out, "\n", s, size); + } + fz_stext_char_bbox(ctx, &rect, line, ch); + fz_write_printf(ctx, out, "origin.x, ch->origin.y); + switch (ch->c) { - fz_stext_char *ch = &span->text[char_num]; - if (ch->style != style) - { - if (style) - { - fz_write_string(ctx, out, "
\n"); - } - style = ch->style; - name = fz_font_name(ctx, style->font); - s = strchr(name, '+'); - s = s ? s + 1 : name; - fz_write_printf(ctx, out, "\n", - span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1, - s, style->size); - } - { - fz_rect rect; - fz_stext_char_bbox(ctx, &rect, span, char_num); - fz_write_printf(ctx, out, "p.x, ch->p.y); - } - switch (ch->c) - { - case '<': fz_write_string(ctx, out, "<"); break; - case '>': fz_write_string(ctx, out, ">"); break; - case '&': fz_write_string(ctx, out, "&"); break; - case '"': fz_write_string(ctx, out, """); break; - case '\'': fz_write_string(ctx, out, "'"); break; - default: - if (ch->c >= 32 && ch->c <= 127) - fz_write_printf(ctx, out, "%c", ch->c); - else - fz_write_printf(ctx, out, "&#x%x;", ch->c); - break; - } - fz_write_string(ctx, out, "\"/>\n"); + case '<': fz_write_string(ctx, out, "<"); break; + case '>': fz_write_string(ctx, out, ">"); break; + case '&': fz_write_string(ctx, out, "&"); break; + case '"': fz_write_string(ctx, out, """); break; + case '\'': fz_write_string(ctx, out, "'"); break; + default: + if (ch->c >= 32 && ch->c <= 127) + fz_write_printf(ctx, out, "%c", ch->c); + else + fz_write_printf(ctx, out, "&#x%x;", ch->c); + break; } - if (style) - fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\"/>\n"); } + + if (font) + fz_write_string(ctx, out, "\n"); + fz_write_string(ctx, out, "\n"); } fz_write_string(ctx, out, "\n"); break; - } - case FZ_PAGE_BLOCK_IMAGE: - { + + case FZ_STEXT_BLOCK_IMAGE: + fz_write_printf(ctx, out, "\n", + block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1); break; } } - } fz_write_string(ctx, out, "\n"); } @@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page) void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page) { - fz_page_block *pblock; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + char utf[10]; + int i, n; - for (pblock = 
page->blocks; pblock < page->blocks + page->len; ++pblock) + for (block = page->first_block; block; block = block->next) { - if (pblock->type == FZ_PAGE_BLOCK_TEXT) + if (block->type == FZ_STEXT_BLOCK_TEXT) { - fz_stext_block *block = pblock->u.text; - fz_stext_line *line; - fz_stext_char *ch; - char utf[10]; - int i, n; - - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { - fz_stext_span *span; - for (span = line->first_span; span; span = span->next) + for (ch = line->first_char; ch; ch = ch->next) { - if (span->spacing > 1) - fz_write_byte(ctx, out, ' '); - for (ch = span->text; ch < span->text + span->len; ch++) - { - n = fz_runetochar(utf, ch->c); - for (i = 0; i < n; i++) - fz_write_byte(ctx, out, utf[i]); - } + n = fz_runetochar(utf, ch->c); + for (i = 0; i < n; i++) + fz_write_byte(ctx, out, utf[i]); } fz_write_string(ctx, out, "\n"); } @@ -466,7 +410,6 @@ struct fz_text_writer_s fz_document_writer super; int format; fz_stext_options opts; - fz_stext_sheet *sheet; fz_stext_page *page; fz_output *out; }; @@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab } wri->page = fz_new_stext_page(ctx, mediabox); - return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts); + return fz_new_stext_device(ctx, wri->page, &wri->opts); } static void @@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_) { fz_text_writer *wri = (fz_text_writer*)wri_; fz_drop_stext_page(ctx, wri->page); - fz_drop_stext_sheet(ctx, wri->sheet); fz_drop_output(ctx, wri->out); } @@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const else if (!strcmp(format, "stext")) wri->format = FZ_FORMAT_STEXT; - wri->sheet = fz_new_stext_sheet(ctx); wri->out = fz_new_output_with_path(ctx, path ? 
path : "out.txt", 0); switch (wri->format) @@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const fz_catch(ctx) { fz_drop_output(ctx, wri->out); - fz_drop_stext_sheet(ctx, wri->sheet); fz_free(ctx, wri); fz_rethrow(ctx); } diff --git a/source/fitz/stext-paragraph.c b/source/fitz/stext-paragraph.c deleted file mode 100644 index e275ecae..00000000 --- a/source/fitz/stext-paragraph.c +++ /dev/null @@ -1,1538 +0,0 @@ -#include "mupdf/fitz.h" - -#include -#include -#include - -/* Assemble span soup into blocks and lines. */ - -#define MY_EPSILON 0.001f - -#include /* for debug printing */ -#undef DEBUG_LINE_HEIGHTS -#undef DEBUG_MASKS -#undef DEBUG_ALIGN -#undef DEBUG_INDENTS - -#undef SPOT_LINE_NUMBERS - -typedef struct line_height_s -{ - float height; - int count; - fz_stext_style *style; -} line_height; - -typedef struct line_heights_s -{ - fz_context *ctx; - int cap; - int len; - line_height *lh; -} line_heights; - -static line_heights * -new_line_heights(fz_context *ctx) -{ - line_heights *lh = fz_malloc_struct(ctx, line_heights); - lh->ctx = ctx; - return lh; -} - -static void -free_line_heights(line_heights *lh) -{ - if (!lh) - return; - fz_free(lh->ctx, lh->lh); - fz_free(lh->ctx, lh); -} - -static void -insert_line_height(line_heights *lh, fz_stext_style *style, float height) -{ - int i; - -#ifdef DEBUG_LINE_HEIGHTS - printf("style=%x height=%g\n", style, height); -#endif - - /* If we have one already, add it in */ - for (i=0; i < lh->len; i++) - { - /* Match if we are within 5% */ - if (lh->lh[i].style == style && lh->lh[i].height * 0.95f <= height && lh->lh[i].height * 1.05f >= height) - { - /* Ensure that the average height is correct */ - lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1); - lh->lh[i].count++; - return; - } - } - - /* Otherwise extend (if required) and add it */ - if (lh->cap == lh->len) - { - int newcap = (lh->cap ? 
lh->cap * 2 : 4); - lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height)); - lh->cap = newcap; - } - - lh->lh[lh->len].count = 1; - lh->lh[lh->len].height = height; - lh->lh[lh->len].style = style; - lh->len++; -} - -static void -cull_line_heights(line_heights *lh) -{ - int i, j, k; - -#ifdef DEBUG_LINE_HEIGHTS - printf("Before culling:\n"); - for (i = 0; i < lh->len; i++) - { - fz_stext_style *style = lh->lh[i].style; - printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count); - } -#endif - for (i = 0; i < lh->len; i++) - { - fz_stext_style *style = lh->lh[i].style; - int count = lh->lh[i].count; - int max = i; - - /* Find the max for this style */ - for (j = i+1; j < lh->len; j++) - { - if (lh->lh[j].style == style && lh->lh[j].count > count) - { - max = j; - count = lh->lh[j].count; - } - } - - /* Destroy all the ones other than the max */ - if (max != i) - { - lh->lh[i].count = count; - lh->lh[i].height = lh->lh[max].height; - lh->lh[max].count = 0; - } - j = i+1; - for (k = j; k < lh->len; k++) - { - if (lh->lh[k].style != style) - lh->lh[j++] = lh->lh[k]; - } - lh->len = j; - } -#ifdef DEBUG_LINE_HEIGHTS - printf("After culling:\n"); - for (i = 0; i < lh->len; i++) - { - fz_stext_style *style = lh->lh[i].style; - printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count); - } -#endif -} - -static float -line_height_for_style(line_heights *lh, fz_stext_style *style) -{ - int i; - - for (i=0; i < lh->len; i++) - { - if (lh->lh[i].style == style) - return lh->lh[i].height; - } - return 0.0f; /* Never reached */ -} - -static void -split_block(fz_context *ctx, fz_stext_page *page, int block_num, int linenum) -{ - int split_len; - fz_stext_block *block, *block2; - - if (page->len == page->cap) - { - int new_cap = fz_maxi(16, page->cap * 2); - page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks)); - page->cap = new_cap; - } - - memmove(page->blocks+block_num+1, 
page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks)); - page->len++; - - block2 = fz_malloc_struct(ctx, fz_stext_block); - block = page->blocks[block_num].u.text; - - page->blocks[block_num+1].type = FZ_PAGE_BLOCK_TEXT; - page->blocks[block_num+1].u.text = block2; - split_len = block->len - linenum; - block2->bbox = block->bbox; /* FIXME! */ - block2->cap = 0; - block2->len = 0; - block2->lines = NULL; - block2->lines = fz_malloc_array(ctx, split_len, sizeof(fz_stext_line)); - block2->cap = block2->len; - block2->len = split_len; - block->len = linenum; - memcpy(block2->lines, block->lines + linenum, split_len * sizeof(fz_stext_line)); - block2->lines[0].distance = 0; -} - -static inline int -is_unicode_wspace(int c) -{ - return (c == 9 || /* TAB */ - c == 0x0a || /* HT */ - c == 0x0b || /* LF */ - c == 0x0c || /* VT */ - c == 0x0d || /* FF */ - c == 0x20 || /* CR */ - c == 0x85 || /* NEL */ - c == 0xA0 || /* No break space */ - c == 0x1680 || /* Ogham space mark */ - c == 0x180E || /* Mongolian Vowel Separator */ - c == 0x2000 || /* En quad */ - c == 0x2001 || /* Em quad */ - c == 0x2002 || /* En space */ - c == 0x2003 || /* Em space */ - c == 0x2004 || /* Three-per-Em space */ - c == 0x2005 || /* Four-per-Em space */ - c == 0x2006 || /* Five-per-Em space */ - c == 0x2007 || /* Figure space */ - c == 0x2008 || /* Punctuation space */ - c == 0x2009 || /* Thin space */ - c == 0x200A || /* Hair space */ - c == 0x2028 || /* Line separator */ - c == 0x2029 || /* Paragraph separator */ - c == 0x202F || /* Narrow no-break space */ - c == 0x205F || /* Medium mathematical space */ - c == 0x3000); /* Ideographic space */ -} - -static inline int -is_unicode_bullet(int c) -{ - /* The last 2 aren't strictly bullets, but will do for our usage here */ - return (c == 0x2022 || /* Bullet */ - c == 0x2023 || /* Triangular bullet */ - c == 0x25e6 || /* White bullet */ - c == 0x2043 || /* Hyphen bullet */ - c == 0x2219 || /* Bullet operator */ - c == 149 || /* Ascii 
bullet */ - c == '*'); -} - -#ifdef SPOT_LINE_NUMBERS -static inline int -is_number(int c) -{ - return ((c >= '0' && c <= '9') || - (c == '.')); -} - -static inline int -is_latin_char(int c) -{ - return ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z')); -} - -static inline int -is_roman(int c) -{ - return (c == 'i' || c == 'I' || - c == 'v' || c == 'V' || - c == 'x' || c == 'X' || - c == 'l' || c == 'L' || - c == 'c' || c == 'C' || - c == 'm' || c == 'M'); -} -#endif - -static int -is_list_entry(fz_stext_line *line, fz_stext_span *span, int *char_num_ptr) -{ - int char_num; - fz_stext_char *chr; - - /* First, skip over any whitespace */ - for (char_num = 0; char_num < span->len; char_num++) - { - chr = &span->text[char_num]; - if (!is_unicode_wspace(chr->c)) - break; - } - *char_num_ptr = char_num; - - if (span != line->first_span || char_num >= span->len) - return 0; - - /* Now we check for various special cases, which we consider to mean - * that this is probably a list entry and therefore should always count - * as a separate paragraph (and hence not be entered in the line height - * table). */ - chr = &span->text[char_num]; - - /* Is the first char on the line, a bullet point? */ - if (is_unicode_bullet(chr->c)) - return 1; - -#ifdef SPOT_LINE_NUMBERS - /* Is the entire first span a number? Or does it start with a number - * followed by ) or : ? Allowed to involve single latin chars too. */ - if (is_number(chr->c) || is_latin_char(chr->c)) - { - int cn = char_num; - int met_char = is_latin_char(chr->c); - for (cn = char_num+1; cn < span->len; cn++) - { - fz_stext_char *chr2 = &span->text[cn]; - - if (is_latin_char(chr2->c) && !met_char) - { - met_char = 1; - continue; - } - met_char = 0; - if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c)) - break; - else if (chr2->c == ')' || chr2->c == ':') - { - cn = span->len; - break; - } - } - if (cn == span->len) - return 1; - } - - /* Is the entire first span a roman numeral? 
Or does it start with - * a roman numeral followed by ) or : ? */ - if (is_roman(chr->c)) - { - int cn = char_num; - for (cn = char_num+1; cn < span->len; cn++) - { - fz_stext_char *chr2 = &span->text[cn]; - - if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c)) - break; - else if (chr2->c == ')' || chr2->c == ':') - { - cn = span->len; - break; - } - } - if (cn == span->len) - return 1; - } -#endif - return 0; -} - -typedef struct region_masks_s region_masks; - -typedef struct region_mask_s region_mask; - -typedef struct region_s region; - -struct region_s -{ - float start; - float stop; - float ave_start; - float ave_stop; - int align; - float colw; -}; - -struct region_mask_s -{ - fz_context *ctx; - int freq; - fz_point blv; - int cap; - int len; - float size; - region *mask; -}; - -struct region_masks_s -{ - fz_context *ctx; - int cap; - int len; - region_mask **mask; -}; - -static region_masks * -new_region_masks(fz_context *ctx) -{ - region_masks *rms = fz_malloc_struct(ctx, region_masks); - rms->ctx = ctx; - rms->cap = 0; - rms->len = 0; - rms->mask = NULL; - return rms; -} - -static void -free_region_mask(region_mask *rm) -{ - if (!rm) - return; - fz_free(rm->ctx, rm->mask); - fz_free(rm->ctx, rm); -} - -static void -free_region_masks(region_masks *rms) -{ - int i; - - if (!rms) - return; - for (i=0; i < rms->len; i++) - { - free_region_mask(rms->mask[i]); - } - fz_free(rms->ctx, rms->mask); - fz_free(rms->ctx, rms); -} - -static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score) -{ - int i1, i2; - int count = 0; - - *score = 0; - if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON) - return 0; - - for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; ) - { - if (rm1->mask[i1].stop < rm2->mask[i2].start) - { - /* rm1's region is entirely before rm2's */ - *score += rm1->mask[i1].stop - rm1->mask[i1].start; - i1++; - } - else if (rm1->mask[i1].start > rm2->mask[i2].stop) - { - /* 
rm2's region is entirely before rm1's */ - *score += rm2->mask[i2].stop - rm2->mask[i2].start; - i2++; - } - else - { - float lscore, rscore; - if (rm1->mask[i1].start < rm2->mask[i2].start) - { - if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start) - return 0; /* Not compatible */ - lscore = rm2->mask[i2].start - rm1->mask[i1].start; - } - else - { - if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start) - return 0; /* Not compatible */ - lscore = rm1->mask[i1].start - rm2->mask[i2].start; - } - if (rm1->mask[i1].stop > rm2->mask[i2].stop) - { - if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop) - return 0; /* Not compatible */ - rscore = rm1->mask[i1].stop - rm2->mask[i2].stop; - } - else - { - if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop) - return 0; /* Not compatible */ - rscore = rm2->mask[i2].stop - rm1->mask[i1].stop; - } - /* In order to allow a region to merge, either the - * left, the right, or the centre must agree */ - if (lscore < 1) - { - if (rscore < 1) - { - rscore = 0; - } - lscore = 0; - } - else if (rscore < 1) - { - rscore = 0; - } - else - { - /* Neither Left or right agree. Does the centre? 
*/ - float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop; - float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop; - if (fabsf(ave1-ave2) > 1) - { - /* Nothing agrees, so don't merge */ - return 0; - } - lscore = 0; - rscore = 0; - } - *score += lscore + rscore; - /* These two regions could be merged */ - i1++; - i2++; - } - count++; - } - count += rm1->len-i1 + rm2->len-i2; - return count; -} - -static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score) -{ - int i1, i2; - int close = 1; - - *score = 0; - if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON) - return 0; - - for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; ) - { - if (rm1->mask[i1].stop < rm2->mask[i2].start) - { - /* rm1's region is entirely before rm2's */ - *score += rm1->mask[i1].stop - rm1->mask[i1].start; - i1++; - } - else if (rm1->mask[i1].start > rm2->mask[i2].stop) - { - /* Not compatible */ - return 0; - } - else - { - float lscore, rscore; - if (rm1->mask[i1].start > rm2->mask[i2].start) - { - /* Not compatible */ - return 0; - } - if (rm1->mask[i1].stop < rm2->mask[i2].stop) - { - /* Not compatible */ - return 0; - } - lscore = rm2->mask[i2].start - rm1->mask[i1].start; - rscore = rm1->mask[i1].stop - rm2->mask[i2].stop; - if (lscore < 1) - { - if (rscore < 1) - close++; - close++; - } - else if (rscore < 1) - close++; - else if (fabsf(lscore - rscore) < 1) - { - lscore = fabsf(lscore-rscore); - rscore = 0; - close++; - } - *score += lscore + rscore; - i1++; - i2++; - } - } - if (i1 < rm1->len) - { - /* Still more to go in rm1 */ - if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop) - return 0; - } - else if (i2 < rm2->len) - { - /* Still more to go in rm2 */ - if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop) - return 0; - } - - return close; -} - -static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen) -{ - int o, i1, i2; - - /* First, ensure that rm1 is long enough */ - 
if (rm1->cap < newlen) - { - int newcap = rm1->cap ? rm1->cap : 2; - do - { - newcap *= 2; - } - while (newcap < newlen); - rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask)); - rm1->cap = newcap; - } - - /* Now run backwards along rm1, filling it out with the merged regions */ - for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--) - { - /* So we read from i1 and i2 and store in o */ - if (i1 < 0) - { - /* Just copy i2 */ - rm1->mask[o] = rm2->mask[i2]; - i2--; - } - else if (i2 < 0) - { - /* Just copy i1 */ - rm1->mask[o] = rm1->mask[i1]; - i1--; - } - else if (rm1->mask[i1].stop < rm2->mask[i2].start) - { - /* rm1's region is entirely before rm2's - copy rm2's */ - rm1->mask[o] = rm2->mask[i2]; - i2--; - } - else if (rm2->mask[i2].stop < rm1->mask[i1].start) - { - /* rm2's region is entirely before rm1's - copy rm1's */ - rm1->mask[o] = rm1->mask[i1]; - i1--; - } - else - { - /* We must be merging */ - rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq); - rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq); - rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start); - rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop); - i1--; - i2--; - } - } - rm1->freq += rm2->freq; - rm1->len = newlen; -} - -static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_stext_line *line, region_mask *prev_match) -{ - int i; - float best_score = 9999999; - float score; - int best = -1; - int best_count = 0; - - /* If the 'previous match' matches, use it regardless. */ - if (prev_match && region_mask_matches(prev_match, rm, &score)) - { - return prev_match; - } - - /* Run through and find the 'most compatible' region mask. We are - * guaranteed that there will always be at least one compatible one! 
- */ - for (i=0; i < rms->len; i++) - { - int count = region_mask_matches(rms->mask[i], rm, &score); - if (count > best_count || (count == best_count && (score < best_score || best == -1))) - { - best = i; - best_score = score; - best_count = count; - } - } - assert(best >= 0 && best < rms->len); - - /* So we have the matching mask. */ - return rms->mask[best]; -} - -#ifdef DEBUG_MASKS -static void -dump_region_mask(const region_mask *rm) -{ - int j; - for (j = 0; j < rm->len; j++) - { - printf("%g->%g ", rm->mask[j].start, rm->mask[j].stop); - } - printf("* %d\n", rm->freq); -} - -static void -dump_region_masks(const region_masks *rms) -{ - int i; - - for (i = 0; i < rms->len; i++) - { - region_mask *rm = rms->mask[i]; - dump_region_mask(rm); - } -} -#endif - -static void region_masks_add(region_masks *rms, region_mask *rm) -{ - /* Add rm to rms */ - if (rms->len == rms->cap) - { - int newcap = (rms->cap ? rms->cap * 2 : 4); - rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask)); - rms->cap = newcap; - } - rms->mask[rms->len] = rm; - rms->len++; -} - -static void region_masks_sort(region_masks *rms) -{ - int i, j; - - /* First calculate sizes */ - for (i=0; i < rms->len; i++) - { - region_mask *rm = rms->mask[i]; - float size = 0; - for (j=0; j < rm->len; j++) - { - size += rm->mask[j].stop - rm->mask[j].start; - } - rm->size = size; - } - - /* Now, sort on size */ - /* FIXME: bubble sort - use heapsort for efficiency */ - for (i=0; i < rms->len-1; i++) - { - for (j=i+1; j < rms->len; j++) - { - if (rms->mask[i]->size < rms->mask[j]->size) - { - region_mask *tmp = rms->mask[i]; - rms->mask[i] = rms->mask[j]; - rms->mask[j] = tmp; - } - } - } -} - -static void region_masks_merge(region_masks *rms, region_mask *rm) -{ - int i; - float best_score = 9999999; - float score; - int best = -1; - int best_count = 0; - -#ifdef DEBUG_MASKS - printf("\nAdding:\n"); - dump_region_mask(rm); - printf("To:\n"); - dump_region_masks(rms); -#endif - for (i=0; 
i < rms->len; i++) - { - int count = region_masks_mergeable(rms->mask[i], rm, &score); - if (count && (score < best_score || best == -1)) - { - best = i; - best_count = count; - best_score = score; - } - } - if (best != -1) - { - region_mask_merge(rms->mask[best], rm, best_count); -#ifdef DEBUG_MASKS - printf("Merges to give:\n"); - dump_region_masks(rms); -#endif - free_region_mask(rm); - return; - } - region_masks_add(rms, rm); -#ifdef DEBUG_MASKS - printf("Adding new one to give:\n"); - dump_region_masks(rms); -#endif -} - -static region_mask * -new_region_mask(fz_context *ctx, const fz_point *blv) -{ - region_mask *rm = fz_malloc_struct(ctx, region_mask); - rm->ctx = ctx; - rm->freq = 1; - rm->blv = *blv; - rm->cap = 0; - rm->len = 0; - rm->mask = NULL; - return rm; -} - -static void -region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end) -{ - /* We project min and max down onto the blv */ - float s = min->x * rm->blv.x + min->y * rm->blv.y; - float e = max->x * rm->blv.x + max->y * rm->blv.y; - if (s > e) - { - *start = e; - *end = s; - } - else - { - *start = s; - *end = e; - } -} - -static void -region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max) -{ - float start, end; - int i, j; - - region_mask_project(rm, min, max, &start, &end); - - /* Now add start/end into our region list. Typically we will be adding - * to the end of the region list, so search from there backwards. */ - for (i = rm->len; i > 0;) - { - if (start > rm->mask[i-1].stop) - break; - i--; - } - /* So we know that our interval can only affect list items >= i. - * We know that start is after our previous end. */ - if (i == rm->len || end < rm->mask[i].start) - { - /* Insert new one. No overlap. No merging */ - if (rm->len == rm->cap) - { - int newcap = (rm->cap ? 
rm->cap * 2 : 4); - rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask)); - rm->cap = newcap; - } - if (rm->len > i) - memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask)); - rm->mask[i].ave_start = start; - rm->mask[i].ave_stop = end; - rm->mask[i].start = start; - rm->mask[i].stop = end; - rm->len++; - } - else - { - /* Extend current one down. */ - rm->mask[i].ave_start = start; - rm->mask[i].start = start; - if (rm->mask[i].stop < end) - { - rm->mask[i].stop = end; - rm->mask[i].ave_stop = end; - /* Our region may now extend upwards too far */ - i++; - j = i; - while (j < rm->len && rm->mask[j].start <= end) - { - rm->mask[i-1].stop = end = rm->mask[j].stop; - j++; - } - if (i != j) - { - /* Move everything from j down to i */ - while (j < rm->len) - { - rm->mask[i++] = rm->mask[j++]; - } - } - rm->len -= j-i; - } - } -} - -static int -region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_) -{ - float start, end, left, right; - int i; - - region_mask_project(rm, min, max, &start, &end); - - for (i = 0; i < rm->len; i++) - { - /* The use of MY_EPSILON here is because we might be matching - * start/end values calculated with slightly different blv's */ - if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end) - break; - } - if (i >= rm->len) - { - *align = 0; - *colw = 0; - return 0; - } - left = start - rm->mask[i].start; - right = rm->mask[i].stop - end; - if (left < 1 && right < 1) - *align = rm->mask[i].align; - else if (left*2 <= right) - *align = 0; /* Left */ - else if (right * 2 < left) - *align = 2; /* Right */ - else - *align = 1; - *left_ = left; - *colw = rm->mask[i].colw; - return i; -} - -static void -region_mask_alignment(region_mask *rm) -{ - int i; - float width = 0; - - for (i = 0; i < rm->len; i++) - { - width += rm->mask[i].stop - rm->mask[i].start; - } - for (i = 0; i < rm->len; i++) - { - region *r = &rm->mask[i]; 
- float left = r->ave_start - r->start; - float right = r->stop - r->ave_stop; - if (left*2 <= right) - r->align = 0; /* Left */ - else if (right * 2 < left) - r->align = 2; /* Right */ - else - r->align = 1; - r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width; - } -} - -static void -region_masks_alignment(region_masks *rms) -{ - int i; - - for (i = 0; i < rms->len; i++) - { - region_mask_alignment(rms->mask[i]); - } -} - -static int -is_unicode_hyphen(int c) -{ - /* We omit 0x2011 (Non breaking hyphen) and 0x2043 (Hyphen Bullet) - * from this list. */ - return (c == '-' || - c == 0x2010 || /* Hyphen */ - c == 0x002d || /* Hyphen-Minus */ - c == 0x00ad || /* Soft hyphen */ - c == 0x058a || /* Armenian Hyphen */ - c == 0x1400 || /* Canadian Syllabive Hyphen */ - c == 0x1806); /* Mongolian Todo soft hyphen */ -} - -static int -is_unicode_hyphenatable(int c) -{ - /* This is a pretty ad-hoc collection. It may need tuning. */ - return ((c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - (c >= 0x00c0 && c <= 0x00d6) || - (c >= 0x00d8 && c <= 0x00f6) || - (c >= 0x00f8 && c <= 0x02af) || - (c >= 0x1d00 && c <= 0x1dbf) || - (c >= 0x1e00 && c <= 0x1eff) || - (c >= 0x2c60 && c <= 0x2c7f) || - (c >= 0xa722 && c <= 0xa78e) || - (c >= 0xa790 && c <= 0xa793) || - (c >= 0xa7a8 && c <= 0xa7af) || - (c >= 0xfb00 && c <= 0xfb07) || - (c >= 0xff21 && c <= 0xff3a) || - (c >= 0xff41 && c <= 0xff5a)); -} - -static void -dehyphenate(fz_stext_span *s1, fz_stext_span *s2) -{ - int i; - - for (i = s1->len-1; i > 0; i--) - if (!is_unicode_wspace(s1->text[i].c)) - break; - /* Can't leave an empty span. 
*/ - if (i == 0) - return; - - if (!is_unicode_hyphen(s1->text[i].c)) - return; - if (!is_unicode_hyphenatable(s1->text[i-1].c)) - return; - if (!is_unicode_hyphenatable(s2->text[0].c)) - return; - s1->len = i; - s2->spacing = 0; -} - -#ifdef DEBUG_ALIGN -static void -dump_span(fz_stext_span *span) -{ -} - -static void -dump_line(fz_stext_line *line) -{ - fz_stext_span *span; - - if (!line) - return; - printf("d=%g: ", line->distance); - - span = line->first_span; - while (span) - { - dump_span(span); - span = span->next; - } - - printf("\n"); -} -#endif - -void -fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page) -{ - fz_stext_line *line; - fz_stext_span *span; - line_heights *lh; - region_masks *rms; - int block_num; - - /* Simple paragraph analysis; look for the most common 'inter line' - * spacing. This will be assumed to be our line spacing. Anything - * more than 25% wider than this will be assumed to be a paragraph - * space. */ - - /* Step 1: Gather the line height information */ - lh = new_line_heights(ctx); - for (block_num = 0; block_num < page->len; block_num++) - { - fz_stext_block *block; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) - continue; - block = page->blocks[block_num].u.text; - - for (line = block->lines; line < block->lines + block->len; line++) - { - /* For every style in the line, add lineheight to the - * record for that style. FIXME: This is a nasty n^2 - * algorithm at the moment. */ - fz_stext_style *style = NULL; - - if (line->distance == 0) - continue; - - for (span = line->first_span; span; span = span->next) - { - int char_num; - - if (is_list_entry(line, span, &char_num)) - goto list_entry; - - for (; char_num < span->len; char_num++) - { - fz_stext_char *chr = &span->text[char_num]; - - /* Ignore any whitespace chars */ - if (is_unicode_wspace(chr->c)) - continue; - - if (chr->style != style) - { - /* Have we had this style before? 
*/ - int match = 0; - fz_stext_span *span2; - for (span2 = line->first_span; span2 != span; span2 = span2->next) - { - int char_num2; - for (char_num2 = 0; char_num2 < span2->len; char_num2++) - { - fz_stext_char *chr2 = &span2->text[char_num2]; - if (chr2->style == chr->style) - { - match = 1; - break; - } - } - } - if (char_num > 0 && match == 0) - { - fz_stext_span *span2 = span; - int char_num2; - for (char_num2 = 0; char_num2 < char_num; char_num2++) - { - fz_stext_char *chr2 = &span2->text[char_num2]; - if (chr2->style == chr->style) - { - match = 1; - break; - } - } - } - if (match == 0) - insert_line_height(lh, chr->style, line->distance); - style = chr->style; - } - } -list_entry: - {} - } - } - } - - /* Step 2: Find the most popular line height for each style */ - cull_line_heights(lh); - - /* Step 3: Run through the blocks, breaking each block into two if - * the line height isn't right. */ - for (block_num = 0; block_num < page->len; block_num++) - { - int line_num; - fz_stext_block *block; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) - continue; - block = page->blocks[block_num].u.text; - - for (line_num = 0; line_num < block->len; line_num++) - { - /* For every style in the line, check to see if lineheight - * is correct for that style. FIXME: We check each style - * more than once, currently. */ - int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. 
*/ - fz_stext_style *style = NULL; - line = &block->lines[line_num]; - - if (line->distance == 0) - continue; - -#ifdef DEBUG_LINE_HEIGHTS - printf("line height=%g\n", line->distance); -#endif - for (span = line->first_span; span; span = span->next) - { - int char_num; - - if (is_list_entry(line, span, &char_num)) - goto force_paragraph; - - /* Now we do the rest of the line */ - for (; char_num < span->len; char_num++) - { - fz_stext_char *chr = &span->text[char_num]; - - /* Ignore any whitespace chars */ - if (is_unicode_wspace(chr->c)) - continue; - - if (chr->style != style) - { - float proper_step = line_height_for_style(lh, chr->style); - if (proper_step * 0.95f <= line->distance && line->distance <= proper_step * 1.05f) - { - ok = 1; - break; - } - style = chr->style; - } - } - if (ok) - break; - } - if (!ok) - { -force_paragraph: - split_block(ctx, page, block_num, line_num); - break; - } - } - } - free_line_heights(lh); - - /* Simple line region analysis: - * For each line: - * form a list of 'start/stop' points (henceforth a 'region mask') - * find the normalised baseline vector for the line. - * Store the region mask and baseline vector. - * Collate lines that have compatible region masks and identical - * baseline vectors. - * If the collated masks are column-like, then split into columns. - * Otherwise split into tables. - */ - rms = new_region_masks(ctx); - - /* Step 1: Form the region masks and store them into a list with the - * normalised baseline vectors. 
*/ - for (block_num = 0; block_num < page->len; block_num++) - { - fz_stext_block *block; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) - continue; - block = page->blocks[block_num].u.text; - - for (line = block->lines; line < block->lines + block->len; line++) - { - fz_point blv; - region_mask *rm; - -#ifdef DEBUG_MASKS - printf("Line: "); - dump_line(line); -#endif - blv = line->first_span->max; - blv.x -= line->first_span->min.x; - blv.y -= line->first_span->min.y; - fz_normalize_vector(&blv); - - rm = new_region_mask(ctx, &blv); - for (span = line->first_span; span; span = span->next) - { - fz_point *region_min = &span->min; - fz_point *region_max = &span->max; - - /* Treat adjacent spans as one big region */ - while (span->next && span->next->spacing < 1.5f) - { - span = span->next; - region_max = &span->max; - } - - region_mask_add(rm, region_min, region_max); - } -#ifdef DEBUG_MASKS - dump_region_mask(rm); -#endif - region_masks_add(rms, rm); - } - } - - /* Step 2: Sort the region_masks by size of masked region */ - region_masks_sort(rms); - -#ifdef DEBUG_MASKS - printf("Sorted list of regions:\n"); - dump_region_masks(rms); -#endif - /* Step 3: Merge the region masks where possible (large ones first) */ - { - int i; - region_masks *rms2; - rms2 = new_region_masks(ctx); - for (i=0; i < rms->len; i++) - { - region_mask *rm = rms->mask[i]; - rms->mask[i] = NULL; - region_masks_merge(rms2, rm); - } - free_region_masks(rms); - rms = rms2; - } - -#ifdef DEBUG_MASKS - printf("Merged list of regions:\n"); - dump_region_masks(rms); -#endif - - /* Step 4: Figure out alignment */ - region_masks_alignment(rms); - - /* Step 5: At this point, we should probably look at the region masks - * to try to guess which ones represent columns on the page. With our - * current code, we could only get blocks of lines that span 2 or more - * columns if the PDF producer wrote text out horizontally across 2 - * or more columns, and we've never seen that (yet!). 
So we skip this - * step for now. */ - - /* Step 6: Run through the lines again, deciding which ones fit into - * which region mask. */ - { - region_mask *prev_match = NULL; - for (block_num = 0; block_num < page->len; block_num++) - { - fz_stext_block *block; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) - continue; - block = page->blocks[block_num].u.text; - - for (line = block->lines; line < block->lines + block->len; line++) - { - fz_point blv; - region_mask *rm; - region_mask *match; - - blv = line->first_span->max; - blv.x -= line->first_span->min.x; - blv.y -= line->first_span->min.y; - fz_normalize_vector(&blv); - -#ifdef DEBUG_MASKS - dump_line(line); -#endif - rm = new_region_mask(ctx, &blv); - for (span = line->first_span; span; span = span->next) - { - fz_point *region_min = &span->min; - fz_point *region_max = &span->max; - - /* Treat adjacent spans as one big region */ - while (span->next && span->next->spacing < 1.5f) - { - span = span->next; - region_max = &span->max; - } - - region_mask_add(rm, region_min, region_max); - } -#ifdef DEBUG_MASKS - printf("Mask: "); - dump_region_mask(rm); -#endif - match = region_masks_match(rms, rm, line, prev_match); - prev_match = match; -#ifdef DEBUG_MASKS - printf("Matches: "); - dump_region_mask(match); -#endif - free_region_mask(rm); - span = line->first_span; - while (span) - { - fz_point *region_min = &span->min; - fz_point *region_max = &span->max; - fz_stext_span *sn; - int col, align; - float colw, left; - - /* Treat adjacent spans as one big region */ -#ifdef DEBUG_ALIGN - dump_span(span); -#endif - for (sn = span->next; sn && sn->spacing < 1.5f; sn = sn->next) - { - region_max = &sn->max; -#ifdef DEBUG_ALIGN - dump_span(sn); -#endif - } - col = region_mask_column(match, region_min, region_max, &align, &colw, &left); -#ifdef DEBUG_ALIGN - printf(" = col%d colw=%g align=%d\n", col, colw, align); -#endif - do - { - span->column = col; - span->align = align; - span->indent = left; - 
span->column_width = colw; - span = span->next; - } - while (span != sn); - - if (span) - span = span->next; - } - line->region = match; - } - } - free_region_masks(rms); - } - - /* Step 7: Collate lines within a block that share the same region - * mask. */ - for (block_num = 0; block_num < page->len; block_num++) - { - int line_num; - int prev_line_num; - - fz_stext_block *block; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) - continue; - block = page->blocks[block_num].u.text; - - /* First merge lines. This may leave empty lines behind. */ - for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++) - { - fz_stext_line *prev_line; - line = &block->lines[line_num]; - if (!line->first_span) - continue; - prev_line = &block->lines[prev_line_num]; - if (prev_line->region == line->region) - { - /* We only merge lines if the second line - * only uses 1 of the columns. */ - int col = line->first_span->column; - /* Copy the left value for the first span - * in the first column in this line forward - * for all the rest of the spans in the same - * column. */ - float indent = line->first_span->indent; - for (span = line->first_span->next; span; span = span->next) - { - if (col != span->column) - break; - span->indent = indent; - } - if (span) - { - prev_line_num = line_num; - continue; - } - - /* Merge line into prev_line */ - { - fz_stext_span **prev_line_span = &prev_line->first_span; - int try_dehyphen = -1; - fz_stext_span *prev_span = NULL; - span = line->first_span; - while (span && *prev_line_span) - { - /* Skip forwards through the original - * line, until we find a place where - * span should go. */ - if ((*prev_line_span)->column <= span->column) - { - /* The current span we are considering - * in prev_line is earlier than span. - * Just skip forwards in prev_line. */ - prev_span = (*prev_line_span); - prev_line_span = &prev_span->next; - try_dehyphen = span->column; - } - else - { - /* We want to copy span into prev_line. 
*/ - fz_stext_span *next = (*prev_line_span)->next; - - if (prev_line_span == &prev_line->first_span) - prev_line->first_span = span; - if (next == NULL) - prev_line->last_span = span; - if (try_dehyphen == span->column) - dehyphenate(prev_span, span); - try_dehyphen = -1; - prev_span = *prev_line_span = span; - span = span->next; - (*prev_line_span)->next = next; - prev_line_span = &(*prev_line_span)->next; - } - } - if (span) - { - *prev_line_span = span; - prev_line->last_span = line->last_span; - } - - line->first_span = NULL; - line->last_span = NULL; - } - } - else - prev_line_num = line_num; - } - - /* Now get rid of the empty lines */ - for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++) - { - line = &block->lines[line_num]; - if (line->first_span) - block->lines[prev_line_num++] = *line; - } - block->len = prev_line_num; - - /* Now try to spot indents */ - for (line_num = 0; line_num < block->len; line_num++) - { - fz_stext_span *span_num, *sn; - int col, count; - line = &block->lines[line_num]; - - /* Run through the spans... */ - span_num = line->first_span; - { - float indent = 0; - /* For each set of spans that share the same - * column... */ - col = span_num->column; -#ifdef DEBUG_INDENTS - printf("Indent %g: ", span_num->indent); - dump_span(span_num); - printf("\n"); -#endif - - /* find the average indent of all but the first.. */ - for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++) - { -#ifdef DEBUG_INDENTS - printf("Indent %g: ", sn->indent); - dump_span(sn); - printf("\n"); -#endif - indent += sn->indent; - sn->indent = 0; - } - if (sn != span_num->next) - indent /= count; - - /* And compare this indent with the first one... 
*/ -#ifdef DEBUG_INDENTS - printf("Average indent %g ", indent); -#endif - indent -= span_num->indent; -#ifdef DEBUG_INDENTS - printf("delta %g ", indent); -#endif - if (fabsf(indent) < 1) - { - /* No indent worth speaking of */ - indent = 0; - } -#ifdef DEBUG_INDENTS - printf("recorded %g\n", indent); -#endif - span_num->indent = indent; - span_num = sn; - } - for (; span_num; span_num = span_num->next) - { - span_num->indent = 0; - } - } - } -} diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c index 00705208..6c30ea29 100644 --- a/source/fitz/stext-search.c +++ b/source/fitz/stext-search.c @@ -18,30 +18,28 @@ static inline int iswhite(int c) fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx) { - int block_num; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; int ofs = 0; - for (block_num = 0; block_num < page->len; block_num++) + for (block = page->first_block; block; block = block->next) { - fz_stext_block *block; - fz_stext_line *line; - fz_stext_span *span; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != FZ_STEXT_BLOCK_TEXT) continue; - block = page->blocks[block_num].u.text; - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { - for (span = line->first_span; span; span = span->next) + for (ch = line->first_char; ch; ch = ch->next) { - if (idx < ofs + span->len) + if (ofs == idx) { - cab->c = span->text[idx - ofs].c; - fz_stext_char_bbox(ctx, &cab->bbox, span, idx - ofs); + cab->c = ch->c; + fz_stext_char_bbox(ctx, &cab->bbox, line, ch); return cab; } - ofs += span->len; + ++ofs; } + /* pseudo-newline */ if (idx == ofs) { @@ -49,7 +47,7 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex cab->c = ' '; return cab; } - ofs++; + ++ofs; } } cab->bbox = fz_empty_rect; @@ -73,27 +71,23 @@ static fz_rect *bboxat(fz_context *ctx, 
fz_stext_page *page, int idx, fz_rect *b static int textlen_stext(fz_context *ctx, fz_stext_page *page) { + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; int len = 0; - int block_num; - for (block_num = 0; block_num < page->len; block_num++) + for (block = page->first_block; block; block = block->next) { - fz_stext_block *block; - fz_stext_line *line; - fz_stext_span *span; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != FZ_STEXT_BLOCK_TEXT) continue; - block = page->blocks[block_num].u.text; - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { - for (span = line->first_span; span; span = span->next) - { - len += span->len; - } - len++; /* pseudo-newline */ + for (ch = line->first_char; ch; ch = ch->next) + ++len; + ++len; /* pseudo-newline */ } } + return len; } @@ -181,8 +175,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re fz_rect linebox, charbox; fz_stext_block *block; fz_stext_line *line; - fz_stext_span *span; - int i, block_num, hit_count; + fz_stext_char *ch; + int hit_count; float x0 = rect.x0; float x1 = rect.x1; @@ -191,31 +185,27 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re hit_count = 0; - for (block_num = 0; block_num < page->len; block_num++) + for (block = page->first_block; block; block = block->next) { - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != FZ_STEXT_BLOCK_TEXT) continue; - block = page->blocks[block_num].u.text; - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { linebox = fz_empty_rect; - for (span = line->first_span; span; span = span->next) + for (ch = line->first_char; ch; ch = ch->next) { - for (i = 0; i < span->len; i++) + fz_stext_char_bbox(ctx, &charbox, line, ch); + if (charbox.x1 >= x0 && charbox.x0 <= x1 && 
charbox.y1 >= y0 && charbox.y0 <= y1) { - fz_stext_char_bbox(ctx, &charbox, span, i); - if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1) + if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) { - if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) - { - if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) - hit_bbox[hit_count++] = linebox; - linebox = charbox; - } - else - { - fz_union_rect(&linebox, &charbox); - } + if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) + hit_bbox[hit_count++] = linebox; + linebox = charbox; + } + else + { + fz_union_rect(&linebox, &charbox); } } } @@ -232,8 +222,11 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect) { fz_buffer *buffer; fz_rect hitbox; - int c, i, block_num, seen = 0; + int c, seen = 0; unsigned char *s; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; float x0 = rect.x0; float x1 = rect.x1; @@ -242,41 +235,33 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect) buffer = fz_new_buffer(ctx, 1024); - for (block_num = 0; block_num < page->len; block_num++) + for (block = page->first_block; block; block = block->next) { - fz_stext_block *block; - fz_stext_line *line; - fz_stext_span *span; - - if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != FZ_STEXT_BLOCK_TEXT) continue; - block = page->blocks[block_num].u.text; - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { - for (span = line->first_span; span; span = span->next) + if (seen) { - if (seen) - { - fz_append_byte(ctx, buffer, '\n'); - } + fz_append_byte(ctx, buffer, '\n'); + } - seen = 0; + seen = 0; - for (i = 0; i < span->len; i++) + for (ch = line->first_char; ch; ch = ch->next) + { + fz_stext_char_bbox(ctx, &hitbox, line, ch); + c = ch->c; + if (c < 32) + c = 0xFFFD; + if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= 
y0 && hitbox.y0 <= y1) { - fz_stext_char_bbox(ctx, &hitbox, span, i); - c = span->text[i].c; - if (c < 32) - c = 0xFFFD; - if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) - { - fz_append_rune(ctx, buffer, c); - seen = 1; - } + fz_append_rune(ctx, buffer, c); + seen = 1; } - - seen = (seen && span == line->last_span); } + + seen = (seen && line == block->u.t.last_line); } } diff --git a/source/fitz/util.c b/source/fitz/util.c index 6f900174..d6a7f317 100644 --- a/source/fitz/util.c +++ b/source/fitz/util.c @@ -267,7 +267,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co } fz_stext_page * -fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options) +fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options) { fz_stext_page *text; fz_device *dev; @@ -279,7 +279,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox)); fz_try(ctx) { - dev = fz_new_stext_device(ctx, sheet, text, options); + dev = fz_new_stext_device(ctx, text, options); fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL); fz_close_device(ctx, dev); } @@ -297,7 +297,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s } fz_stext_page * -fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options) +fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options) { fz_stext_page *text; fz_device *dev; @@ -309,7 +309,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox)); fz_try(ctx) { - dev = fz_new_stext_device(ctx, sheet, text, options); + dev = fz_new_stext_device(ctx, text, options); 
fz_run_page(ctx, page, dev, &fz_identity, NULL); fz_close_device(ctx, dev); } @@ -327,14 +327,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee } fz_stext_page * -fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options) +fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options) { fz_page *page; fz_stext_page *text; page = fz_load_page(ctx, doc, number); fz_try(ctx) - text = fz_new_stext_page_from_page(ctx, page, sheet, options); + text = fz_new_stext_page_from_page(ctx, page, options); fz_always(ctx) fz_drop_page(ctx, page); fz_catch(ctx) @@ -345,24 +345,14 @@ fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number int fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max) { - fz_stext_sheet *sheet = NULL; - fz_stext_page *text = NULL; + fz_stext_page *text; int count; - fz_var(sheet); - fz_var(text); - + text = fz_new_stext_page_from_display_list(ctx, list, NULL); fz_try(ctx) - { - sheet = fz_new_stext_sheet(ctx); - text = fz_new_stext_page_from_display_list(ctx, list, sheet, NULL); count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); - } fz_always(ctx) - { fz_drop_stext_page(ctx, text); - fz_drop_stext_sheet(ctx, sheet); - } fz_catch(ctx) fz_rethrow(ctx); return count; @@ -371,24 +361,14 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl int fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max) { - fz_stext_sheet *sheet = NULL; - fz_stext_page *text = NULL; + fz_stext_page *text; int count; - fz_var(sheet); - fz_var(text); - + text = fz_new_stext_page_from_page(ctx, page, NULL); fz_try(ctx) - { - sheet = fz_new_stext_sheet(ctx); - text = fz_new_stext_page_from_page(ctx, page, sheet, NULL); count = 
fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max); - } fz_always(ctx) - { fz_drop_stext_page(ctx, text); - fz_drop_stext_sheet(ctx, sheet); - } fz_catch(ctx) fz_rethrow(ctx); return count; @@ -411,14 +391,15 @@ fz_search_page_number(fz_context *ctx, fz_document *doc, int number, const char } fz_buffer * -fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf) +fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rect *sel, int crlf) { fz_buffer *buf; fz_rect hitbox; float x0, y0, x1, y1; - int block_num; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; int need_newline; - int i; need_newline = 0; @@ -438,45 +419,33 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec buf = fz_new_buffer(ctx, 256); fz_try(ctx) { - for (block_num = 0; block_num < text->len; block_num++) + for (block = page->first_block; block; block = block->next) { - fz_stext_line *line; - fz_stext_block *block; - fz_stext_span *span; - - if (text->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT) + if (block->type != FZ_STEXT_BLOCK_TEXT) continue; - block = text->blocks[block_num].u.text; - for (line = block->lines; line < block->lines + block->len; line++) + for (line = block->u.t.first_line; line; line = line->next) { int saw_text = 0; - for (span = line->first_span; span; span = span->next) + for (ch = line->first_char; ch; ch = ch->next) { - if (span->spacing > 1) - fz_append_byte(ctx, buf, ' '); - for (i = 0; i < span->len; i++) + int c = ch->c; + fz_stext_char_bbox(ctx, &hitbox, line, ch); + if (c < 32) + c = 0xFFFD; + if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) { - int c; - fz_stext_char_bbox(ctx, &hitbox, span, i); - c = span->text[i].c; - if (c < 32) - c = 0xFFFD; - if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1) + saw_text = 1; + if (need_newline) { - saw_text = 1; - if (need_newline) - { - if (crlf) - 
fz_append_byte(ctx, buf, '\r'); - fz_append_byte(ctx, buf, '\n'); - need_newline = 0; - } - fz_append_rune(ctx, buf, c); + if (crlf) + fz_append_byte(ctx, buf, '\r'); + fz_append_byte(ctx, buf, '\n'); + need_newline = 0; } + fz_append_rune(ctx, buf, c); } } - if (saw_text) need_newline = 1; } @@ -494,42 +463,32 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec fz_buffer * fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) { - fz_stext_sheet *sheet; fz_stext_page *text; fz_buffer *buf; - sheet = fz_new_stext_sheet(ctx); + text = fz_new_stext_page_from_display_list(ctx, list, options); fz_try(ctx) - { - text = fz_new_stext_page_from_display_list(ctx, list, sheet, options); buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); - } fz_always(ctx) - fz_drop_stext_sheet(ctx, sheet); + fz_drop_stext_page(ctx, text); fz_catch(ctx) fz_rethrow(ctx); - fz_drop_stext_page(ctx, text); return buf; } fz_buffer * fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) { - fz_stext_sheet *sheet; fz_stext_page *text; fz_buffer *buf; - sheet = fz_new_stext_sheet(ctx); + text = fz_new_stext_page_from_page(ctx, page, options); fz_try(ctx) - { - text = fz_new_stext_page_from_page(ctx, page, sheet, options); buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); - } fz_always(ctx) - fz_drop_stext_sheet(ctx, sheet); + fz_drop_stext_page(ctx, text); fz_catch(ctx) fz_rethrow(ctx); - fz_drop_stext_page(ctx, text); return buf; } diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c index de05ab65..e1303fb8 100644 --- a/source/tools/mudraw.c +++ b/source/tools/mudraw.c @@ -248,7 +248,6 @@ static int band_height = 0; static int lowmemory = 0; static int errored = 0; -static fz_stext_sheet *sheet = NULL; static fz_colorspace *colorspace; static int spots = 0; static int alpha; @@ -391,9 +390,6 @@ 
file_level_headers(fz_context *ctx) if (output_format == OUT_STEXT || output_format == OUT_TRACE) fz_write_printf(ctx, out, "\n"); - if (output_format == OUT_TEXT || output_format == OUT_HTML || output_format == OUT_XHTML || output_format == OUT_STEXT) - sheet = fz_new_stext_sheet(ctx); - if (output_format == OUT_HTML) fz_print_stext_header_as_html(ctx, out); if (output_format == OUT_XHTML) @@ -422,8 +418,6 @@ file_level_trailers(fz_context *ctx) if (output_format == OUT_PS) fz_write_ps_file_trailer(ctx, out, output_pagenum); - - fz_drop_stext_sheet(ctx, sheet); } static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit) @@ -534,7 +528,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0; text = fz_new_stext_page(ctx, &mediabox); - dev = fz_new_stext_device(ctx, sheet, text, &stext_options); + dev = fz_new_stext_device(ctx, text, &stext_options); if (lowmemory) fz_enable_device_hints(ctx, dev, FZ_NO_CACHE); if (list) @@ -550,12 +544,10 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in } else if (output_format == OUT_HTML) { - fz_analyze_text(ctx, sheet, text); fz_print_stext_page_as_html(ctx, out, text); } else if (output_format == OUT_XHTML) { - fz_analyze_text(ctx, sheet, text); fz_print_stext_page_as_xhtml(ctx, out, text); } else if (output_format == OUT_TEXT) diff --git a/source/tools/murun.c b/source/tools/murun.c index b7443286..7a713903 100644 --- a/source/tools/murun.c +++ b/source/tools/murun.c @@ -1827,19 +1827,13 @@ static void ffi_Page_toStructuredText(js_State *J) fz_context *ctx = js_getcontext(J); fz_page *page = ffi_topage(J, 0); const char *options = js_iscoercible(J, 1) ? 
js_tostring(J, 1) : NULL; - fz_stext_sheet *sheet = NULL; fz_stext_options so; fz_stext_page *text; - fz_var(sheet); - fz_try(ctx) { - sheet = fz_new_stext_sheet(ctx); fz_parse_stext_options(ctx, &so, options); - text = fz_new_stext_page_from_page(ctx, page, sheet, &so); + text = fz_new_stext_page_from_page(ctx, page, &so); } - fz_always(ctx) - fz_drop_stext_sheet(ctx, sheet); fz_catch(ctx) rethrow(J); @@ -2673,19 +2667,13 @@ static void ffi_DisplayList_toStructuredText(js_State *J) fz_context *ctx = js_getcontext(J); fz_display_list *list = js_touserdata(J, 0, "fz_display_list"); const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL; - fz_stext_sheet *sheet = NULL; fz_stext_options so; fz_stext_page *text; - fz_var(sheet); - fz_try(ctx) { - sheet = fz_new_stext_sheet(ctx); fz_parse_stext_options(ctx, &so, options); - text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so); + text = fz_new_stext_page_from_display_list(ctx, list, &so); } - fz_always(ctx) - fz_drop_stext_sheet(ctx, sheet); fz_catch(ctx) rethrow(J); -- cgit v1.2.3