summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2017-08-01 18:15:23 +0200
committerTor Andersson <tor.andersson@artifex.com>2017-08-17 13:38:48 +0200
commit626ea2ea771735492c9a4350ae02b26ea09d1423 (patch)
treec92241b181a51719cbb47402bad98bb1984bf963
parente349ba5984fe837d3eec9649d718efe16169ca44 (diff)
downloadmupdf-626ea2ea771735492c9a4350ae02b26ea09d1423.tar.xz
Simplify stext structure and device.
* Use pool allocator and linked lists for all levels.
* Remove separate fz_stext_sheet struct.
* Remove unused 'script' style.
* Remove 'span' level items.
* Detect visual/logical RTL layouts.
* Detect indented paragraphs.
-rw-r--r--include/mupdf/fitz/font.h6
-rw-r--r--include/mupdf/fitz/structured-text.h188
-rw-r--r--include/mupdf/fitz/util.h8
-rw-r--r--platform/java/mupdf_native.c2
-rw-r--r--platform/win32/libmupdf.vcproj4
-rw-r--r--platform/x11/pdfapp.c61
-rw-r--r--platform/x11/pdfapp.h1
-rw-r--r--source/fitz/font.c22
-rw-r--r--source/fitz/stext-device.c1047
-rw-r--r--source/fitz/stext-output.c386
-rw-r--r--source/fitz/stext-paragraph.c1538
-rw-r--r--source/fitz/stext-search.c137
-rw-r--r--source/fitz/util.c109
-rw-r--r--source/tools/mudraw.c10
-rw-r--r--source/tools/murun.c16
15 files changed, 681 insertions, 2854 deletions
diff --git a/include/mupdf/fitz/font.h b/include/mupdf/fitz/font.h
index a6e172a1..ef4cd74d 100644
--- a/include/mupdf/fitz/font.h
+++ b/include/mupdf/fitz/font.h
@@ -601,6 +601,12 @@ int fz_encode_character_with_fallback(fz_context *ctx, fz_font *font, int unicod
void fz_get_glyph_name(fz_context *ctx, fz_font *font, int glyph, char *buf, int size);
/*
+ Get font ascender and descender values.
+*/
+float fz_font_ascender(fz_context *ctx, fz_font *font);
+float fz_font_descender(fz_context *ctx, fz_font *font);
+
+/*
Internal functions for our Harfbuzz integration
to work around the lack of thread safety.
*/
diff --git a/include/mupdf/fitz/structured-text.h b/include/mupdf/fitz/structured-text.h
index 61ee30ad..0f3364b3 100644
--- a/include/mupdf/fitz/structured-text.h
+++ b/include/mupdf/fitz/structured-text.h
@@ -16,15 +16,9 @@
(In development - Subject to change in future versions)
*/
-typedef struct fz_stext_style_s fz_stext_style;
typedef struct fz_stext_char_s fz_stext_char;
-typedef struct fz_stext_span_s fz_stext_span;
typedef struct fz_stext_line_s fz_stext_line;
typedef struct fz_stext_block_s fz_stext_block;
-typedef struct fz_image_block_s fz_image_block;
-typedef struct fz_page_block_s fz_page_block;
-
-typedef struct fz_stext_sheet_s fz_stext_sheet;
typedef struct fz_stext_page_s fz_stext_page;
/*
@@ -52,150 +46,58 @@ enum
};
/*
- fz_stext_sheet: A text sheet contains a list of distinct text styles
- used on a page (or a series of pages).
-*/
-struct fz_stext_sheet_s
-{
- int maxid;
- fz_stext_style *style;
-};
-
-/*
- fz_stext_style: A text style contains details of a distinct text style
- used on a page.
-*/
-struct fz_stext_style_s
-{
- fz_stext_style *next;
- int id;
- fz_font *font;
- float size;
- int wmode;
- int script;
- /* Ascender and Descender only have the conventional sense in
- * horizontal mode; in vertical mode they are rotated too - they are
- * the maximum and minimum bounds respectively. */
- float ascender;
- float descender;
- /* etc... */
-};
-
-/*
- fz_stext_page: A text page is a list of page blocks, together with
- an overall bounding box.
+ A text page is a list of blocks, together with an overall bounding box.
*/
struct fz_stext_page_s
{
+ fz_pool *pool;
fz_rect mediabox;
- int len, cap;
- fz_page_block *blocks;
- fz_stext_page *next;
-};
-
-/*
- fz_page_block: A page block is a typed block pointer.
-*/
-struct fz_page_block_s
-{
- int type;
- union
- {
- fz_stext_block *text;
- fz_image_block *image;
- } u;
+ fz_stext_block *first_block, *last_block;
};
enum
{
- FZ_PAGE_BLOCK_TEXT = 0,
- FZ_PAGE_BLOCK_IMAGE = 1
+ FZ_STEXT_BLOCK_TEXT = 0,
+ FZ_STEXT_BLOCK_IMAGE = 1
};
/*
- fz_stext_block: A text block is a list of lines of text. In typical
- cases this may correspond to a paragraph or a column of text. A
- collection of blocks makes up a page.
+ A text block is a list of lines of text, or an image.
*/
struct fz_stext_block_s
{
+ int type;
fz_rect bbox;
- int len, cap;
- fz_stext_line *lines;
-};
-
-/*
- fz_image_block: An image block is an image, together with the list of lines of text. In typical
- cases this may correspond to a paragraph or a column of text. A
- collection of blocks makes up a page.
-*/
-struct fz_image_block_s
-{
- fz_rect bbox;
- fz_matrix mat;
- fz_image *image;
- fz_colorspace *cspace;
- float colors[FZ_MAX_COLORS];
+ union {
+ struct { fz_stext_line *first_line, *last_line; } t;
+ struct { fz_matrix transform; fz_image *image; } i;
+ } u;
+ fz_stext_block *next;
};
/*
- fz_stext_line: A text line is a list of text spans, with the same
- baseline. In typical cases this should correspond (as expected) to
- complete lines of text. A collection of lines makes up a block.
+ A text line is a list of characters that share a common baseline.
*/
struct fz_stext_line_s
{
- fz_stext_span *first_span, *last_span;
-
- /* Cached information */
- float distance; /* Perpendicular distance from previous line */
- fz_rect bbox;
- void *region; /* Opaque value for matching line masks */
-};
-
-/*
- fz_stext_span: A text span is a list of characters that share a common
- baseline/transformation. In typical cases a single span may be enough
- to represent a complete line. In cases where the text has big gaps in
- it (perhaps as it crosses columns or tables), a line may be represented
- by multiple spans.
-*/
-struct fz_stext_span_s
-{
- int len, cap;
- fz_stext_char *text;
- fz_point min; /* Device space */
- fz_point max; /* Device space */
int wmode; /* 0 for horizontal, 1 for vertical */
- fz_matrix transform; /* e and f are always 0 here */
- /* Ascender_max and Descender_min only have the conventional sense in
- * horizontal mode; in vertical mode they are rotated too - they are
- * the maximum and minimum bounds respectively. */
- float ascender_max; /* Document space */
- float descender_min; /* Document space */
- fz_rect bbox; /* Device space */
-
- /* Cached information */
- float base_offset; /* Perpendicular distance from baseline of line */
- float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
- int column; /* If non zero, the column that it's in */
- float column_width; /* Percentage */
- int align; /* 0 = left, 1 = centre, 2 = right */
- float indent; /* The indent position for this column. */
-
- fz_stext_span *next;
+ fz_rect bbox;
+ fz_stext_char *first_char, *last_char;
+ fz_stext_line *next;
};
/*
- fz_stext_char: A text char is a unicode character, the style in which
- is appears, and the point at which it is positioned. Transform
- (and hence bbox) information is given by the enclosing span.
+ A text char is a unicode character, the style in which it appears, and
+ the point at which it is positioned.
*/
struct fz_stext_char_s
{
- fz_point p; /* Device space */
- int c;
- fz_stext_style *style;
+ int c, rtl;
+ fz_point origin;
+ fz_rect bbox;
+ float size;
+ fz_font *font;
+ fz_stext_char *next;
};
typedef struct fz_char_and_box_s fz_char_and_box;
@@ -212,43 +114,29 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex
/*
fz_stext_char_bbox: Return the bbox of a text char. Calculated from
- the supplied enclosing span.
-
- bbox: A place to store the bbox
+ the supplied enclosing line.
- span: The enclosing span
+ bbox: A place to store the bbox.
- idx: The index of the char within the span
+ line: The enclosing line.
- Returns bbox (updated)
+ ch: The character.
- Does not throw exceptions
+ Returns bbox (updated).
*/
-fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx);
-
-/*
- fz_new_stext_sheet: Create an empty style sheet.
-
- The style sheet is filled out by the text device, creating
- one style for each unique font, color, size combination that
- is used.
-*/
-fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx);
-void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet);
+fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch);
/*
fz_new_stext_page: Create an empty text page.
- The text page is filled out by the text device to contain the blocks,
- lines and spans of text on the page.
+ The text page is filled out by the text device to contain the blocks
+ and lines of text on the page.
mediabox: optional mediabox information.
*/
fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);
void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
-void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page);
-
/*
fz_print_stext_page_as_html: Output a page to a file in HTML (visual) format.
*/
@@ -314,14 +202,10 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts
/*
fz_new_stext_device: Create a device to extract the text on a page.
- Gather and sort the text on a page into spans of uniform style,
- arranged into lines and blocks by reading order. The reading order
- is determined by various heuristics, so may not be accurate.
+ Gather the text on a page into blocks and lines.
- sheet: The text sheet to which styles should be added. This can
- either be a newly created (empty) text sheet, or one containing
- styles from a previous text device. The same sheet cannot be used
- in multiple threads simultaneously.
+ The reading order is taken from the order the text is drawn in the
+ source file, so may not be accurate.
page: The text page to which content should be added. This will
usually be a newly created (empty) text page, but it can be one
@@ -330,6 +214,6 @@ fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts
options: Options to configure the stext device.
*/
-fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options);
+fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
#endif
diff --git a/include/mupdf/fitz/util.h b/include/mupdf/fitz/util.h
index d452b58a..4b827cad 100644
--- a/include/mupdf/fitz/util.h
+++ b/include/mupdf/fitz/util.h
@@ -36,11 +36,11 @@ fz_pixmap *fz_new_pixmap_from_page_contents(fz_context *ctx, fz_page *page, cons
fz_pixmap *fz_new_pixmap_from_annot(fz_context *ctx, fz_annot *annot, const fz_matrix *ctm, fz_colorspace *cs, int alpha);
/*
- fz_new_stext_page_from_page: Extract structured text from a page. The sheet must not be NULL.
+ fz_new_stext_page_from_page: Extract structured text from a page.
*/
-fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options);
-fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options);
-fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options);
+fz_stext_page *fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options);
+fz_stext_page *fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options);
+fz_stext_page *fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options);
/*
fz_new_buffer_from_stext_page: Convert structured text into plain text, cropped by the selection rectangle.
diff --git a/platform/java/mupdf_native.c b/platform/java/mupdf_native.c
index bed3358d..ce5e6fea 100644
--- a/platform/java/mupdf_native.c
+++ b/platform/java/mupdf_native.c
@@ -5111,8 +5111,6 @@ FUN(Page_textAsHtml)(JNIEnv *env, jobject self)
fz_run_page(ctx, page, dev, &ctm, NULL);
fz_close_device(ctx, dev);
- fz_analyze_text(ctx, sheet, text);
-
buf = fz_new_buffer(ctx, 256);
out = fz_new_output_with_buffer(ctx, buf);
fz_write_printf(ctx, out, "<html>\n");
diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj
index fc195fce..3add80ed 100644
--- a/platform/win32/libmupdf.vcproj
+++ b/platform/win32/libmupdf.vcproj
@@ -1869,10 +1869,6 @@
>
</File>
<File
- RelativePath="..\..\source\fitz\stext-paragraph.c"
- >
- </File>
- <File
RelativePath="..\..\source\fitz\stext-search.c"
>
</File>
diff --git a/platform/x11/pdfapp.c b/platform/x11/pdfapp.c
index 61366a44..6b08c4aa 100644
--- a/platform/x11/pdfapp.c
+++ b/platform/x11/pdfapp.c
@@ -470,9 +470,6 @@ void pdfapp_close(pdfapp_t *app)
fz_drop_stext_page(app->ctx, app->page_text);
app->page_text = NULL;
- fz_drop_stext_sheet(app->ctx, app->page_sheet);
- app->page_sheet = NULL;
-
fz_drop_link(app->ctx, app->page_links);
app->page_links = NULL;
@@ -655,14 +652,12 @@ static void pdfapp_loadpage(pdfapp_t *app, int no_cache)
fz_drop_display_list(app->ctx, app->page_list);
fz_drop_display_list(app->ctx, app->annotations_list);
fz_drop_stext_page(app->ctx, app->page_text);
- fz_drop_stext_sheet(app->ctx, app->page_sheet);
fz_drop_link(app->ctx, app->page_links);
fz_drop_page(app->ctx, app->page);
app->page_list = NULL;
app->annotations_list = NULL;
app->page_text = NULL;
- app->page_sheet = NULL;
app->page_links = NULL;
app->page = NULL;
app->page_bbox.x0 = 0;
@@ -875,12 +870,11 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai
app->hit_count = 0;
/* Extract text */
- app->page_sheet = fz_new_stext_sheet(app->ctx);
app->page_text = fz_new_stext_page(app->ctx, fz_bound_page(app->ctx, app->page, &mediabox));
if (app->page_list || app->annotations_list)
{
- tdev = fz_new_stext_device(app->ctx, app->page_sheet, app->page_text, NULL);
+ tdev = fz_new_stext_device(app->ctx, app->page_text, NULL);
pdfapp_runpage(app, tdev, &fz_identity, &fz_infinite_rect, &cookie);
fz_close_device(app->ctx, tdev);
fz_drop_device(app->ctx, tdev);
@@ -1905,8 +1899,10 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
fz_rect hitbox;
fz_matrix ctm;
fz_stext_page *page = app->page_text;
- int c, i, p, need_newline;
- int block_num;
+ int p, need_newline;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
int x0 = app->selr.x0;
int x1 = app->selr.x1;
@@ -1918,50 +1914,37 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
p = 0;
need_newline = 0;
- for (block_num = 0; block_num < page->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- fz_stext_line *line;
- fz_stext_block *block;
- fz_stext_span *span;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = page->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
int saw_text = 0;
-
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- for (i = 0; i < span->len; i++)
+ int c = ch->c;
+ fz_stext_char_bbox(app->ctx, &hitbox, line, ch);
+ /* Character boxes come from a text device run with the identity matrix; transform them by ctm so they can be compared with the selection rectangle. */
+ fz_transform_rect(&hitbox, &ctm);
+ if (c < 32)
+ c = 0xFFFD;
+ if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
{
- fz_stext_char_bbox(app->ctx, &hitbox, span, i);
- fz_transform_rect(&hitbox, &ctm);
- c = span->text[i].c;
- if (c < 32)
- c = '?';
- if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ saw_text = 1;
+ if (need_newline)
{
- saw_text = 1;
-
- if (need_newline)
- {
#if defined(_WIN32) || defined(_WIN64)
- if (p < ucslen - 1)
- ucsbuf[p++] = '\r';
+ if (p < ucslen - 1)
+ ucsbuf[p++] = '\r';
#endif
- if (p < ucslen - 1)
- ucsbuf[p++] = '\n';
- need_newline = 0;
- }
-
if (p < ucslen - 1)
- ucsbuf[p++] = c;
+ ucsbuf[p++] = '\n';
+ need_newline = 0;
}
+ if (p < ucslen - 1)
+ ucsbuf[p++] = c;
}
}
-
if (saw_text)
need_newline = 1;
}
diff --git a/platform/x11/pdfapp.h b/platform/x11/pdfapp.h
index 28a83481..09d8f16a 100644
--- a/platform/x11/pdfapp.h
+++ b/platform/x11/pdfapp.h
@@ -91,7 +91,6 @@ struct pdfapp_s
fz_display_list *page_list;
fz_display_list *annotations_list;
fz_stext_page *page_text;
- fz_stext_sheet *page_sheet;
fz_link *page_links;
int errored;
int incomplete;
diff --git a/source/fitz/font.c b/source/fitz/font.c
index eb7c8c35..dfe4ab24 100644
--- a/source/fitz/font.c
+++ b/source/fitz/font.c
@@ -193,6 +193,28 @@ fz_set_font_bbox(fz_context *ctx, fz_font *font, float xmin, float ymin, float x
}
}
+/*
+	Return the font's ascender, expressed relative to the em square
+	(FreeType design units divided by units_per_EM).
+
+	Type 3 fonts use the font bbox (bbox.y1). The same fallback is used
+	when there is no FreeType face or the face reports a units_per_EM
+	of zero, so the result is never the product of a division by zero.
+*/
+float fz_font_ascender(fz_context *ctx, fz_font *font)
+{
+	FT_Face face = font->ft_face;
+
+	if (font->t3procs || !face || face->units_per_EM == 0)
+		return font->bbox.y1;
+	return (float)face->ascender / face->units_per_EM;
+}
+
+/*
+	Return the font's descender, expressed relative to the em square
+	(FreeType design units divided by units_per_EM).
+
+	Type 3 fonts use the font bbox (bbox.y0). The same fallback is used
+	when there is no FreeType face or the face reports a units_per_EM
+	of zero, so the result is never the product of a division by zero.
+*/
+float fz_font_descender(fz_context *ctx, fz_font *font)
+{
+	FT_Face face = font->ft_face;
+
+	if (font->t3procs || !face || face->units_per_EM == 0)
+		return font->bbox.y0;
+	return (float)face->descender / face->units_per_EM;
+}
+
/*
* Freetype hooks
*/
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index 73fa309e..166f5aa0 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -4,36 +4,25 @@
#include <math.h>
#include <float.h>
-/* Extract text into an unsorted span soup. */
+#include <stdio.h> /* for debug printing */
+
+/* Extract text into blocks and lines. */
#define LINE_DIST 0.9f
#define SPACE_DIST 0.15f
#define SPACE_MAX_DIST 0.8f
#define PARAGRAPH_DIST 0.5f
-#include <stdio.h> /* for debug printing */
-#undef DEBUG_SPANS
-#undef DEBUG_INTERNALS
-#undef DEBUG_LINE_HEIGHTS
-#undef DEBUG_MASKS
-#undef DEBUG_ALIGN
-#undef DEBUG_INDENTS
-
-#include <ft2build.h>
-#include FT_FREETYPE_H
-#include FT_ADVANCES_H
-
typedef struct fz_stext_device_s fz_stext_device;
-typedef struct span_soup_s span_soup;
-
struct fz_stext_device_s
{
fz_device super;
- fz_stext_sheet *sheet;
fz_stext_page *page;
- span_soup *spans;
- fz_stext_span *cur_span;
+ fz_point pen, start;
+ fz_matrix trm;
+ int new_obj;
+ int curdir;
int lastchar;
int flags;
};
@@ -42,553 +31,235 @@ const char *fz_stext_options_usage =
"Structured text output options:\n"
"\tpreserve-ligatures: do not expand all ligatures into constituent characters\n"
"\tpreserve-whitespace: do not convert all whitespace characters into spaces\n"
+ "\tpreserve-images: keep images in output\n"
"\n";
-static fz_rect *
-add_point_to_rect(fz_rect *a, const fz_point *p)
+fz_rect *
+fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_line *line, fz_stext_char *ch)
{
- if (p->x < a->x0)
- a->x0 = p->x;
- if (p->x > a->x1)
- a->x1 = p->x;
- if (p->y < a->y0)
- a->y0 = p->y;
- if (p->y > a->y1)
- a->y1 = p->y;
- return a;
+ *bbox = ch->bbox;
+ return bbox;
}
-fz_rect *
-fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int i)
+fz_stext_page *
+fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox)
{
- fz_point a, d;
- const fz_point *max;
- fz_stext_char *ch;
-
- if (!span || i >= span->len)
- {
- *bbox = fz_empty_rect;
- return bbox;
- }
- ch = &span->text[i];
- if (i == span->len-1)
- max = &span->max;
- else
- max = &span->text[i+1].p;
- if (span->wmode == 0)
+ fz_pool *pool = fz_new_pool(ctx);
+ fz_stext_page *page;
+ fz_try(ctx)
{
- a.x = 0;
- a.y = span->ascender_max;
- d.x = 0;
- d.y = span->descender_min;
+ page = fz_pool_alloc(ctx, pool, sizeof(*page));
+ page->pool = pool;
+ page->mediabox = *mediabox;
+ page->first_block = NULL;
+ page->last_block = NULL;
}
- else
+ fz_catch(ctx)
{
- a.x = span->ascender_max;
- a.y = 0;
- d.x = span->descender_min;
- d.y = 0;
+ fz_drop_pool(ctx, pool);
+ fz_rethrow(ctx);
}
- fz_transform_vector(&a, &span->transform);
- fz_transform_vector(&d, &span->transform);
- bbox->x0 = bbox->x1 = ch->p.x + a.x;
- bbox->y0 = bbox->y1 = ch->p.y + a.y;
- a.x += max->x;
- a.y += max->y;
- add_point_to_rect(bbox, &a);
- a.x = ch->p.x + d.x;
- a.y = ch->p.y + d.y;
- add_point_to_rect(bbox, &a);
- a.x = max->x + d.x;
- a.y = max->y + d.y;
- add_point_to_rect(bbox, &a);
- return bbox;
+ return page;
}
-static void
-add_bbox_to_span(fz_stext_span *span)
+void
+fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
{
- fz_point a, d;
- fz_rect *bbox = &span->bbox;
-
- if (!span)
- return;
- if (span->wmode == 0)
- {
- a.x = 0;
- a.y = span->ascender_max;
- d.x = 0;
- d.y = span->descender_min;
- }
- else
+ if (page)
{
- a.x = span->ascender_max;
- a.y = 0;
- d.x = span->descender_min;
- d.y = 0;
+ fz_stext_block *block;
+ for (block = page->first_block; block; block = block->next)
+ if (block->type == FZ_STEXT_BLOCK_IMAGE)
+ fz_drop_image(ctx, block->u.i.image);
+ fz_drop_pool(ctx, page->pool);
}
- fz_transform_vector(&a, &span->transform);
- fz_transform_vector(&d, &span->transform);
- bbox->x0 = bbox->x1 = span->min.x + a.x;
- bbox->y0 = bbox->y1 = span->min.y + a.y;
- a.x += span->max.x;
- a.y += span->max.y;
- add_point_to_rect(bbox, &a);
- a.x = span->min.x + d.x;
- a.y = span->min.y + d.y;
- add_point_to_rect(bbox, &a);
- a.x = span->max.x + d.x;
- a.y = span->max.y + d.y;
- add_point_to_rect(bbox, &a);
}
-struct span_soup_s
-{
- int len, cap;
- fz_stext_span **spans;
-};
-
-static span_soup *
-new_span_soup(fz_context *ctx)
+static fz_stext_block *
+add_block_to_page(fz_context *ctx, fz_stext_page *page)
{
- span_soup *soup = fz_malloc_struct(ctx, span_soup);
- soup->len = 0;
- soup->cap = 0;
- soup->spans = NULL;
- return soup;
+ fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
+ if (!page->first_block)
+ page->first_block = page->last_block = block;
+ else
+ {
+ page->last_block->next = block;
+ page->last_block = block;
+ }
+ return block;
}
-static void
-free_span_soup(fz_context *ctx, span_soup *soup)
+static fz_stext_block *
+add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
{
- int i;
-
- if (soup == NULL)
- return;
- for (i = 0; i < soup->len; i++)
- {
- fz_free(ctx, soup->spans[i]);
- }
- fz_free(ctx, soup->spans);
- fz_free(ctx, soup);
+ fz_stext_block *block = add_block_to_page(ctx, page);
+ block->type = FZ_STEXT_BLOCK_TEXT;
+ return block;
}
-static void
-add_span_to_soup(fz_context *ctx, span_soup *soup, fz_stext_span *span)
+static fz_stext_block *
+add_image_block_to_page(fz_context *ctx, fz_stext_page *page, const fz_matrix *ctm, fz_image *image)
{
- if (span == NULL)
- return;
- if (soup->len == soup->cap)
- {
- int newcap = (soup->cap ? soup->cap * 2 : 16);
- soup->spans = fz_resize_array(ctx, soup->spans, newcap, sizeof(*soup->spans));
- soup->cap = newcap;
- }
- add_bbox_to_span(span);
- soup->spans[soup->len++] = span;
+ fz_stext_block *block = add_block_to_page(ctx, page);
+ block->type = FZ_STEXT_BLOCK_IMAGE;
+ block->u.i.transform = *ctm;
+ block->u.i.image = fz_keep_image(ctx, image);
+ block->bbox.x0 = 0;
+ block->bbox.y0 = 0;
+ block->bbox.x1 = 1;
+ block->bbox.y1 = 1;
+ fz_transform_rect(&block->bbox, ctm);
+ return block;
}
static fz_stext_line *
-push_span(fz_context *ctx, fz_stext_device *tdev, fz_stext_span *span, int new_line, float distance)
+add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, int wmode)
{
- fz_stext_line *line;
- fz_stext_block *block;
- fz_stext_page *page = tdev->page;
- int prev_not_text = 0;
-
- if (page->len == 0 || page->blocks[page->len-1].type != FZ_PAGE_BLOCK_TEXT)
- prev_not_text = 1;
-
- if (new_line || prev_not_text)
- {
- float size = fz_matrix_expansion(&span->transform);
- /* So, a new line. Part of the same block or not? */
- if (distance == 0 || distance > size * 1.5f || distance < -size * PARAGRAPH_DIST || page->len == 0 || prev_not_text)
- {
- /* New block */
- if (page->len == page->cap)
- {
- int newcap = (page->cap ? page->cap*2 : 4);
- page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks));
- page->cap = newcap;
- }
- block = fz_malloc_struct(ctx, fz_stext_block);
- page->blocks[page->len].type = FZ_PAGE_BLOCK_TEXT;
- page->blocks[page->len].u.text = block;
- block->cap = 0;
- block->len = 0;
- block->lines = 0;
- block->bbox = fz_empty_rect;
- page->len++;
- distance = 0;
- }
-
- /* New line */
- block = page->blocks[page->len-1].u.text;
- if (block->len == block->cap)
- {
- int newcap = (block->cap ? block->cap*2 : 4);
- block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines));
- block->cap = newcap;
- }
- block->lines[block->len].first_span = NULL;
- block->lines[block->len].last_span = NULL;
- block->lines[block->len].distance = distance;
- block->lines[block->len].bbox = fz_empty_rect;
- block->len++;
- }
-
- /* Find last line and append to it */
- block = page->blocks[page->len-1].u.text;
- line = &block->lines[block->len-1];
-
- fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox);
- fz_union_rect(&block->bbox, &span->bbox);
- span->base_offset = (new_line ? 0 : distance);
-
- if (!line->first_span)
- {
- line->first_span = line->last_span = span;
- span->next = NULL;
- }
+ fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
+ if (!block->u.t.first_line)
+ block->u.t.first_line = block->u.t.last_line = line;
else
{
- line->last_span->next = span;
- line->last_span = span;
+ block->u.t.last_line->next = line;
+ block->u.t.last_line = line;
}
+ line->wmode = wmode;
+
return line;
}
-#if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS)
-static void
-dump_span(fz_stext_span *s)
+static float min4(float a, float b, float c, float d)
{
- int i;
- for (i=0; i < s->len; i++)
- {
- printf("%c", s->text[i].c);
- }
+ return fz_min(fz_min(a, b), fz_min(c, d));
}
-#endif
-#ifdef DEBUG_ALIGN
-static void
-dump_line(fz_stext_line *line)
+static float max4(float a, float b, float c, float d)
{
- int i;
- for (i=0; i < line->len; i++)
- {
- fz_stext_span *s = line->spans[i];
- if (s->spacing > 1)
- printf(" ");
- dump_span(s);
- }
- printf("\n");
+ return fz_max(fz_max(a, b), fz_max(c, d));
}
-#endif
-static void
-strain_soup(fz_context *ctx, fz_stext_device *tdev)
+static fz_stext_char *
+add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, const fz_matrix *trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int rtl)
{
- span_soup *soup = tdev->spans;
- fz_stext_line *last_line = NULL;
- fz_stext_span *last_span = NULL;
- int span_num;
-
- if (soup == NULL)
- return;
+ fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
+ fz_point a, d;
- /* Really dumb implementation to match what we had before */
- for (span_num=0; span_num < soup->len; span_num++)
+ if (!line->first_char)
+ line->first_char = line->last_char = ch;
+ else
{
- fz_stext_span *span = soup->spans[span_num];
- int new_line = 1;
- float distance = 0;
- float spacing = 0;
- soup->spans[span_num] = NULL;
- if (last_span)
- {
- /* If we have a last_span, we must have a last_line */
- /* Do span and last_line share the same baseline? */
- fz_point p, q, perp_r;
- float dot;
- float size = fz_matrix_expansion(&span->transform);
-
-#ifdef DEBUG_SPANS
- {
- printf("Comparing: \"");
- dump_span(last_span);
- printf("\" and \"");
- dump_span(span);
- printf("\"\n");
- }
-#endif
-
- p.x = last_line->first_span->max.x - last_line->first_span->min.x;
- p.y = last_line->first_span->max.y - last_line->first_span->min.y;
- fz_normalize_vector(&p);
- q.x = span->max.x - span->min.x;
- q.y = span->max.y - span->min.y;
- fz_normalize_vector(&q);
-#ifdef DEBUG_SPANS
- printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y);
- printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y);
-#endif
- perp_r.y = last_line->first_span->min.x - span->min.x;
- perp_r.x = -(last_line->first_span->min.y - span->min.y);
- /* Check if p and q are parallel. If so, then this
- * line is parallel with the last one. */
- dot = p.x * q.x + p.y * q.y;
- if (fabsf(dot) > 0.9995f)
- {
- /* If we take the dot product of normalised(p) and
- * perp(r), we get the perpendicular distance from
- * one line to the next (assuming they are parallel). */
- distance = p.x * perp_r.x + p.y * perp_r.y;
- /* We allow 'small' distances of baseline changes
- * to cope with super/subscript. FIXME: We should
- * gather subscript/superscript information here. */
- new_line = (fabsf(distance) > size * LINE_DIST);
- }
- else
- {
- new_line = 1;
- distance = 0;
- }
- if (!new_line)
- {
- fz_point delta;
-
- delta.x = span->min.x - last_span->max.x;
- delta.y = span->min.y - last_span->max.y;
-
- spacing = (p.x * delta.x + p.y * delta.y);
- spacing = fabsf(spacing);
- /* Only allow changes in baseline (subscript/superscript etc)
- * when the spacing is small. */
- if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f)
- {
- new_line = 1;
- distance = 0;
- spacing = 0;
- }
- else
- {
- spacing /= size * SPACE_DIST;
- /* Apply the same logic here as when we're adding chars to build spans. */
- if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
- spacing = 1;
- }
- }
-#ifdef DEBUG_SPANS
- printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing);
-#endif
- }
- span->spacing = spacing;
- last_line = push_span(ctx, tdev, span, new_line, distance);
- last_span = span;
+ line->last_char->next = ch;
+ line->last_char = ch;
}
-}
-
-fz_stext_sheet *
-fz_new_stext_sheet(fz_context *ctx)
-{
- fz_stext_sheet *sheet = fz_malloc(ctx, sizeof *sheet);
- sheet->maxid = 0;
- sheet->style = NULL;
- return sheet;
-}
-void
-fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet)
-{
- fz_stext_style *style;
+ ch->c = c;
+ ch->rtl = rtl;
+ ch->origin = *p;
+ ch->size = size;
+ ch->font = font; /* TODO: keep and drop */
- if (sheet == NULL)
- return;
-
- style = sheet->style;
- while (style)
+ if (line->wmode == 0)
{
- fz_stext_style *next = style->next;
- fz_drop_font(ctx, style->font);
- fz_free(ctx, style);
- style = next;
+ a.x = 0;
+ d.x = 0;
+ a.y = fz_font_ascender(ctx, font);
+ d.y = fz_font_descender(ctx, font);
}
- fz_free(ctx, sheet);
-}
-
-static fz_stext_style *
-fz_lookup_stext_style_imp(fz_context *ctx, fz_stext_sheet *sheet,
- float size, fz_font *font, int wmode, int script)
-{
- fz_stext_style *style;
-
- for (style = sheet->style; style; style = style->next)
+ else
{
- if (style->font == font &&
- style->size == size &&
- style->wmode == wmode &&
- style->script == script) /* FIXME: others */
- {
- return style;
- }
+ fz_rect *bbox = fz_font_bbox(ctx, font);
+ a.x = bbox->x1;
+ d.x = bbox->x0;
+ a.y = 0;
+ d.y = 0;
}
+ fz_transform_vector(&a, trm);
+ fz_transform_vector(&d, trm);
- /* Better make a new one and add it to our list */
- style = fz_malloc(ctx, sizeof *style);
- style->id = sheet->maxid++;
- style->font = fz_keep_font(ctx, font);
- style->size = size;
- style->wmode = wmode;
- style->script = script;
- style->next = sheet->style;
- sheet->style = style;
- return style;
-}
+ ch->bbox.x0 = min4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x);
+ ch->bbox.x1 = max4(p->x + a.x, q->x + a.x, p->x + d.x, q->x + d.x);
+ ch->bbox.y0 = min4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y);
+ ch->bbox.y1 = max4(p->y + a.y, q->y + a.y, p->y + d.y, q->y + d.y);
-static fz_stext_style *
-fz_lookup_stext_style(fz_context *ctx, fz_stext_sheet *sheet, fz_text_span *span, const fz_matrix *ctm,
- fz_colorspace *colorspace, const float *color, float alpha, const fz_stroke_state *stroke)
-{
- float size = 1.0f;
- fz_font *font = span ? span->font : NULL;
- int wmode = span ? span->wmode : 0;
- if (ctm && span)
+ if (fz_is_empty_rect(&line->bbox))
+ line->bbox = ch->bbox;
+ else
{
- fz_matrix tm = span->trm;
- fz_matrix trm;
- tm.e = 0;
- tm.f = 0;
- fz_concat(&trm, &tm, ctm);
- size = fz_matrix_expansion(&trm);
+ /* Grow the line bbox to enclose the new character: the lower bounds take the minimum, the upper bounds take the maximum. */
+ line->bbox.x0 = fz_min(line->bbox.x0, ch->bbox.x0);
+ line->bbox.y0 = fz_min(line->bbox.y0, ch->bbox.y0);
+ line->bbox.x1 = fz_max(line->bbox.x1, ch->bbox.x1);
+ line->bbox.y1 = fz_max(line->bbox.y1, ch->bbox.y1);
}
- return fz_lookup_stext_style_imp(ctx, sheet, size, font, wmode, 0);
-}
-fz_stext_page *
-fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox)
-{
- fz_stext_page *page = fz_malloc(ctx, sizeof(*page));
- page->mediabox = *mediabox;
- page->len = 0;
- page->cap = 0;
- page->blocks = NULL;
- page->next = NULL;
- return page;
+ return ch;
}
-static void
-fz_drop_stext_line_contents(fz_context *ctx, fz_stext_line *line)
+static int
+direction_from_bidi_class(int bidiclass, int curdir)
{
- fz_stext_span *span, *next;
- for (span = line->first_span; span; span=next)
+ switch (bidiclass)
{
- next = span->next;
- fz_free(ctx, span->text);
- fz_free(ctx, span);
- }
-}
+ /* strong */
+ case UCDN_BIDI_CLASS_L: return 1;
+ case UCDN_BIDI_CLASS_R: return -1;
+ case UCDN_BIDI_CLASS_AL: return -1;
-static void
-fz_drop_stext_block(fz_context *ctx, fz_stext_block *block)
-{
- fz_stext_line *line;
- if (block == NULL)
- return;
- for (line = block->lines; line < block->lines + block->len; line++)
- fz_drop_stext_line_contents(ctx, line);
- fz_free(ctx, block->lines);
- fz_free(ctx, block);
-}
+ /* weak */
+ case UCDN_BIDI_CLASS_EN:
+ case UCDN_BIDI_CLASS_ES:
+ case UCDN_BIDI_CLASS_ET:
+ case UCDN_BIDI_CLASS_AN:
+ case UCDN_BIDI_CLASS_CS:
+ case UCDN_BIDI_CLASS_NSM:
+ case UCDN_BIDI_CLASS_BN:
+ return curdir;
-static void
-fz_drop_image_block(fz_context *ctx, fz_image_block *block)
-{
- if (block == NULL)
- return;
- fz_drop_image(ctx, block->image);
- fz_drop_colorspace(ctx, block->cspace);
- fz_free(ctx, block);
-}
+ /* neutral */
+ case UCDN_BIDI_CLASS_B:
+ case UCDN_BIDI_CLASS_S:
+ case UCDN_BIDI_CLASS_WS:
+ case UCDN_BIDI_CLASS_ON:
+ return curdir;
-void
-fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
-{
- fz_page_block *block;
- if (page == NULL)
- return;
- for (block = page->blocks; block < page->blocks + page->len; block++)
- {
- switch (block->type)
- {
- case FZ_PAGE_BLOCK_TEXT:
- fz_drop_stext_block(ctx, block->u.text);
- break;
- case FZ_PAGE_BLOCK_IMAGE:
- fz_drop_image_block(ctx, block->u.image);
- break;
- }
+ /* embedding, override, pop ... we don't support them */
+ default:
+ return 0;
}
- fz_free(ctx, page->blocks);
- fz_free(ctx, page);
}
-static fz_stext_span *
-fz_new_stext_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm)
+static int
+sign_eq(float x, float y)
{
- fz_stext_span *span = fz_malloc_struct(ctx, fz_stext_span);
- span->ascender_max = 0;
- span->descender_min = 0;
- span->cap = 0;
- span->len = 0;
- span->min = *p;
- span->max = *p;
- span->wmode = wmode;
- span->transform.a = trm->a;
- span->transform.b = trm->b;
- span->transform.c = trm->c;
- span->transform.d = trm->d;
- span->transform.e = 0;
- span->transform.f = 0;
- span->text = NULL;
- span->next = NULL;
- return span;
+ return (x < 0 && y < 0) || (x > 0 && y > 0) || (x == 0 && y == 0);
}
-static void
-add_char_to_span(fz_context *ctx, fz_stext_span *span, int c, fz_point *p, fz_point *max, fz_stext_style *style)
+static int
+mat_sign_eq(const fz_matrix *x, const fz_matrix *y)
{
- if (span->len == span->cap)
- {
- int newcap = (span->cap ? span->cap * 2 : 16);
- span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_stext_char));
- span->cap = newcap;
- span->bbox = fz_empty_rect;
- }
- span->max = *max;
- if (style->ascender > span->ascender_max)
- span->ascender_max = style->ascender;
- if (style->descender < span->descender_min)
- span->descender_min = style->descender;
- span->text[span->len].c = c;
- span->text[span->len].p = *p;
- span->text[span->len].style = style;
- span->len++;
+ return sign_eq(x->a, y->a) && sign_eq(x->b, y->b) && sign_eq(x->c, y->c) && sign_eq(x->d, y->d);
}
static void
-fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode)
+fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode)
{
- int can_append = 1;
+ fz_stext_page *page = dev->page;
+ fz_stext_block *cur_block;
+ fz_stext_line *cur_line;
+
+ int new_para = 0;
+ int new_line = 1;
int add_space = 0;
- fz_point dir, ndir, p, q, r;
+ fz_point dir, ndir, p, q;
float size;
fz_point delta;
float spacing = 0;
float base_offset = 0;
+ int rtl = 0;
+
+ dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir);
+ /* dir = direction vector for motion. ndir = normalised(dir) */
if (wmode == 0)
{
dir.x = 1;
@@ -602,17 +273,16 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
fz_transform_vector(&dir, trm);
ndir = dir;
fz_normalize_vector(&ndir);
- /* dir = direction vector for motion. ndir = normalised(dir) */
size = fz_matrix_expansion(trm);
/* We need to identify where glyphs 'start' (p) and 'stop' (q).
- * Each glyph holds it's 'start' position, and the next glyph in the
- * span (or span->max if there is no next glyph) holds it's 'end'
+ * Each glyph holds its 'start' position, and the next glyph in the
+ * span (or span->max if there is no next glyph) holds its 'end'
* position.
*
* For both horizontal and vertical motion, trm->{e,f} gives the
- * bottom left corner of the glyph.
+ * origin (usually the bottom left) of the glyph.
*
* In horizontal mode:
* + p is bottom left.
@@ -636,37 +306,38 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
q.y = trm->f;
}
- if (glyph < 0)
+ /* Find current position to enter new text. */
+ cur_block = page->last_block;
+ if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
+ cur_block = NULL;
+ cur_line = cur_block ? cur_block->u.t.last_line : NULL;
+
+ if (cur_line && glyph < 0)
{
- /* Don't reset 'pen' to start of no-glyph characters in cluster */
- if (dev->cur_span)
- q = dev->cur_span->max;
- goto no_glyph;
+ /* Don't advance pen or break lines for no-glyph characters in a cluster */
+ add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, 0);
+ dev->lastchar = c;
+ return;
}
- if (dev->cur_span == NULL ||
- trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b ||
- trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d ||
- dev->cur_span->wmode != wmode)
+ if (cur_line == NULL || !mat_sign_eq(trm, &dev->trm) || cur_line->wmode != wmode)
{
- /* If the matrix has changed, or the wmode is different (or
- * if we don't have a span at all), then we can't append. */
-#ifdef DEBUG_SPANS
- printf("Transform/WMode changed\n");
-#endif
- can_append = 0;
+ /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
+ * then we can't append to the current block/line. */
+ new_para = 1;
+ new_line = 1;
}
else
{
- delta.x = q.x - dev->cur_span->max.x;
- delta.y = q.y - dev->cur_span->max.y;
+ /* Detect fake bold where text is printed twice in the same place. */
+ delta.x = q.x - dev->pen.x;
+ delta.y = q.y - dev->pen.y;
if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar)
return;
- /* Calculate how far we've moved since the end of the current
- * span. */
- delta.x = p.x - dev->cur_span->max.x;
- delta.y = p.y - dev->cur_span->max.y;
+ /* Calculate how far we've moved since the last character. */
+ delta.x = p.x - dev->pen.x;
+ delta.y = p.y - dev->pen.y;
/* The transform has not changed, so we know we're in the same
* direction. Calculate 2 distances; how far off the previous
@@ -675,102 +346,129 @@ fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_stext_style *sty
spacing = ndir.x * delta.x + ndir.y * delta.y;
base_offset = -ndir.y * delta.x + ndir.x * delta.y;
- spacing /= size * SPACE_DIST;
- if (fabsf(base_offset) < size * 0.1f)
+ /* Only a small amount off the baseline - we'll take this */
+ if (fabsf(base_offset) < size * 0.8f)
{
- /* Only a small amount off the baseline - we'll take this */
- if (fabsf(spacing) < 1.0f)
+ /* LTR or neutral character */
+ if (dev->curdir >= 0)
{
- /* Motion is in line, and small. */
- }
- else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
- {
- /* Motion is in line, but large enough
- * to warrant us adding a space */
- if (dev->lastchar != ' ' && wmode == 0)
- add_space = 1;
+ if (fabs(spacing) < size * SPACE_DIST)
+ {
+ /* Motion is in line, and small. */
+ new_line = 0;
+ }
+ else if (spacing >= size * SPACE_DIST && spacing < size * SPACE_MAX_DIST)
+ {
+ /* Motion is in line, but large enough to warrant us adding a space. */
+ if (dev->lastchar != ' ' && wmode == 0)
+ add_space = 1;
+ new_line = 0;
+ }
+ else
+ {
+ /* Motion is in line, but large enough to warrant splitting to a new line */
+ new_line = 1;
+ }
}
+
+ /* RTL character -- disable space character and column detection heuristics */
else
{
- /* Motion is in line, but too large - split to a new span */
- can_append = 0;
+ new_line = 0;
+ if (spacing > size * SPACE_DIST || spacing < 0)
+ rtl = 0; /* backward (or big jump to 'right' side) means logical order */
+ else
+ rtl = 1; /* visual order, we need to reverse in a post process pass */
}
}
+
+ /* Enough for a new line, but not enough for a new paragraph */
+ else if (fabsf(base_offset) < size * 1.3f)
+ {
+ /* Check indent to spot text-indent style paragraphs */
+ if (wmode == 0 && cur_line && dev->new_obj)
+ if (fabsf(p.x - dev->start.x) > size * 0.5f)
+ new_para = 1;
+ new_line = 1;
+ }
+
+ /* Way off the baseline - open a new paragraph */
else
{
- can_append = 0;
-#ifdef DEBUG_SPANS
- spacing = 0;
-#endif
+ new_para = 1;
+ new_line = 1;
}
}
-#ifdef DEBUG_SPANS
- printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset);
-#endif
+ /* Start a new block (but only at the beginning of a text object) */
+ if (new_para || !cur_block)
+ {
+ cur_block = add_text_block_to_page(ctx, page);
+ cur_line = cur_block->u.t.last_line;
+ }
- /* Start a new span */
- if (!can_append)
+ /* Start a new line */
+ if (new_line || !cur_line)
{
- add_span_to_soup(ctx, dev->spans, dev->cur_span);
- dev->cur_span = NULL;
- dev->cur_span = fz_new_stext_span(ctx, &p, wmode, trm);
- dev->cur_span->spacing = 0;
+ cur_line = add_line_to_block(ctx, page, cur_block, wmode);
+ dev->start = p;
}
/* Add synthetic space */
if (add_space)
- {
- /* We know we always have a cur_span here */
- r = dev->cur_span->max;
- add_char_to_span(ctx, dev->cur_span, ' ', &r, &p, style);
- }
+ add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, rtl);
-no_glyph:
- add_char_to_span(ctx, dev->cur_span, c, &p, &q, style);
+ add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, rtl);
dev->lastchar = c;
+ dev->pen = q;
+
+ dev->new_obj = 0;
+ dev->trm = *trm;
}
static void
-fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style, int c, int glyph, fz_matrix *trm, float adv, int wmode)
+fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix *trm, float adv, int wmode)
{
/* ignore when one unicode character maps to multiple glyphs */
if (c == -1)
return;
if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
+ {
switch (c)
{
case 0xFB00: /* ff */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
return;
case 0xFB01: /* fi */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
return;
case 0xFB02: /* fl */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
return;
case 0xFB03: /* ffi */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'i', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
return;
case 0xFB04: /* ffl */
- fz_add_stext_char_imp(ctx, dev, style, 'f', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'f', -1, trm, 0, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 'l', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
return;
case 0xFB05: /* long st */
case 0xFB06: /* st */
- fz_add_stext_char_imp(ctx, dev, style, 's', glyph, trm, adv, wmode);
- fz_add_stext_char_imp(ctx, dev, style, 't', -1, trm, 0, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode);
return;
}
+ }
if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
+ {
switch (c)
{
case 0x0009: /* tab */
@@ -794,56 +492,23 @@ fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_stext_style *style,
case 0x3000: /* ideographic space */
c = ' ';
}
+ }
- fz_add_stext_char_imp(ctx, dev, style, c, glyph, trm, adv, wmode);
+ fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode);
}
static void
-fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm, fz_stext_style *style)
+fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, const fz_matrix *ctm)
{
fz_font *font = span->font;
- FT_Face face = fz_font_ft_face(ctx, font);
- fz_buffer **t3procs = fz_font_t3_procs(ctx, font);
- fz_rect *bbox = fz_font_bbox(ctx, font);
fz_matrix tm = span->trm;
fz_matrix trm;
float adv;
- float ascender = 1;
- float descender = 0;
- int i, err;
+ int i;
if (span->len == 0)
return;
- if (dev->spans == NULL)
- dev->spans = new_span_soup(ctx);
-
- if (style->wmode == 0)
- {
- if (face)
- {
- fz_lock(ctx, FZ_LOCK_FREETYPE);
- err = FT_Set_Char_Size(face, 64, 64, 72, 72);
- if (err)
- fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
- ascender = (float)face->ascender / face->units_per_EM;
- descender = (float)face->descender / face->units_per_EM;
- fz_unlock(ctx, FZ_LOCK_FREETYPE);
- }
- else if (t3procs && !fz_is_empty_rect(bbox))
- {
- ascender = bbox->y1;
- descender = bbox->y0;
- }
- }
- else
- {
- ascender = bbox->x1;
- descender = bbox->x0;
- }
- style->ascender = ascender;
- style->descender = descender;
-
tm.e = 0;
tm.f = 0;
fz_concat(&trm, &tm, ctm);
@@ -857,11 +522,11 @@ fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, cons
/* Calculate bounding box and new pen position based on font metrics */
if (span->items[i].gid >= 0)
- adv = fz_advance_glyph(ctx, font, span->items[i].gid, style->wmode);
+ adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode);
else
adv = 0;
- fz_add_stext_char(ctx, dev, style, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode);
+ fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, &trm, adv, span->wmode);
}
}
@@ -870,13 +535,10 @@ fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, const f
fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_style *style;
fz_text_span *span;
+ tdev->new_obj = 1;
for (span = text->head; span; span = span->next)
- {
- style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, NULL);
- fz_stext_extract(ctx, tdev, span, ctm, style);
- }
+ fz_stext_extract(ctx, tdev, span, ctm);
}
static void
@@ -884,94 +546,61 @@ fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const
fz_colorspace *colorspace, const float *color, float alpha, const fz_color_params *color_params)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_style *style;
fz_text_span *span;
+ tdev->new_obj = 1;
for (span = text->head; span; span = span->next)
- {
- style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, colorspace, color, alpha, stroke);
- fz_stext_extract(ctx, tdev, span, ctm, style);
- }
+ fz_stext_extract(ctx, tdev, span, ctm);
}
static void
fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm, const fz_rect *scissor)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_style *style;
fz_text_span *span;
+ tdev->new_obj = 1;
for (span = text->head; span; span = span->next)
- {
- style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL);
- fz_stext_extract(ctx, tdev, span, ctm, style);
- }
+ fz_stext_extract(ctx, tdev, span, ctm);
}
static void
fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, const fz_matrix *ctm, const fz_rect *scissor)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_style *style;
fz_text_span *span;
+ tdev->new_obj = 1;
for (span = text->head; span; span = span->next)
- {
- style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, stroke);
- fz_stext_extract(ctx, tdev, span, ctm, style);
- }
+ fz_stext_extract(ctx, tdev, span, ctm);
}
static void
fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_matrix *ctm)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_style *style;
fz_text_span *span;
+ tdev->new_obj = 1;
for (span = text->head; span; span = span->next)
- {
- style = fz_lookup_stext_style(ctx, tdev->sheet, span, ctm, NULL, NULL, 0, NULL);
- fz_stext_extract(ctx, tdev, span, ctm, style);
- }
+ fz_stext_extract(ctx, tdev, span, ctm);
}
+/* Images and shadings */
+
static void
-fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm,
- fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params)
+fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
- fz_stext_page *page = tdev->page;
- fz_image_block *block;
- /* If the alpha is less than 50% then it's probably a watermark or
- * effect or something. Skip it */
+ /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
if (alpha < 0.5f)
return;
- /* New block */
- if (page->len == page->cap)
- {
- int newcap = (page->cap ? page->cap*2 : 4);
- page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks));
- page->cap = newcap;
- }
- block = fz_malloc_struct(ctx, fz_image_block);
- page->blocks[page->len].type = FZ_PAGE_BLOCK_IMAGE;
- page->blocks[page->len].u.image = block;
- block->image = fz_keep_image(ctx, img);
- block->cspace = fz_keep_colorspace(ctx, cspace);
- if (cspace)
- memcpy(block->colors, color, sizeof(block->colors[0])*fz_colorspace_n(ctx, cspace));
- block->mat = *ctm;
- block->bbox.x0 = 0;
- block->bbox.y0 = 0;
- block->bbox.x1 = 1;
- block->bbox.y1 = 1;
- fz_transform_rect(&block->bbox, ctm);
- page->len++;
+ add_image_block_to_page(ctx, tdev->page, ctm, img);
}
static void
-fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm, float alpha, const fz_color_params *color_params)
+fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, const fz_matrix *ctm,
+ fz_colorspace *cspace, const float *color, float alpha, const fz_color_params *color_params)
{
- fz_stext_fill_image_mask(ctx, dev, img, ctm, NULL, NULL, alpha, color_params);
+ fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
}
static fz_image *
@@ -1025,103 +654,89 @@ fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, const fz_m
fz_rethrow(ctx);
}
-static int
-direction_from_bidi_class(int bidiclass, int curdir)
-{
- switch (bidiclass)
- {
- /* strong */
- case UCDN_BIDI_CLASS_L: return 1;
- case UCDN_BIDI_CLASS_R: return -1;
- case UCDN_BIDI_CLASS_AL: return -1;
-
- /* weak */
- case UCDN_BIDI_CLASS_EN:
- case UCDN_BIDI_CLASS_ES:
- case UCDN_BIDI_CLASS_ET:
- case UCDN_BIDI_CLASS_AN:
- case UCDN_BIDI_CLASS_CS:
- case UCDN_BIDI_CLASS_NSM:
- case UCDN_BIDI_CLASS_BN:
- return curdir;
-
- /* neutral */
- case UCDN_BIDI_CLASS_B:
- case UCDN_BIDI_CLASS_S:
- case UCDN_BIDI_CLASS_WS:
- case UCDN_BIDI_CLASS_ON:
- return curdir;
-
- /* embedding, override, pop ... we don't support them */
- default:
- return 0;
- }
-}
+/* RTL visual to logical order pass */
static void
-fz_bidi_reorder_run(fz_stext_span *span, int a, int b, int dir)
+fz_bidi_reorder_run(fz_stext_char *a, fz_stext_char *b, int dir)
{
if (a < b && dir == -1)
{
- fz_stext_char c;
- int m = a + (b - a) / 2;
+ fz_stext_char tmp;
+ fz_stext_char *m = a + (b - a) / 2;
while (a < m)
{
b--;
- c = span->text[a];
- span->text[a] = span->text[b];
- span->text[b] = c;
+
+ tmp.c = a->c;
+ tmp.origin = a->origin;
+ tmp.bbox = a->bbox;
+ tmp.size = a->size;
+ tmp.font = a->font;
+
+ a->c = b->c;
+ a->origin = b->origin;
+ a->bbox = b->bbox;
+ a->size = b->size;
+ a->font = b->font;
+
+ b->c = tmp.c;
+ b->origin = tmp.origin;
+ b->bbox = tmp.bbox;
+ b->size = tmp.size;
+ b->font = tmp.font;
+
a++;
}
}
}
static void
-fz_bidi_reorder_span(fz_stext_span *span)
+fz_bidi_reorder_line(fz_stext_line *line)
{
- int a, b, dir, curdir;
+ fz_stext_char *a, *b;
+ int dir, curdir;
- a = 0;
- curdir = 1;
- for (b = 0; b < span->len; b++)
+ a = line->first_char;
+ curdir = 0;
+ for (b = line->first_char; b; b = b->next)
{
- dir = direction_from_bidi_class(ucdn_get_bidi_class(span->text[b].c), curdir);
+ dir = b->rtl;
if (dir != curdir)
{
- fz_bidi_reorder_run(span, a, b, curdir);
+ fz_bidi_reorder_run(a, b, curdir);
curdir = dir;
a = b;
}
}
- fz_bidi_reorder_run(span, a, b, curdir);
+ fz_bidi_reorder_run(a, b, curdir);
}
static void
fz_bidi_reorder_stext_page(fz_context *ctx, fz_stext_page *page)
{
- fz_page_block *pageblock;
fz_stext_block *block;
fz_stext_line *line;
- fz_stext_span *span;
- for (pageblock = page->blocks; pageblock < page->blocks + page->len; pageblock++)
- if (pageblock->type == FZ_PAGE_BLOCK_TEXT)
- for (block = pageblock->u.text, line = block->lines; line < block->lines + block->len; line++)
- for (span = line->first_span; span; span = span->next)
- fz_bidi_reorder_span(span);
+ for (block = page->first_block; block; block = block->next)
+ if (block->type == FZ_STEXT_BLOCK_TEXT)
+ for (line = block->u.t.first_line; line; line = line->next)
+ fz_bidi_reorder_line(line);
}
static void
fz_stext_close_device(fz_context *ctx, fz_device *dev)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
+ fz_stext_page *page = tdev->page;
+ fz_stext_block *block;
+ fz_stext_line *line;
- add_span_to_soup(ctx, tdev->spans, tdev->cur_span);
- tdev->cur_span = NULL;
-
- strain_soup(ctx, tdev);
+ for (block = page->first_block; block; block = block->next)
+ if (block->type == FZ_STEXT_BLOCK_TEXT)
+ for (line = block->u.t.first_line; line; line = line->next)
+ fz_union_rect(&block->bbox, &line->bbox);
- /* TODO: smart sorting of blocks in reading order */
+ /* TODO: smart sorting of blocks and lines in reading order */
/* TODO: unicode NFC normalization */
fz_bidi_reorder_stext_page(ctx, tdev->page);
@@ -1130,9 +745,6 @@ fz_stext_close_device(fz_context *ctx, fz_device *dev)
static void
fz_stext_drop_device(fz_context *ctx, fz_device *dev)
{
- fz_stext_device *tdev = (fz_stext_device*)dev;
- free_span_soup(ctx, tdev->spans);
- tdev->spans = NULL;
}
fz_stext_options *
@@ -1153,7 +765,7 @@ fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *stri
}
fz_device *
-fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *opts)
+fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
{
fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
@@ -1174,11 +786,12 @@ fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page,
dev->super.fill_image_mask = fz_stext_fill_image_mask;
}
- dev->sheet = sheet;
dev->page = page;
- dev->spans = NULL;
- dev->cur_span = NULL;
+ dev->pen.x = 0;
+ dev->pen.y = 0;
+ dev->trm = fz_identity;
dev->lastchar = ' ';
+ dev->curdir = 1;
return (fz_device*)dev;
}
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c
index 63124aa7..f5f72412 100644
--- a/source/fitz/stext-output.c
+++ b/source/fitz/stext-output.c
@@ -9,40 +9,28 @@
/* HTML output (visual formatting with preserved layout) */
static void
-fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int is_serif = fz_font_is_serif(ctx, style->font);
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int script = style->script;
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
+ int is_serif = fz_font_is_serif(ctx, font);
+ int is_mono = fz_font_is_monospaced(ctx, font);
- fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", style->size);
+ fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt;\">", is_serif ? "serif" : "sans-serif", size);
if (is_mono)
fz_write_string(ctx, out, "<tt>");
if (is_bold)
fz_write_string(ctx, out, "<b>");
if (is_italic)
fz_write_string(ctx, out, "<i>");
-
- while (script-- > 0)
- fz_write_string(ctx, out, "<sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "<sub>");
}
static void
-fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
-
- while (script-- > 0)
- fz_write_string(ctx, out, "</sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "</sub>");
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx,font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_italic)
fz_write_string(ctx, out, "</i>");
@@ -54,7 +42,7 @@ fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_stext_style *style)
}
static void
-fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *block)
+fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
int x = block->bbox.x0;
int y = block->bbox.y0;
@@ -62,90 +50,78 @@ fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_image_block *bl
int h = block->bbox.y1 - block->bbox.y0;
fz_write_printf(ctx, out, "<img style=\"top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"data:", y, x, w, h);
- fz_write_image_as_data_uri(ctx, out, block->image);
+ fz_write_image_as_data_uri(ctx, out, block->u.i.image);
fz_write_string(ctx, out, "\">\n");
}
void
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
- fz_stext_style *style = NULL;
fz_stext_line *line;
- fz_stext_span *span;
fz_stext_char *ch;
int x, y;
- style = NULL;
+ fz_font *font = NULL;
+ float size = 0;
- for (line = block->lines; line < block->lines + block->len; ++line)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (span = line->first_span; span; span = span->next)
+ x = line->bbox.x0;
+ y = line->bbox.y0;
+
+ fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
+ font = NULL;
+
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span == line->first_span || span->spacing > 1)
+ if (ch->font != font || ch->size != size)
{
- if (style)
- {
- fz_print_style_end_html(ctx, out, style);
- fz_write_string(ctx, out, "</p>\n");
- style = NULL;
- }
- x = span->bbox.x0;
- y = span->bbox.y0;
- fz_write_printf(ctx, out, "<p style=\"top:%dpt;left:%dpt;\">", y, x);
+ if (font)
+ fz_print_style_end_html(ctx, out, font, size);
+ font = ch->font;
+ size = ch->size;
+ fz_print_style_begin_html(ctx, out, font, size);
}
- for (ch = span->text; ch < span->text + span->len; ++ch)
+ switch (ch->c)
{
- if (ch->style != style)
- {
- if (style)
- fz_print_style_end_html(ctx, out, style);
- style = ch->style;
- fz_print_style_begin_html(ctx, out, style);
- }
-
- switch (ch->c)
- {
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_byte(ctx, out, ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- }
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_byte(ctx, out, ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
}
}
- if (style)
- {
- fz_print_style_end_html(ctx, out, style);
- fz_write_string(ctx, out, "</p>\n");
- style = NULL;
- }
+ if (font)
+ fz_print_style_end_html(ctx, out, font, size);
+
+ fz_write_string(ctx, out, "</p>\n");
}
}
void
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *block;
+ fz_stext_block *block;
int w = page->mediabox.x1 - page->mediabox.x0;
int h = page->mediabox.y1 - page->mediabox.y0;
fz_write_printf(ctx, out, "<div style=\"width:%dpt;height:%dpt\">\n", w, h);
- for (block = page->blocks; block < page->blocks + page->len; ++block)
+ for (block = page->first_block; block; block = block->next)
{
- if (block->type == FZ_PAGE_BLOCK_IMAGE)
- fz_print_stext_image_as_html(ctx, out, block->u.image);
- else if (block->type == FZ_PAGE_BLOCK_TEXT)
- fz_print_stext_block_as_html(ctx, out, block->u.text);
+ if (block->type == FZ_STEXT_BLOCK_IMAGE)
+ fz_print_stext_image_as_html(ctx, out, block);
+ else if (block->type == FZ_STEXT_BLOCK_TEXT)
+ fz_print_stext_block_as_html(ctx, out, block);
}
fz_write_string(ctx, out, "</div>\n");
@@ -177,23 +153,22 @@ fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
/* XHTML output (semantic, little layout, suitable for reflow) */
static void
-fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_image_block *block)
+fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
int w = block->bbox.x1 - block->bbox.x0;
int h = block->bbox.y1 - block->bbox.y0;
fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"data:", w, h);
- fz_write_image_as_data_uri(ctx, out, block->image);
+ fz_write_image_as_data_uri(ctx, out, block->u.i.image);
fz_write_string(ctx, out, "\"/></p>\n");
}
static void
-fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_mono)
fz_write_string(ctx, out, "<tt>");
@@ -201,25 +176,14 @@ fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *styl
fz_write_string(ctx, out, "<b>");
if (is_italic)
fz_write_string(ctx, out, "<i>");
-
- while (script-- > 0)
- fz_write_string(ctx, out, "<sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "<sub>");
}
static void
-fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
+fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, float size)
{
- int is_mono = fz_font_is_monospaced(ctx, style->font);
- int is_bold = fz_font_is_bold(ctx, style->font);
- int is_italic = fz_font_is_italic(ctx, style->font);
- int script = style->script;
-
- while (script-- > 0)
- fz_write_string(ctx, out, "</sup>");
- while (++script < 0)
- fz_write_string(ctx, out, "</sub>");
+ int is_mono = fz_font_is_monospaced(ctx, font);
+ int is_bold = fz_font_is_bold(ctx, font);
+ int is_italic = fz_font_is_italic(ctx, font);
if (is_italic)
fz_write_string(ctx, out, "</i>");
@@ -232,68 +196,63 @@ fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_stext_style *style)
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
fz_stext_line *line;
- fz_stext_span *span;
fz_stext_char *ch;
- fz_stext_style *style;
- style = NULL;
- fz_write_string(ctx, out, "<p>\n");
+ fz_font *font = NULL;
+ float size = 0;
+
+ fz_write_string(ctx, out, "<p>");
- for (line = block->lines; line < block->lines + block->len; ++line)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- if (line > block->lines)
- fz_write_string(ctx, out, "<br/>\n");
- for (span = line->first_span; span; span = span->next)
+ if (line != block->u.t.first_line)
+ fz_write_string(ctx, out, "\n");
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span->spacing > 1)
- fz_write_byte(ctx, out, ' ');
-
- for (ch = span->text; ch < span->text + span->len; ++ch)
+ if (ch->font != font || ch->size != size)
{
- if (ch->style != style)
- {
- if (style)
- fz_print_style_end_xhtml(ctx, out, style);
- style = ch->style;
- fz_print_style_begin_xhtml(ctx, out, style);
- }
+ if (font)
+ fz_print_style_end_xhtml(ctx, out, font, size);
+ font = ch->font;
+ size = ch->size;
+ fz_print_style_begin_xhtml(ctx, out, font, size);
+ }
- switch (ch->c)
- {
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_byte(ctx, out, ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- }
+ switch (ch->c)
+ {
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_byte(ctx, out, ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
}
}
}
- if (style)
- fz_print_style_end_xhtml(ctx, out, style);
- fz_write_string(ctx, out, "\n</p>\n");
+ if (font)
+ fz_print_style_end_xhtml(ctx, out, font, size);
+ fz_write_string(ctx, out, "</p>\n");
}
void
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *block;
+ fz_stext_block *block;
fz_write_string(ctx, out, "<div>\n");
- for (block = page->blocks; block < page->blocks + page->len; ++block)
+ for (block = page->first_block; block; block = block->next)
{
- if (block->type == FZ_PAGE_BLOCK_IMAGE)
- fz_print_stext_image_as_xhtml(ctx, out, block->u.image);
- else if (block->type == FZ_PAGE_BLOCK_TEXT)
- fz_print_stext_block_as_xhtml(ctx, out, block->u.text);
+ if (block->type == FZ_STEXT_BLOCK_IMAGE)
+ fz_print_stext_image_as_xhtml(ctx, out, block);
+ else if (block->type == FZ_STEXT_BLOCK_TEXT)
+ fz_print_stext_block_as_xhtml(ctx, out, block);
}
fz_write_string(ctx, out, "</div>\n");
@@ -311,6 +270,7 @@ fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
fz_write_string(ctx, out, "<style>\n");
fz_write_string(ctx, out, "body{background-color:gray}\n");
fz_write_string(ctx, out, "div{background-color:white;margin:1em;padding:1em}\n");
+ fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
fz_write_string(ctx, out, "</style>\n");
fz_write_string(ctx, out, "</head>\n");
fz_write_string(ctx, out, "<body>\n");
@@ -328,87 +288,79 @@ fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
void
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- int block_n;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
fz_write_printf(ctx, out, "<page width=\"%g\" height=\"%g\">\n",
page->mediabox.x1 - page->mediabox.x0,
page->mediabox.y1 - page->mediabox.y0);
- for (block_n = 0; block_n < page->len; block_n++)
+ for (block = page->first_block; block; block = block->next)
{
- switch (page->blocks[block_n].type)
- {
- case FZ_PAGE_BLOCK_TEXT:
+ switch (block->type)
{
- fz_stext_block *block = page->blocks[block_n].u.text;
- fz_stext_line *line;
- const char *s;
-
+ case FZ_STEXT_BLOCK_TEXT:
fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
- block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
- for (line = block->lines; line < block->lines + block->len; line++)
+ block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
+ for (line = block->u.t.first_line; line; line = line->next)
{
- fz_stext_span *span;
+ fz_font *font = NULL;
+ float size = 0;
+ const char *name = NULL;
+ const char *s;
+ fz_rect rect;
+
fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\">\n",
- line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
- for (span = line->first_span; span; span = span->next)
+ line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
+
+ for (ch = line->first_char; ch; ch = ch->next)
{
- fz_stext_style *style = NULL;
- const char *name = NULL;
- int char_num;
- for (char_num = 0; char_num < span->len; char_num++)
+ if (ch->font != font || ch->size != size)
+ {
+ if (font)
+ fz_write_string(ctx, out, "</font>\n");
+ font = ch->font;
+ size = ch->size;
+ name = fz_font_name(ctx, font);
+ s = strchr(name, '+');
+ s = s ? s + 1 : name;
+ fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", s, size);
+ }
+ fz_stext_char_bbox(ctx, &rect, line, ch);
+ fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
+ rect.x0, rect.y0, rect.x1, rect.y1, ch->origin.x, ch->origin.y);
+ switch (ch->c)
{
- fz_stext_char *ch = &span->text[char_num];
- if (ch->style != style)
- {
- if (style)
- {
- fz_write_string(ctx, out, "</span>\n");
- }
- style = ch->style;
- name = fz_font_name(ctx, style->font);
- s = strchr(name, '+');
- s = s ? s + 1 : name;
- fz_write_printf(ctx, out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
- span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
- s, style->size);
- }
- {
- fz_rect rect;
- fz_stext_char_bbox(ctx, &rect, span, char_num);
- fz_write_printf(ctx, out, "<char bbox=\"%g %g %g %g\" x=\"%g\" y=\"%g\" c=\"",
- rect.x0, rect.y0, rect.x1, rect.y1, ch->p.x, ch->p.y);
- }
- switch (ch->c)
- {
- case '<': fz_write_string(ctx, out, "&lt;"); break;
- case '>': fz_write_string(ctx, out, "&gt;"); break;
- case '&': fz_write_string(ctx, out, "&amp;"); break;
- case '"': fz_write_string(ctx, out, "&quot;"); break;
- case '\'': fz_write_string(ctx, out, "&apos;"); break;
- default:
- if (ch->c >= 32 && ch->c <= 127)
- fz_write_printf(ctx, out, "%c", ch->c);
- else
- fz_write_printf(ctx, out, "&#x%x;", ch->c);
- break;
- }
- fz_write_string(ctx, out, "\"/>\n");
+ case '<': fz_write_string(ctx, out, "&lt;"); break;
+ case '>': fz_write_string(ctx, out, "&gt;"); break;
+ case '&': fz_write_string(ctx, out, "&amp;"); break;
+ case '"': fz_write_string(ctx, out, "&quot;"); break;
+ case '\'': fz_write_string(ctx, out, "&apos;"); break;
+ default:
+ if (ch->c >= 32 && ch->c <= 127)
+ fz_write_printf(ctx, out, "%c", ch->c);
+ else
+ fz_write_printf(ctx, out, "&#x%x;", ch->c);
+ break;
}
- if (style)
- fz_write_string(ctx, out, "</span>\n");
+ fz_write_string(ctx, out, "\"/>\n");
}
+
+ if (font)
+ fz_write_string(ctx, out, "</font>\n");
+
fz_write_string(ctx, out, "</line>\n");
}
fz_write_string(ctx, out, "</block>\n");
break;
- }
- case FZ_PAGE_BLOCK_IMAGE:
- {
+
+ case FZ_STEXT_BLOCK_IMAGE:
+ fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
+ block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
break;
}
}
- }
fz_write_string(ctx, out, "</page>\n");
}
@@ -417,31 +369,23 @@ fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page)
void
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
{
- fz_page_block *pblock;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
+ char utf[10];
+ int i, n;
- for (pblock = page->blocks; pblock < page->blocks + page->len; ++pblock)
+ for (block = page->first_block; block; block = block->next)
{
- if (pblock->type == FZ_PAGE_BLOCK_TEXT)
+ if (block->type == FZ_STEXT_BLOCK_TEXT)
{
- fz_stext_block *block = pblock->u.text;
- fz_stext_line *line;
- fz_stext_char *ch;
- char utf[10];
- int i, n;
-
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- fz_stext_span *span;
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span->spacing > 1)
- fz_write_byte(ctx, out, ' ');
- for (ch = span->text; ch < span->text + span->len; ch++)
- {
- n = fz_runetochar(utf, ch->c);
- for (i = 0; i < n; i++)
- fz_write_byte(ctx, out, utf[i]);
- }
+ n = fz_runetochar(utf, ch->c);
+ for (i = 0; i < n; i++)
+ fz_write_byte(ctx, out, utf[i]);
}
fz_write_string(ctx, out, "\n");
}
@@ -466,7 +410,6 @@ struct fz_text_writer_s
fz_document_writer super;
int format;
fz_stext_options opts;
- fz_stext_sheet *sheet;
fz_stext_page *page;
fz_output *out;
};
@@ -483,7 +426,7 @@ text_begin_page(fz_context *ctx, fz_document_writer *wri_, const fz_rect *mediab
}
wri->page = fz_new_stext_page(ctx, mediabox);
- return fz_new_stext_device(ctx, wri->sheet, wri->page, &wri->opts);
+ return fz_new_stext_device(ctx, wri->page, &wri->opts);
}
static void
@@ -537,7 +480,6 @@ text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
{
fz_text_writer *wri = (fz_text_writer*)wri_;
fz_drop_stext_page(ctx, wri->page);
- fz_drop_stext_sheet(ctx, wri->sheet);
fz_drop_output(ctx, wri->out);
}
@@ -561,7 +503,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
else if (!strcmp(format, "stext"))
wri->format = FZ_FORMAT_STEXT;
- wri->sheet = fz_new_stext_sheet(ctx);
wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
switch (wri->format)
@@ -581,7 +522,6 @@ fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const
fz_catch(ctx)
{
fz_drop_output(ctx, wri->out);
- fz_drop_stext_sheet(ctx, wri->sheet);
fz_free(ctx, wri);
fz_rethrow(ctx);
}
diff --git a/source/fitz/stext-paragraph.c b/source/fitz/stext-paragraph.c
deleted file mode 100644
index e275ecae..00000000
--- a/source/fitz/stext-paragraph.c
+++ /dev/null
@@ -1,1538 +0,0 @@
-#include "mupdf/fitz.h"
-
-#include <string.h>
-#include <assert.h>
-#include <math.h>
-
-/* Assemble span soup into blocks and lines. */
-
-#define MY_EPSILON 0.001f
-
-#include <stdio.h> /* for debug printing */
-#undef DEBUG_LINE_HEIGHTS
-#undef DEBUG_MASKS
-#undef DEBUG_ALIGN
-#undef DEBUG_INDENTS
-
-#undef SPOT_LINE_NUMBERS
-
-typedef struct line_height_s
-{
- float height;
- int count;
- fz_stext_style *style;
-} line_height;
-
-typedef struct line_heights_s
-{
- fz_context *ctx;
- int cap;
- int len;
- line_height *lh;
-} line_heights;
-
-static line_heights *
-new_line_heights(fz_context *ctx)
-{
- line_heights *lh = fz_malloc_struct(ctx, line_heights);
- lh->ctx = ctx;
- return lh;
-}
-
-static void
-free_line_heights(line_heights *lh)
-{
- if (!lh)
- return;
- fz_free(lh->ctx, lh->lh);
- fz_free(lh->ctx, lh);
-}
-
-static void
-insert_line_height(line_heights *lh, fz_stext_style *style, float height)
-{
- int i;
-
-#ifdef DEBUG_LINE_HEIGHTS
- printf("style=%x height=%g\n", style, height);
-#endif
-
- /* If we have one already, add it in */
- for (i=0; i < lh->len; i++)
- {
- /* Match if we are within 5% */
- if (lh->lh[i].style == style && lh->lh[i].height * 0.95f <= height && lh->lh[i].height * 1.05f >= height)
- {
- /* Ensure that the average height is correct */
- lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1);
- lh->lh[i].count++;
- return;
- }
- }
-
- /* Otherwise extend (if required) and add it */
- if (lh->cap == lh->len)
- {
- int newcap = (lh->cap ? lh->cap * 2 : 4);
- lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
- lh->cap = newcap;
- }
-
- lh->lh[lh->len].count = 1;
- lh->lh[lh->len].height = height;
- lh->lh[lh->len].style = style;
- lh->len++;
-}
-
-static void
-cull_line_heights(line_heights *lh)
-{
- int i, j, k;
-
-#ifdef DEBUG_LINE_HEIGHTS
- printf("Before culling:\n");
- for (i = 0; i < lh->len; i++)
- {
- fz_stext_style *style = lh->lh[i].style;
- printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
- }
-#endif
- for (i = 0; i < lh->len; i++)
- {
- fz_stext_style *style = lh->lh[i].style;
- int count = lh->lh[i].count;
- int max = i;
-
- /* Find the max for this style */
- for (j = i+1; j < lh->len; j++)
- {
- if (lh->lh[j].style == style && lh->lh[j].count > count)
- {
- max = j;
- count = lh->lh[j].count;
- }
- }
-
- /* Destroy all the ones other than the max */
- if (max != i)
- {
- lh->lh[i].count = count;
- lh->lh[i].height = lh->lh[max].height;
- lh->lh[max].count = 0;
- }
- j = i+1;
- for (k = j; k < lh->len; k++)
- {
- if (lh->lh[k].style != style)
- lh->lh[j++] = lh->lh[k];
- }
- lh->len = j;
- }
-#ifdef DEBUG_LINE_HEIGHTS
- printf("After culling:\n");
- for (i = 0; i < lh->len; i++)
- {
- fz_stext_style *style = lh->lh[i].style;
- printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
- }
-#endif
-}
-
-static float
-line_height_for_style(line_heights *lh, fz_stext_style *style)
-{
- int i;
-
- for (i=0; i < lh->len; i++)
- {
- if (lh->lh[i].style == style)
- return lh->lh[i].height;
- }
- return 0.0f; /* Never reached */
-}
-
-static void
-split_block(fz_context *ctx, fz_stext_page *page, int block_num, int linenum)
-{
- int split_len;
- fz_stext_block *block, *block2;
-
- if (page->len == page->cap)
- {
- int new_cap = fz_maxi(16, page->cap * 2);
- page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
- page->cap = new_cap;
- }
-
- memmove(page->blocks+block_num+1, page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks));
- page->len++;
-
- block2 = fz_malloc_struct(ctx, fz_stext_block);
- block = page->blocks[block_num].u.text;
-
- page->blocks[block_num+1].type = FZ_PAGE_BLOCK_TEXT;
- page->blocks[block_num+1].u.text = block2;
- split_len = block->len - linenum;
- block2->bbox = block->bbox; /* FIXME! */
- block2->cap = 0;
- block2->len = 0;
- block2->lines = NULL;
- block2->lines = fz_malloc_array(ctx, split_len, sizeof(fz_stext_line));
- block2->cap = block2->len;
- block2->len = split_len;
- block->len = linenum;
- memcpy(block2->lines, block->lines + linenum, split_len * sizeof(fz_stext_line));
- block2->lines[0].distance = 0;
-}
-
-static inline int
-is_unicode_wspace(int c)
-{
- return (c == 9 || /* TAB */
- c == 0x0a || /* HT */
- c == 0x0b || /* LF */
- c == 0x0c || /* VT */
- c == 0x0d || /* FF */
- c == 0x20 || /* CR */
- c == 0x85 || /* NEL */
- c == 0xA0 || /* No break space */
- c == 0x1680 || /* Ogham space mark */
- c == 0x180E || /* Mongolian Vowel Separator */
- c == 0x2000 || /* En quad */
- c == 0x2001 || /* Em quad */
- c == 0x2002 || /* En space */
- c == 0x2003 || /* Em space */
- c == 0x2004 || /* Three-per-Em space */
- c == 0x2005 || /* Four-per-Em space */
- c == 0x2006 || /* Five-per-Em space */
- c == 0x2007 || /* Figure space */
- c == 0x2008 || /* Punctuation space */
- c == 0x2009 || /* Thin space */
- c == 0x200A || /* Hair space */
- c == 0x2028 || /* Line separator */
- c == 0x2029 || /* Paragraph separator */
- c == 0x202F || /* Narrow no-break space */
- c == 0x205F || /* Medium mathematical space */
- c == 0x3000); /* Ideographic space */
-}
-
-static inline int
-is_unicode_bullet(int c)
-{
- /* The last 2 aren't strictly bullets, but will do for our usage here */
- return (c == 0x2022 || /* Bullet */
- c == 0x2023 || /* Triangular bullet */
- c == 0x25e6 || /* White bullet */
- c == 0x2043 || /* Hyphen bullet */
- c == 0x2219 || /* Bullet operator */
- c == 149 || /* Ascii bullet */
- c == '*');
-}
-
-#ifdef SPOT_LINE_NUMBERS
-static inline int
-is_number(int c)
-{
- return ((c >= '0' && c <= '9') ||
- (c == '.'));
-}
-
-static inline int
-is_latin_char(int c)
-{
- return ((c >= 'A' && c <= 'Z') ||
- (c >= 'a' && c <= 'z'));
-}
-
-static inline int
-is_roman(int c)
-{
- return (c == 'i' || c == 'I' ||
- c == 'v' || c == 'V' ||
- c == 'x' || c == 'X' ||
- c == 'l' || c == 'L' ||
- c == 'c' || c == 'C' ||
- c == 'm' || c == 'M');
-}
-#endif
-
-static int
-is_list_entry(fz_stext_line *line, fz_stext_span *span, int *char_num_ptr)
-{
- int char_num;
- fz_stext_char *chr;
-
- /* First, skip over any whitespace */
- for (char_num = 0; char_num < span->len; char_num++)
- {
- chr = &span->text[char_num];
- if (!is_unicode_wspace(chr->c))
- break;
- }
- *char_num_ptr = char_num;
-
- if (span != line->first_span || char_num >= span->len)
- return 0;
-
- /* Now we check for various special cases, which we consider to mean
- * that this is probably a list entry and therefore should always count
- * as a separate paragraph (and hence not be entered in the line height
- * table). */
- chr = &span->text[char_num];
-
- /* Is the first char on the line, a bullet point? */
- if (is_unicode_bullet(chr->c))
- return 1;
-
-#ifdef SPOT_LINE_NUMBERS
- /* Is the entire first span a number? Or does it start with a number
- * followed by ) or : ? Allowed to involve single latin chars too. */
- if (is_number(chr->c) || is_latin_char(chr->c))
- {
- int cn = char_num;
- int met_char = is_latin_char(chr->c);
- for (cn = char_num+1; cn < span->len; cn++)
- {
- fz_stext_char *chr2 = &span->text[cn];
-
- if (is_latin_char(chr2->c) && !met_char)
- {
- met_char = 1;
- continue;
- }
- met_char = 0;
- if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c))
- break;
- else if (chr2->c == ')' || chr2->c == ':')
- {
- cn = span->len;
- break;
- }
- }
- if (cn == span->len)
- return 1;
- }
-
- /* Is the entire first span a roman numeral? Or does it start with
- * a roman numeral followed by ) or : ? */
- if (is_roman(chr->c))
- {
- int cn = char_num;
- for (cn = char_num+1; cn < span->len; cn++)
- {
- fz_stext_char *chr2 = &span->text[cn];
-
- if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c))
- break;
- else if (chr2->c == ')' || chr2->c == ':')
- {
- cn = span->len;
- break;
- }
- }
- if (cn == span->len)
- return 1;
- }
-#endif
- return 0;
-}
-
-typedef struct region_masks_s region_masks;
-
-typedef struct region_mask_s region_mask;
-
-typedef struct region_s region;
-
-struct region_s
-{
- float start;
- float stop;
- float ave_start;
- float ave_stop;
- int align;
- float colw;
-};
-
-struct region_mask_s
-{
- fz_context *ctx;
- int freq;
- fz_point blv;
- int cap;
- int len;
- float size;
- region *mask;
-};
-
-struct region_masks_s
-{
- fz_context *ctx;
- int cap;
- int len;
- region_mask **mask;
-};
-
-static region_masks *
-new_region_masks(fz_context *ctx)
-{
- region_masks *rms = fz_malloc_struct(ctx, region_masks);
- rms->ctx = ctx;
- rms->cap = 0;
- rms->len = 0;
- rms->mask = NULL;
- return rms;
-}
-
-static void
-free_region_mask(region_mask *rm)
-{
- if (!rm)
- return;
- fz_free(rm->ctx, rm->mask);
- fz_free(rm->ctx, rm);
-}
-
-static void
-free_region_masks(region_masks *rms)
-{
- int i;
-
- if (!rms)
- return;
- for (i=0; i < rms->len; i++)
- {
- free_region_mask(rms->mask[i]);
- }
- fz_free(rms->ctx, rms->mask);
- fz_free(rms->ctx, rms);
-}
-
-static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score)
-{
- int i1, i2;
- int count = 0;
-
- *score = 0;
- if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
- return 0;
-
- for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
- {
- if (rm1->mask[i1].stop < rm2->mask[i2].start)
- {
- /* rm1's region is entirely before rm2's */
- *score += rm1->mask[i1].stop - rm1->mask[i1].start;
- i1++;
- }
- else if (rm1->mask[i1].start > rm2->mask[i2].stop)
- {
- /* rm2's region is entirely before rm1's */
- *score += rm2->mask[i2].stop - rm2->mask[i2].start;
- i2++;
- }
- else
- {
- float lscore, rscore;
- if (rm1->mask[i1].start < rm2->mask[i2].start)
- {
- if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start)
- return 0; /* Not compatible */
- lscore = rm2->mask[i2].start - rm1->mask[i1].start;
- }
- else
- {
- if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start)
- return 0; /* Not compatible */
- lscore = rm1->mask[i1].start - rm2->mask[i2].start;
- }
- if (rm1->mask[i1].stop > rm2->mask[i2].stop)
- {
- if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop)
- return 0; /* Not compatible */
- rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
- }
- else
- {
- if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop)
- return 0; /* Not compatible */
- rscore = rm2->mask[i2].stop - rm1->mask[i1].stop;
- }
- /* In order to allow a region to merge, either the
- * left, the right, or the centre must agree */
- if (lscore < 1)
- {
- if (rscore < 1)
- {
- rscore = 0;
- }
- lscore = 0;
- }
- else if (rscore < 1)
- {
- rscore = 0;
- }
- else
- {
- /* Neither Left or right agree. Does the centre? */
- float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop;
- float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop;
- if (fabsf(ave1-ave2) > 1)
- {
- /* Nothing agrees, so don't merge */
- return 0;
- }
- lscore = 0;
- rscore = 0;
- }
- *score += lscore + rscore;
- /* These two regions could be merged */
- i1++;
- i2++;
- }
- count++;
- }
- count += rm1->len-i1 + rm2->len-i2;
- return count;
-}
-
-static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score)
-{
- int i1, i2;
- int close = 1;
-
- *score = 0;
- if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
- return 0;
-
- for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
- {
- if (rm1->mask[i1].stop < rm2->mask[i2].start)
- {
- /* rm1's region is entirely before rm2's */
- *score += rm1->mask[i1].stop - rm1->mask[i1].start;
- i1++;
- }
- else if (rm1->mask[i1].start > rm2->mask[i2].stop)
- {
- /* Not compatible */
- return 0;
- }
- else
- {
- float lscore, rscore;
- if (rm1->mask[i1].start > rm2->mask[i2].start)
- {
- /* Not compatible */
- return 0;
- }
- if (rm1->mask[i1].stop < rm2->mask[i2].stop)
- {
- /* Not compatible */
- return 0;
- }
- lscore = rm2->mask[i2].start - rm1->mask[i1].start;
- rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
- if (lscore < 1)
- {
- if (rscore < 1)
- close++;
- close++;
- }
- else if (rscore < 1)
- close++;
- else if (fabsf(lscore - rscore) < 1)
- {
- lscore = fabsf(lscore-rscore);
- rscore = 0;
- close++;
- }
- *score += lscore + rscore;
- i1++;
- i2++;
- }
- }
- if (i1 < rm1->len)
- {
- /* Still more to go in rm1 */
- if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop)
- return 0;
- }
- else if (i2 < rm2->len)
- {
- /* Still more to go in rm2 */
- if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop)
- return 0;
- }
-
- return close;
-}
-
-static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen)
-{
- int o, i1, i2;
-
- /* First, ensure that rm1 is long enough */
- if (rm1->cap < newlen)
- {
- int newcap = rm1->cap ? rm1->cap : 2;
- do
- {
- newcap *= 2;
- }
- while (newcap < newlen);
- rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask));
- rm1->cap = newcap;
- }
-
- /* Now run backwards along rm1, filling it out with the merged regions */
- for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--)
- {
- /* So we read from i1 and i2 and store in o */
- if (i1 < 0)
- {
- /* Just copy i2 */
- rm1->mask[o] = rm2->mask[i2];
- i2--;
- }
- else if (i2 < 0)
- {
- /* Just copy i1 */
- rm1->mask[o] = rm1->mask[i1];
- i1--;
- }
- else if (rm1->mask[i1].stop < rm2->mask[i2].start)
- {
- /* rm1's region is entirely before rm2's - copy rm2's */
- rm1->mask[o] = rm2->mask[i2];
- i2--;
- }
- else if (rm2->mask[i2].stop < rm1->mask[i1].start)
- {
- /* rm2's region is entirely before rm1's - copy rm1's */
- rm1->mask[o] = rm1->mask[i1];
- i1--;
- }
- else
- {
- /* We must be merging */
- rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq);
- rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq);
- rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start);
- rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop);
- i1--;
- i2--;
- }
- }
- rm1->freq += rm2->freq;
- rm1->len = newlen;
-}
-
-static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_stext_line *line, region_mask *prev_match)
-{
- int i;
- float best_score = 9999999;
- float score;
- int best = -1;
- int best_count = 0;
-
- /* If the 'previous match' matches, use it regardless. */
- if (prev_match && region_mask_matches(prev_match, rm, &score))
- {
- return prev_match;
- }
-
- /* Run through and find the 'most compatible' region mask. We are
- * guaranteed that there will always be at least one compatible one!
- */
- for (i=0; i < rms->len; i++)
- {
- int count = region_mask_matches(rms->mask[i], rm, &score);
- if (count > best_count || (count == best_count && (score < best_score || best == -1)))
- {
- best = i;
- best_score = score;
- best_count = count;
- }
- }
- assert(best >= 0 && best < rms->len);
-
- /* So we have the matching mask. */
- return rms->mask[best];
-}
-
-#ifdef DEBUG_MASKS
-static void
-dump_region_mask(const region_mask *rm)
-{
- int j;
- for (j = 0; j < rm->len; j++)
- {
- printf("%g->%g ", rm->mask[j].start, rm->mask[j].stop);
- }
- printf("* %d\n", rm->freq);
-}
-
-static void
-dump_region_masks(const region_masks *rms)
-{
- int i;
-
- for (i = 0; i < rms->len; i++)
- {
- region_mask *rm = rms->mask[i];
- dump_region_mask(rm);
- }
-}
-#endif
-
-static void region_masks_add(region_masks *rms, region_mask *rm)
-{
- /* Add rm to rms */
- if (rms->len == rms->cap)
- {
- int newcap = (rms->cap ? rms->cap * 2 : 4);
- rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask));
- rms->cap = newcap;
- }
- rms->mask[rms->len] = rm;
- rms->len++;
-}
-
-static void region_masks_sort(region_masks *rms)
-{
- int i, j;
-
- /* First calculate sizes */
- for (i=0; i < rms->len; i++)
- {
- region_mask *rm = rms->mask[i];
- float size = 0;
- for (j=0; j < rm->len; j++)
- {
- size += rm->mask[j].stop - rm->mask[j].start;
- }
- rm->size = size;
- }
-
- /* Now, sort on size */
- /* FIXME: bubble sort - use heapsort for efficiency */
- for (i=0; i < rms->len-1; i++)
- {
- for (j=i+1; j < rms->len; j++)
- {
- if (rms->mask[i]->size < rms->mask[j]->size)
- {
- region_mask *tmp = rms->mask[i];
- rms->mask[i] = rms->mask[j];
- rms->mask[j] = tmp;
- }
- }
- }
-}
-
-static void region_masks_merge(region_masks *rms, region_mask *rm)
-{
- int i;
- float best_score = 9999999;
- float score;
- int best = -1;
- int best_count = 0;
-
-#ifdef DEBUG_MASKS
- printf("\nAdding:\n");
- dump_region_mask(rm);
- printf("To:\n");
- dump_region_masks(rms);
-#endif
- for (i=0; i < rms->len; i++)
- {
- int count = region_masks_mergeable(rms->mask[i], rm, &score);
- if (count && (score < best_score || best == -1))
- {
- best = i;
- best_count = count;
- best_score = score;
- }
- }
- if (best != -1)
- {
- region_mask_merge(rms->mask[best], rm, best_count);
-#ifdef DEBUG_MASKS
- printf("Merges to give:\n");
- dump_region_masks(rms);
-#endif
- free_region_mask(rm);
- return;
- }
- region_masks_add(rms, rm);
-#ifdef DEBUG_MASKS
- printf("Adding new one to give:\n");
- dump_region_masks(rms);
-#endif
-}
-
-static region_mask *
-new_region_mask(fz_context *ctx, const fz_point *blv)
-{
- region_mask *rm = fz_malloc_struct(ctx, region_mask);
- rm->ctx = ctx;
- rm->freq = 1;
- rm->blv = *blv;
- rm->cap = 0;
- rm->len = 0;
- rm->mask = NULL;
- return rm;
-}
-
-static void
-region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end)
-{
- /* We project min and max down onto the blv */
- float s = min->x * rm->blv.x + min->y * rm->blv.y;
- float e = max->x * rm->blv.x + max->y * rm->blv.y;
- if (s > e)
- {
- *start = e;
- *end = s;
- }
- else
- {
- *start = s;
- *end = e;
- }
-}
-
-static void
-region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max)
-{
- float start, end;
- int i, j;
-
- region_mask_project(rm, min, max, &start, &end);
-
- /* Now add start/end into our region list. Typically we will be adding
- * to the end of the region list, so search from there backwards. */
- for (i = rm->len; i > 0;)
- {
- if (start > rm->mask[i-1].stop)
- break;
- i--;
- }
- /* So we know that our interval can only affect list items >= i.
- * We know that start is after our previous end. */
- if (i == rm->len || end < rm->mask[i].start)
- {
- /* Insert new one. No overlap. No merging */
- if (rm->len == rm->cap)
- {
- int newcap = (rm->cap ? rm->cap * 2 : 4);
- rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask));
- rm->cap = newcap;
- }
- if (rm->len > i)
- memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask));
- rm->mask[i].ave_start = start;
- rm->mask[i].ave_stop = end;
- rm->mask[i].start = start;
- rm->mask[i].stop = end;
- rm->len++;
- }
- else
- {
- /* Extend current one down. */
- rm->mask[i].ave_start = start;
- rm->mask[i].start = start;
- if (rm->mask[i].stop < end)
- {
- rm->mask[i].stop = end;
- rm->mask[i].ave_stop = end;
- /* Our region may now extend upwards too far */
- i++;
- j = i;
- while (j < rm->len && rm->mask[j].start <= end)
- {
- rm->mask[i-1].stop = end = rm->mask[j].stop;
- j++;
- }
- if (i != j)
- {
- /* Move everything from j down to i */
- while (j < rm->len)
- {
- rm->mask[i++] = rm->mask[j++];
- }
- }
- rm->len -= j-i;
- }
- }
-}
-
-static int
-region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_)
-{
- float start, end, left, right;
- int i;
-
- region_mask_project(rm, min, max, &start, &end);
-
- for (i = 0; i < rm->len; i++)
- {
- /* The use of MY_EPSILON here is because we might be matching
- * start/end values calculated with slightly different blv's */
- if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end)
- break;
- }
- if (i >= rm->len)
- {
- *align = 0;
- *colw = 0;
- return 0;
- }
- left = start - rm->mask[i].start;
- right = rm->mask[i].stop - end;
- if (left < 1 && right < 1)
- *align = rm->mask[i].align;
- else if (left*2 <= right)
- *align = 0; /* Left */
- else if (right * 2 < left)
- *align = 2; /* Right */
- else
- *align = 1;
- *left_ = left;
- *colw = rm->mask[i].colw;
- return i;
-}
-
-static void
-region_mask_alignment(region_mask *rm)
-{
- int i;
- float width = 0;
-
- for (i = 0; i < rm->len; i++)
- {
- width += rm->mask[i].stop - rm->mask[i].start;
- }
- for (i = 0; i < rm->len; i++)
- {
- region *r = &rm->mask[i];
- float left = r->ave_start - r->start;
- float right = r->stop - r->ave_stop;
- if (left*2 <= right)
- r->align = 0; /* Left */
- else if (right * 2 < left)
- r->align = 2; /* Right */
- else
- r->align = 1;
- r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width;
- }
-}
-
-static void
-region_masks_alignment(region_masks *rms)
-{
- int i;
-
- for (i = 0; i < rms->len; i++)
- {
- region_mask_alignment(rms->mask[i]);
- }
-}
-
-static int
-is_unicode_hyphen(int c)
-{
- /* We omit 0x2011 (Non breaking hyphen) and 0x2043 (Hyphen Bullet)
- * from this list. */
- return (c == '-' ||
- c == 0x2010 || /* Hyphen */
- c == 0x002d || /* Hyphen-Minus */
- c == 0x00ad || /* Soft hyphen */
- c == 0x058a || /* Armenian Hyphen */
- c == 0x1400 || /* Canadian Syllabive Hyphen */
- c == 0x1806); /* Mongolian Todo soft hyphen */
-}
-
-static int
-is_unicode_hyphenatable(int c)
-{
- /* This is a pretty ad-hoc collection. It may need tuning. */
- return ((c >= 'A' && c <= 'Z') ||
- (c >= 'a' && c <= 'z') ||
- (c >= 0x00c0 && c <= 0x00d6) ||
- (c >= 0x00d8 && c <= 0x00f6) ||
- (c >= 0x00f8 && c <= 0x02af) ||
- (c >= 0x1d00 && c <= 0x1dbf) ||
- (c >= 0x1e00 && c <= 0x1eff) ||
- (c >= 0x2c60 && c <= 0x2c7f) ||
- (c >= 0xa722 && c <= 0xa78e) ||
- (c >= 0xa790 && c <= 0xa793) ||
- (c >= 0xa7a8 && c <= 0xa7af) ||
- (c >= 0xfb00 && c <= 0xfb07) ||
- (c >= 0xff21 && c <= 0xff3a) ||
- (c >= 0xff41 && c <= 0xff5a));
-}
-
-static void
-dehyphenate(fz_stext_span *s1, fz_stext_span *s2)
-{
- int i;
-
- for (i = s1->len-1; i > 0; i--)
- if (!is_unicode_wspace(s1->text[i].c))
- break;
- /* Can't leave an empty span. */
- if (i == 0)
- return;
-
- if (!is_unicode_hyphen(s1->text[i].c))
- return;
- if (!is_unicode_hyphenatable(s1->text[i-1].c))
- return;
- if (!is_unicode_hyphenatable(s2->text[0].c))
- return;
- s1->len = i;
- s2->spacing = 0;
-}
-
-#ifdef DEBUG_ALIGN
-static void
-dump_span(fz_stext_span *span)
-{
-}
-
-static void
-dump_line(fz_stext_line *line)
-{
- fz_stext_span *span;
-
- if (!line)
- return;
- printf("d=%g: ", line->distance);
-
- span = line->first_span;
- while (span)
- {
- dump_span(span);
- span = span->next;
- }
-
- printf("\n");
-}
-#endif
-
-void
-fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page)
-{
- fz_stext_line *line;
- fz_stext_span *span;
- line_heights *lh;
- region_masks *rms;
- int block_num;
-
- /* Simple paragraph analysis; look for the most common 'inter line'
- * spacing. This will be assumed to be our line spacing. Anything
- * more than 25% wider than this will be assumed to be a paragraph
- * space. */
-
- /* Step 1: Gather the line height information */
- lh = new_line_heights(ctx);
- for (block_num = 0; block_num < page->len; block_num++)
- {
- fz_stext_block *block;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
- continue;
- block = page->blocks[block_num].u.text;
-
- for (line = block->lines; line < block->lines + block->len; line++)
- {
- /* For every style in the line, add lineheight to the
- * record for that style. FIXME: This is a nasty n^2
- * algorithm at the moment. */
- fz_stext_style *style = NULL;
-
- if (line->distance == 0)
- continue;
-
- for (span = line->first_span; span; span = span->next)
- {
- int char_num;
-
- if (is_list_entry(line, span, &char_num))
- goto list_entry;
-
- for (; char_num < span->len; char_num++)
- {
- fz_stext_char *chr = &span->text[char_num];
-
- /* Ignore any whitespace chars */
- if (is_unicode_wspace(chr->c))
- continue;
-
- if (chr->style != style)
- {
- /* Have we had this style before? */
- int match = 0;
- fz_stext_span *span2;
- for (span2 = line->first_span; span2 != span; span2 = span2->next)
- {
- int char_num2;
- for (char_num2 = 0; char_num2 < span2->len; char_num2++)
- {
- fz_stext_char *chr2 = &span2->text[char_num2];
- if (chr2->style == chr->style)
- {
- match = 1;
- break;
- }
- }
- }
- if (char_num > 0 && match == 0)
- {
- fz_stext_span *span2 = span;
- int char_num2;
- for (char_num2 = 0; char_num2 < char_num; char_num2++)
- {
- fz_stext_char *chr2 = &span2->text[char_num2];
- if (chr2->style == chr->style)
- {
- match = 1;
- break;
- }
- }
- }
- if (match == 0)
- insert_line_height(lh, chr->style, line->distance);
- style = chr->style;
- }
- }
-list_entry:
- {}
- }
- }
- }
-
- /* Step 2: Find the most popular line height for each style */
- cull_line_heights(lh);
-
- /* Step 3: Run through the blocks, breaking each block into two if
- * the line height isn't right. */
- for (block_num = 0; block_num < page->len; block_num++)
- {
- int line_num;
- fz_stext_block *block;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
- continue;
- block = page->blocks[block_num].u.text;
-
- for (line_num = 0; line_num < block->len; line_num++)
- {
- /* For every style in the line, check to see if lineheight
- * is correct for that style. FIXME: We check each style
- * more than once, currently. */
- int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */
- fz_stext_style *style = NULL;
- line = &block->lines[line_num];
-
- if (line->distance == 0)
- continue;
-
-#ifdef DEBUG_LINE_HEIGHTS
- printf("line height=%g\n", line->distance);
-#endif
- for (span = line->first_span; span; span = span->next)
- {
- int char_num;
-
- if (is_list_entry(line, span, &char_num))
- goto force_paragraph;
-
- /* Now we do the rest of the line */
- for (; char_num < span->len; char_num++)
- {
- fz_stext_char *chr = &span->text[char_num];
-
- /* Ignore any whitespace chars */
- if (is_unicode_wspace(chr->c))
- continue;
-
- if (chr->style != style)
- {
- float proper_step = line_height_for_style(lh, chr->style);
- if (proper_step * 0.95f <= line->distance && line->distance <= proper_step * 1.05f)
- {
- ok = 1;
- break;
- }
- style = chr->style;
- }
- }
- if (ok)
- break;
- }
- if (!ok)
- {
-force_paragraph:
- split_block(ctx, page, block_num, line_num);
- break;
- }
- }
- }
- free_line_heights(lh);
-
- /* Simple line region analysis:
- * For each line:
- * form a list of 'start/stop' points (henceforth a 'region mask')
- * find the normalised baseline vector for the line.
- * Store the region mask and baseline vector.
- * Collate lines that have compatible region masks and identical
- * baseline vectors.
- * If the collated masks are column-like, then split into columns.
- * Otherwise split into tables.
- */
- rms = new_region_masks(ctx);
-
- /* Step 1: Form the region masks and store them into a list with the
- * normalised baseline vectors. */
- for (block_num = 0; block_num < page->len; block_num++)
- {
- fz_stext_block *block;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
- continue;
- block = page->blocks[block_num].u.text;
-
- for (line = block->lines; line < block->lines + block->len; line++)
- {
- fz_point blv;
- region_mask *rm;
-
-#ifdef DEBUG_MASKS
- printf("Line: ");
- dump_line(line);
-#endif
- blv = line->first_span->max;
- blv.x -= line->first_span->min.x;
- blv.y -= line->first_span->min.y;
- fz_normalize_vector(&blv);
-
- rm = new_region_mask(ctx, &blv);
- for (span = line->first_span; span; span = span->next)
- {
- fz_point *region_min = &span->min;
- fz_point *region_max = &span->max;
-
- /* Treat adjacent spans as one big region */
- while (span->next && span->next->spacing < 1.5f)
- {
- span = span->next;
- region_max = &span->max;
- }
-
- region_mask_add(rm, region_min, region_max);
- }
-#ifdef DEBUG_MASKS
- dump_region_mask(rm);
-#endif
- region_masks_add(rms, rm);
- }
- }
-
- /* Step 2: Sort the region_masks by size of masked region */
- region_masks_sort(rms);
-
-#ifdef DEBUG_MASKS
- printf("Sorted list of regions:\n");
- dump_region_masks(rms);
-#endif
- /* Step 3: Merge the region masks where possible (large ones first) */
- {
- int i;
- region_masks *rms2;
- rms2 = new_region_masks(ctx);
- for (i=0; i < rms->len; i++)
- {
- region_mask *rm = rms->mask[i];
- rms->mask[i] = NULL;
- region_masks_merge(rms2, rm);
- }
- free_region_masks(rms);
- rms = rms2;
- }
-
-#ifdef DEBUG_MASKS
- printf("Merged list of regions:\n");
- dump_region_masks(rms);
-#endif
-
- /* Step 4: Figure out alignment */
- region_masks_alignment(rms);
-
- /* Step 5: At this point, we should probably look at the region masks
- * to try to guess which ones represent columns on the page. With our
- * current code, we could only get blocks of lines that span 2 or more
- * columns if the PDF producer wrote text out horizontally across 2
- * or more columns, and we've never seen that (yet!). So we skip this
- * step for now. */
-
- /* Step 6: Run through the lines again, deciding which ones fit into
- * which region mask. */
- {
- region_mask *prev_match = NULL;
- for (block_num = 0; block_num < page->len; block_num++)
- {
- fz_stext_block *block;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
- continue;
- block = page->blocks[block_num].u.text;
-
- for (line = block->lines; line < block->lines + block->len; line++)
- {
- fz_point blv;
- region_mask *rm;
- region_mask *match;
-
- blv = line->first_span->max;
- blv.x -= line->first_span->min.x;
- blv.y -= line->first_span->min.y;
- fz_normalize_vector(&blv);
-
-#ifdef DEBUG_MASKS
- dump_line(line);
-#endif
- rm = new_region_mask(ctx, &blv);
- for (span = line->first_span; span; span = span->next)
- {
- fz_point *region_min = &span->min;
- fz_point *region_max = &span->max;
-
- /* Treat adjacent spans as one big region */
- while (span->next && span->next->spacing < 1.5f)
- {
- span = span->next;
- region_max = &span->max;
- }
-
- region_mask_add(rm, region_min, region_max);
- }
-#ifdef DEBUG_MASKS
- printf("Mask: ");
- dump_region_mask(rm);
-#endif
- match = region_masks_match(rms, rm, line, prev_match);
- prev_match = match;
-#ifdef DEBUG_MASKS
- printf("Matches: ");
- dump_region_mask(match);
-#endif
- free_region_mask(rm);
- span = line->first_span;
- while (span)
- {
- fz_point *region_min = &span->min;
- fz_point *region_max = &span->max;
- fz_stext_span *sn;
- int col, align;
- float colw, left;
-
- /* Treat adjacent spans as one big region */
-#ifdef DEBUG_ALIGN
- dump_span(span);
-#endif
- for (sn = span->next; sn && sn->spacing < 1.5f; sn = sn->next)
- {
- region_max = &sn->max;
-#ifdef DEBUG_ALIGN
- dump_span(sn);
-#endif
- }
- col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
-#ifdef DEBUG_ALIGN
- printf(" = col%d colw=%g align=%d\n", col, colw, align);
-#endif
- do
- {
- span->column = col;
- span->align = align;
- span->indent = left;
- span->column_width = colw;
- span = span->next;
- }
- while (span != sn);
-
- if (span)
- span = span->next;
- }
- line->region = match;
- }
- }
- free_region_masks(rms);
- }
-
- /* Step 7: Collate lines within a block that share the same region
- * mask. */
- for (block_num = 0; block_num < page->len; block_num++)
- {
- int line_num;
- int prev_line_num;
-
- fz_stext_block *block;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
- continue;
- block = page->blocks[block_num].u.text;
-
- /* First merge lines. This may leave empty lines behind. */
- for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
- {
- fz_stext_line *prev_line;
- line = &block->lines[line_num];
- if (!line->first_span)
- continue;
- prev_line = &block->lines[prev_line_num];
- if (prev_line->region == line->region)
- {
- /* We only merge lines if the second line
- * only uses 1 of the columns. */
- int col = line->first_span->column;
- /* Copy the left value for the first span
- * in the first column in this line forward
- * for all the rest of the spans in the same
- * column. */
- float indent = line->first_span->indent;
- for (span = line->first_span->next; span; span = span->next)
- {
- if (col != span->column)
- break;
- span->indent = indent;
- }
- if (span)
- {
- prev_line_num = line_num;
- continue;
- }
-
- /* Merge line into prev_line */
- {
- fz_stext_span **prev_line_span = &prev_line->first_span;
- int try_dehyphen = -1;
- fz_stext_span *prev_span = NULL;
- span = line->first_span;
- while (span && *prev_line_span)
- {
- /* Skip forwards through the original
- * line, until we find a place where
- * span should go. */
- if ((*prev_line_span)->column <= span->column)
- {
- /* The current span we are considering
- * in prev_line is earlier than span.
- * Just skip forwards in prev_line. */
- prev_span = (*prev_line_span);
- prev_line_span = &prev_span->next;
- try_dehyphen = span->column;
- }
- else
- {
- /* We want to copy span into prev_line. */
- fz_stext_span *next = (*prev_line_span)->next;
-
- if (prev_line_span == &prev_line->first_span)
- prev_line->first_span = span;
- if (next == NULL)
- prev_line->last_span = span;
- if (try_dehyphen == span->column)
- dehyphenate(prev_span, span);
- try_dehyphen = -1;
- prev_span = *prev_line_span = span;
- span = span->next;
- (*prev_line_span)->next = next;
- prev_line_span = &(*prev_line_span)->next;
- }
- }
- if (span)
- {
- *prev_line_span = span;
- prev_line->last_span = line->last_span;
- }
-
- line->first_span = NULL;
- line->last_span = NULL;
- }
- }
- else
- prev_line_num = line_num;
- }
-
- /* Now get rid of the empty lines */
- for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
- {
- line = &block->lines[line_num];
- if (line->first_span)
- block->lines[prev_line_num++] = *line;
- }
- block->len = prev_line_num;
-
- /* Now try to spot indents */
- for (line_num = 0; line_num < block->len; line_num++)
- {
- fz_stext_span *span_num, *sn;
- int col, count;
- line = &block->lines[line_num];
-
- /* Run through the spans... */
- span_num = line->first_span;
- {
- float indent = 0;
- /* For each set of spans that share the same
- * column... */
- col = span_num->column;
-#ifdef DEBUG_INDENTS
- printf("Indent %g: ", span_num->indent);
- dump_span(span_num);
- printf("\n");
-#endif
-
- /* find the average indent of all but the first.. */
- for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++)
- {
-#ifdef DEBUG_INDENTS
- printf("Indent %g: ", sn->indent);
- dump_span(sn);
- printf("\n");
-#endif
- indent += sn->indent;
- sn->indent = 0;
- }
- if (sn != span_num->next)
- indent /= count;
-
- /* And compare this indent with the first one... */
-#ifdef DEBUG_INDENTS
- printf("Average indent %g ", indent);
-#endif
- indent -= span_num->indent;
-#ifdef DEBUG_INDENTS
- printf("delta %g ", indent);
-#endif
- if (fabsf(indent) < 1)
- {
- /* No indent worth speaking of */
- indent = 0;
- }
-#ifdef DEBUG_INDENTS
- printf("recorded %g\n", indent);
-#endif
- span_num->indent = indent;
- span_num = sn;
- }
- for (; span_num; span_num = span_num->next)
- {
- span_num->indent = 0;
- }
- }
- }
-}
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c
index 00705208..6c30ea29 100644
--- a/source/fitz/stext-search.c
+++ b/source/fitz/stext-search.c
@@ -18,30 +18,28 @@ static inline int iswhite(int c)
fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx)
{
- int block_num;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
int ofs = 0;
- for (block_num = 0; block_num < page->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- fz_stext_block *block;
- fz_stext_line *line;
- fz_stext_span *span;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = page->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (idx < ofs + span->len)
+ if (ofs == idx)
{
- cab->c = span->text[idx - ofs].c;
- fz_stext_char_bbox(ctx, &cab->bbox, span, idx - ofs);
+ cab->c = ch->c;
+ fz_stext_char_bbox(ctx, &cab->bbox, line, ch);
return cab;
}
- ofs += span->len;
+ ++ofs;
}
+
/* pseudo-newline */
if (idx == ofs)
{
@@ -49,7 +47,7 @@ fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stex
cab->c = ' ';
return cab;
}
- ofs++;
+ ++ofs;
}
}
cab->bbox = fz_empty_rect;
@@ -73,27 +71,23 @@ static fz_rect *bboxat(fz_context *ctx, fz_stext_page *page, int idx, fz_rect *b
static int textlen_stext(fz_context *ctx, fz_stext_page *page)
{
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
int len = 0;
- int block_num;
- for (block_num = 0; block_num < page->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- fz_stext_block *block;
- fz_stext_line *line;
- fz_stext_span *span;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = page->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (span = line->first_span; span; span = span->next)
- {
- len += span->len;
- }
- len++; /* pseudo-newline */
+ for (ch = line->first_char; ch; ch = ch->next)
+ ++len;
+ ++len; /* pseudo-newline */
}
}
+
return len;
}
@@ -181,8 +175,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re
fz_rect linebox, charbox;
fz_stext_block *block;
fz_stext_line *line;
- fz_stext_span *span;
- int i, block_num, hit_count;
+ fz_stext_char *ch;
+ int hit_count;
float x0 = rect.x0;
float x1 = rect.x1;
@@ -191,31 +185,27 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re
hit_count = 0;
- for (block_num = 0; block_num < page->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = page->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
linebox = fz_empty_rect;
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- for (i = 0; i < span->len; i++)
+ fz_stext_char_bbox(ctx, &charbox, line, ch);
+ if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1)
{
- fz_stext_char_bbox(ctx, &charbox, span, i);
- if (charbox.x1 >= x0 && charbox.x0 <= x1 && charbox.y1 >= y0 && charbox.y0 <= y1)
+ if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5)
{
- if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5)
- {
- if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
- hit_bbox[hit_count++] = linebox;
- linebox = charbox;
- }
- else
- {
- fz_union_rect(&linebox, &charbox);
- }
+ if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
+ hit_bbox[hit_count++] = linebox;
+ linebox = charbox;
+ }
+ else
+ {
+ fz_union_rect(&linebox, &charbox);
}
}
}
@@ -232,8 +222,11 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect)
{
fz_buffer *buffer;
fz_rect hitbox;
- int c, i, block_num, seen = 0;
+ int c, seen = 0;
unsigned char *s;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
float x0 = rect.x0;
float x1 = rect.x1;
@@ -242,41 +235,33 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect)
buffer = fz_new_buffer(ctx, 1024);
- for (block_num = 0; block_num < page->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- fz_stext_block *block;
- fz_stext_line *line;
- fz_stext_span *span;
-
- if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = page->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (span = line->first_span; span; span = span->next)
+ if (seen)
{
- if (seen)
- {
- fz_append_byte(ctx, buffer, '\n');
- }
+ fz_append_byte(ctx, buffer, '\n');
+ }
- seen = 0;
+ seen = 0;
- for (i = 0; i < span->len; i++)
+ for (ch = line->first_char; ch; ch = ch->next)
+ {
+ fz_stext_char_bbox(ctx, &hitbox, line, ch);
+ c = ch->c;
+ if (c < 32)
+ c = 0xFFFD;
+ if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
{
- fz_stext_char_bbox(ctx, &hitbox, span, i);
- c = span->text[i].c;
- if (c < 32)
- c = 0xFFFD;
- if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
- {
- fz_append_rune(ctx, buffer, c);
- seen = 1;
- }
+ fz_append_rune(ctx, buffer, c);
+ seen = 1;
}
-
- seen = (seen && span == line->last_span);
}
+
+ seen = (seen && line == block->u.t.last_line);
}
}
diff --git a/source/fitz/util.c b/source/fitz/util.c
index 6f900174..d6a7f317 100644
--- a/source/fitz/util.c
+++ b/source/fitz/util.c
@@ -267,7 +267,7 @@ fz_new_pixmap_from_page_number(fz_context *ctx, fz_document *doc, int number, co
}
fz_stext_page *
-fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_stext_sheet *sheet, const fz_stext_options *options)
+fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options)
{
fz_stext_page *text;
fz_device *dev;
@@ -279,7 +279,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
text = fz_new_stext_page(ctx, fz_bound_display_list(ctx, list, &mediabox));
fz_try(ctx)
{
- dev = fz_new_stext_device(ctx, sheet, text, options);
+ dev = fz_new_stext_device(ctx, text, options);
fz_run_display_list(ctx, list, dev, &fz_identity, NULL, NULL);
fz_close_device(ctx, dev);
}
@@ -297,7 +297,7 @@ fz_new_stext_page_from_display_list(fz_context *ctx, fz_display_list *list, fz_s
}
fz_stext_page *
-fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *sheet, const fz_stext_options *options)
+fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options)
{
fz_stext_page *text;
fz_device *dev;
@@ -309,7 +309,7 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
text = fz_new_stext_page(ctx, fz_bound_page(ctx, page, &mediabox));
fz_try(ctx)
{
- dev = fz_new_stext_device(ctx, sheet, text, options);
+ dev = fz_new_stext_device(ctx, text, options);
fz_run_page(ctx, page, dev, &fz_identity, NULL);
fz_close_device(ctx, dev);
}
@@ -327,14 +327,14 @@ fz_new_stext_page_from_page(fz_context *ctx, fz_page *page, fz_stext_sheet *shee
}
fz_stext_page *
-fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, fz_stext_sheet *sheet, const fz_stext_options *options)
+fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options)
{
fz_page *page;
fz_stext_page *text;
page = fz_load_page(ctx, doc, number);
fz_try(ctx)
- text = fz_new_stext_page_from_page(ctx, page, sheet, options);
+ text = fz_new_stext_page_from_page(ctx, page, options);
fz_always(ctx)
fz_drop_page(ctx, page);
fz_catch(ctx)
@@ -345,24 +345,14 @@ fz_new_stext_page_from_page_number(fz_context *ctx, fz_document *doc, int number
int
fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needle, fz_rect *hit_bbox, int hit_max)
{
- fz_stext_sheet *sheet = NULL;
- fz_stext_page *text = NULL;
+ fz_stext_page *text;
int count;
- fz_var(sheet);
- fz_var(text);
-
+ text = fz_new_stext_page_from_display_list(ctx, list, NULL);
fz_try(ctx)
- {
- sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_display_list(ctx, list, sheet, NULL);
count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
- }
fz_always(ctx)
- {
fz_drop_stext_page(ctx, text);
- fz_drop_stext_sheet(ctx, sheet);
- }
fz_catch(ctx)
fz_rethrow(ctx);
return count;
@@ -371,24 +361,14 @@ fz_search_display_list(fz_context *ctx, fz_display_list *list, const char *needl
int
fz_search_page(fz_context *ctx, fz_page *page, const char *needle, fz_rect *hit_bbox, int hit_max)
{
- fz_stext_sheet *sheet = NULL;
- fz_stext_page *text = NULL;
+ fz_stext_page *text;
int count;
- fz_var(sheet);
- fz_var(text);
-
+ text = fz_new_stext_page_from_page(ctx, page, NULL);
fz_try(ctx)
- {
- sheet = fz_new_stext_sheet(ctx);
- text = fz_new_stext_page_from_page(ctx, page, sheet, NULL);
count = fz_search_stext_page(ctx, text, needle, hit_bbox, hit_max);
- }
fz_always(ctx)
- {
fz_drop_stext_page(ctx, text);
- fz_drop_stext_sheet(ctx, sheet);
- }
fz_catch(ctx)
fz_rethrow(ctx);
return count;
@@ -411,14 +391,15 @@ fz_search_page_number(fz_context *ctx, fz_document *doc, int number, const char
}
fz_buffer *
-fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rect *sel, int crlf)
+fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rect *sel, int crlf)
{
fz_buffer *buf;
fz_rect hitbox;
float x0, y0, x1, y1;
- int block_num;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
int need_newline;
- int i;
need_newline = 0;
@@ -438,45 +419,33 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
buf = fz_new_buffer(ctx, 256);
fz_try(ctx)
{
- for (block_num = 0; block_num < text->len; block_num++)
+ for (block = page->first_block; block; block = block->next)
{
- fz_stext_line *line;
- fz_stext_block *block;
- fz_stext_span *span;
-
- if (text->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
- block = text->blocks[block_num].u.text;
- for (line = block->lines; line < block->lines + block->len; line++)
+ for (line = block->u.t.first_line; line; line = line->next)
{
int saw_text = 0;
- for (span = line->first_span; span; span = span->next)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- if (span->spacing > 1)
- fz_append_byte(ctx, buf, ' ');
- for (i = 0; i < span->len; i++)
+ int c = ch->c;
+ fz_stext_char_bbox(ctx, &hitbox, line, ch);
+ if (c < 32)
+ c = 0xFFFD;
+ if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
{
- int c;
- fz_stext_char_bbox(ctx, &hitbox, span, i);
- c = span->text[i].c;
- if (c < 32)
- c = 0xFFFD;
- if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ saw_text = 1;
+ if (need_newline)
{
- saw_text = 1;
- if (need_newline)
- {
- if (crlf)
- fz_append_byte(ctx, buf, '\r');
- fz_append_byte(ctx, buf, '\n');
- need_newline = 0;
- }
- fz_append_rune(ctx, buf, c);
+ if (crlf)
+ fz_append_byte(ctx, buf, '\r');
+ fz_append_byte(ctx, buf, '\n');
+ need_newline = 0;
}
+ fz_append_rune(ctx, buf, c);
}
}
-
if (saw_text)
need_newline = 1;
}
@@ -494,42 +463,32 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *text, const fz_rec
fz_buffer *
fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options)
{
- fz_stext_sheet *sheet;
fz_stext_page *text;
fz_buffer *buf;
- sheet = fz_new_stext_sheet(ctx);
+ text = fz_new_stext_page_from_display_list(ctx, list, options);
fz_try(ctx)
- {
- text = fz_new_stext_page_from_display_list(ctx, list, sheet, options);
buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
- }
fz_always(ctx)
- fz_drop_stext_sheet(ctx, sheet);
+ fz_drop_stext_page(ctx, text);
fz_catch(ctx)
fz_rethrow(ctx);
- fz_drop_stext_page(ctx, text);
return buf;
}
fz_buffer *
fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options)
{
- fz_stext_sheet *sheet;
fz_stext_page *text;
fz_buffer *buf;
- sheet = fz_new_stext_sheet(ctx);
+ text = fz_new_stext_page_from_page(ctx, page, options);
fz_try(ctx)
- {
- text = fz_new_stext_page_from_page(ctx, page, sheet, options);
buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf);
- }
fz_always(ctx)
- fz_drop_stext_sheet(ctx, sheet);
+ fz_drop_stext_page(ctx, text);
fz_catch(ctx)
fz_rethrow(ctx);
- fz_drop_stext_page(ctx, text);
return buf;
}
diff --git a/source/tools/mudraw.c b/source/tools/mudraw.c
index de05ab65..e1303fb8 100644
--- a/source/tools/mudraw.c
+++ b/source/tools/mudraw.c
@@ -248,7 +248,6 @@ static int band_height = 0;
static int lowmemory = 0;
static int errored = 0;
-static fz_stext_sheet *sheet = NULL;
static fz_colorspace *colorspace;
static int spots = 0;
static int alpha;
@@ -391,9 +390,6 @@ file_level_headers(fz_context *ctx)
if (output_format == OUT_STEXT || output_format == OUT_TRACE)
fz_write_printf(ctx, out, "<?xml version=\"1.0\"?>\n");
- if (output_format == OUT_TEXT || output_format == OUT_HTML || output_format == OUT_XHTML || output_format == OUT_STEXT)
- sheet = fz_new_stext_sheet(ctx);
-
if (output_format == OUT_HTML)
fz_print_stext_header_as_html(ctx, out);
if (output_format == OUT_XHTML)
@@ -422,8 +418,6 @@ file_level_trailers(fz_context *ctx)
if (output_format == OUT_PS)
fz_write_ps_file_trailer(ctx, out, output_pagenum);
-
- fz_drop_stext_sheet(ctx, sheet);
}
static void drawband(fz_context *ctx, fz_page *page, fz_display_list *list, const fz_matrix *ctm, const fz_rect *tbounds, fz_cookie *cookie, int band_start, fz_pixmap *pix, fz_bitmap **bit)
@@ -534,7 +528,7 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
stext_options.flags = (output_format == OUT_HTML || output_format == OUT_XHTML) ? FZ_STEXT_PRESERVE_IMAGES : 0;
text = fz_new_stext_page(ctx, &mediabox);
- dev = fz_new_stext_device(ctx, sheet, text, &stext_options);
+ dev = fz_new_stext_device(ctx, text, &stext_options);
if (lowmemory)
fz_enable_device_hints(ctx, dev, FZ_NO_CACHE);
if (list)
@@ -550,12 +544,10 @@ static void dodrawpage(fz_context *ctx, fz_page *page, fz_display_list *list, in
}
else if (output_format == OUT_HTML)
{
- fz_analyze_text(ctx, sheet, text);
fz_print_stext_page_as_html(ctx, out, text);
}
else if (output_format == OUT_XHTML)
{
- fz_analyze_text(ctx, sheet, text);
fz_print_stext_page_as_xhtml(ctx, out, text);
}
else if (output_format == OUT_TEXT)
diff --git a/source/tools/murun.c b/source/tools/murun.c
index b7443286..7a713903 100644
--- a/source/tools/murun.c
+++ b/source/tools/murun.c
@@ -1827,19 +1827,13 @@ static void ffi_Page_toStructuredText(js_State *J)
fz_context *ctx = js_getcontext(J);
fz_page *page = ffi_topage(J, 0);
const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
- fz_stext_sheet *sheet = NULL;
fz_stext_options so;
fz_stext_page *text;
- fz_var(sheet);
-
fz_try(ctx) {
- sheet = fz_new_stext_sheet(ctx);
fz_parse_stext_options(ctx, &so, options);
- text = fz_new_stext_page_from_page(ctx, page, sheet, &so);
+ text = fz_new_stext_page_from_page(ctx, page, &so);
}
- fz_always(ctx)
- fz_drop_stext_sheet(ctx, sheet);
fz_catch(ctx)
rethrow(J);
@@ -2673,19 +2667,13 @@ static void ffi_DisplayList_toStructuredText(js_State *J)
fz_context *ctx = js_getcontext(J);
fz_display_list *list = js_touserdata(J, 0, "fz_display_list");
const char *options = js_iscoercible(J, 1) ? js_tostring(J, 1) : NULL;
- fz_stext_sheet *sheet = NULL;
fz_stext_options so;
fz_stext_page *text;
- fz_var(sheet);
-
fz_try(ctx) {
- sheet = fz_new_stext_sheet(ctx);
fz_parse_stext_options(ctx, &so, options);
- text = fz_new_stext_page_from_display_list(ctx, list, sheet, &so);
+ text = fz_new_stext_page_from_display_list(ctx, list, &so);
}
- fz_always(ctx)
- fz_drop_stext_sheet(ctx, sheet);
fz_catch(ctx)
rethrow(J);