#include "fitz-internal.h"
#define LINE_DIST 0.9f
#define SPACE_DIST 0.2f
#define SPACE_MAX_DIST 0.8f
#define PARAGRAPH_DIST 0.5f
#define MY_EPSILON 0.001f
#define SUBSCRIPT_OFFSET 0.2f
#define SUPERSCRIPT_OFFSET -0.2f
#undef DEBUG_SPANS
#undef DEBUG_INTERNALS
#undef DEBUG_LINE_HEIGHTS
#undef DEBUG_MASKS
#undef DEBUG_ALIGN
#undef DEBUG_INDENTS
#undef SPOT_LINE_NUMBERS
#include
#include FT_FREETYPE_H
#include FT_ADVANCES_H
typedef struct fz_text_device_s fz_text_device;
typedef struct span_soup_s span_soup;
struct fz_text_device_s
{
fz_text_sheet *sheet;
fz_text_page *page;
span_soup *spans;
fz_text_span *cur_span;
int lastchar;
};
static fz_rect *
add_point_to_rect(fz_rect *a, const fz_point *p)
{
if (p->x < a->x0)
a->x0 = p->x;
if (p->x > a->x1)
a->x1 = p->x;
if (p->y < a->y0)
a->y0 = p->y;
if (p->y > a->y1)
a->y1 = p->y;
return a;
}
fz_rect *
fz_text_char_bbox(fz_rect *bbox, fz_text_span *span, int i)
{
fz_point a, d;
const fz_point *max;
fz_text_char *ch;
if (!span || i >= span->len)
{
*bbox = fz_empty_rect;
}
ch = &span->text[i];
if (i == span->len-1)
max = &span->max;
else
max = &span->text[i+1].p;
a.x = 0;
a.y = span->ascender_max;
fz_transform_vector(&a, &span->transform);
d.x = 0;
d.y = span->descender_min;
fz_transform_vector(&d, &span->transform);
bbox->x0 = bbox->x1 = ch->p.x + a.x;
bbox->y0 = bbox->y1 = ch->p.y + a.y;
a.x += max->x;
a.y += max->y;
add_point_to_rect(bbox, &a);
a.x = ch->p.x + d.x;
a.y = ch->p.y + d.y;
add_point_to_rect(bbox, &a);
a.x = max->x + d.x;
a.y = max->y + d.y;
add_point_to_rect(bbox, &a);
return bbox;
}
static void
add_bbox_to_span(fz_text_span *span)
{
fz_point a, d;
fz_rect *bbox = &span->bbox;
if (!span)
return;
a.x = 0;
a.y = span->ascender_max;
fz_transform_vector(&a, &span->transform);
d.x = 0;
d.y = span->descender_min;
fz_transform_vector(&d, &span->transform);
bbox->x0 = bbox->x1 = span->min.x + a.x;
bbox->y0 = bbox->y1 = span->min.y + a.y;
a.x += span->max.x;
a.y += span->max.y;
add_point_to_rect(bbox, &a);
a.x = span->min.x + d.x;
a.y = span->min.y + d.y;
add_point_to_rect(bbox, &a);
a.x = span->max.x + d.x;
a.y = span->max.y + d.y;
add_point_to_rect(bbox, &a);
}
struct span_soup_s
{
fz_context *ctx;
int len, cap;
fz_text_span **spans;
};
static span_soup *
new_span_soup(fz_context *ctx)
{
span_soup *soup = fz_malloc_struct(ctx, span_soup);
soup->ctx = ctx;
soup->len = 0;
soup->cap = 0;
soup->spans = NULL;
return soup;
}
static void
free_span_soup(span_soup *soup)
{
int i;
if (soup == NULL)
return;
for (i = 0; i < soup->len; i++)
{
fz_free(soup->ctx, soup->spans[i]);
}
fz_free(soup->ctx, soup->spans);
fz_free(soup->ctx, soup);
}
static void
add_span_to_soup(span_soup *soup, fz_text_span *span)
{
if (span == NULL)
return;
if (soup->len == soup->cap)
{
int newcap = (soup->cap ? soup->cap * 2 : 16);
soup->spans = fz_resize_array(soup->ctx, soup->spans, newcap, sizeof(*soup->spans));
soup->cap = newcap;
}
add_bbox_to_span(span);
soup->spans[soup->len++] = span;
}
static fz_text_line *
push_span(fz_context *ctx, fz_text_device *tdev, fz_text_span *span, int new_line, float distance)
{
fz_text_line *line;
fz_text_block *block;
fz_text_page *page = tdev->page;
if (new_line)
{
float size = fz_matrix_expansion(&span->transform);
/* So, a new line. Part of the same block or not? */
if (distance == 0 || distance > size * 1.5 || distance < -size * PARAGRAPH_DIST || page->len == 0)
{
/* New block */
if (page->len == page->cap)
{
int newcap = (page->cap ? page->cap*2 : 4);
page->blocks = fz_resize_array(ctx, page->blocks, newcap, sizeof(*page->blocks));
page->cap = newcap;
}
page->blocks[page->len].cap = 0;
page->blocks[page->len].len = 0;
page->blocks[page->len].lines = 0;
page->blocks[page->len].bbox = fz_empty_rect;
page->len++;
distance = 0;
}
/* New line */
block = &page->blocks[page->len-1];
if (block->len == block->cap)
{
int newcap = (block->cap ? block->cap*2 : 4);
block->lines = fz_resize_array(ctx, block->lines, newcap, sizeof(*block->lines));
block->cap = newcap;
}
block->lines[block->len].cap = 0;
block->lines[block->len].len = 0;
block->lines[block->len].spans = NULL;
block->lines[block->len].distance = distance;
block->lines[block->len].bbox = fz_empty_rect;
block->len++;
}
/* Find last line and append to it */
block = &page->blocks[page->len-1];
line = &block->lines[block->len-1];
if (line->len == line->cap)
{
int newcap = (line->cap ? line->cap*2 : 4);
line->spans = fz_resize_array(ctx, line->spans, newcap, sizeof(*line->spans));
line->cap = newcap;
}
fz_union_rect(&block->lines[block->len-1].bbox, &span->bbox);
fz_union_rect(&block->bbox, &span->bbox);
span->base_offset = (new_line ? 0 : distance);
line->spans[line->len++] = span;
return line;
}
#if defined(DEBUG_SPANS) || defined(DEBUG_ALIGN) || defined(DEBUG_INDENTS)
static void
dump_span(fz_text_span *s)
{
int i;
for (i=0; i < s->len; i++)
{
printf("%c", s->text[i].c);
}
}
#endif
#ifdef DEBUG_ALIGN
static void
dump_line(fz_text_line *line)
{
int i;
for (i=0; i < line->len; i++)
{
fz_text_span *s = line->spans[i];
if (s->spacing > 1)
printf(" ");
dump_span(s);
}
printf("\n");
}
#endif
static inline void
normalise(fz_point *p)
{
float len = p->x * p->x + p->y * p->y;
if (len != 0)
{
len = sqrtf(len);
p->x /= len;
p->y /= len;
}
}
static void
strain_soup(fz_context *ctx, fz_text_device *tdev)
{
span_soup *soup = tdev->spans;
fz_text_line *last_line = NULL;
fz_text_span *last_span = NULL;
int span_num;
/* Really dumb implementation to match what we had before */
for (span_num=0; span_num < soup->len; span_num++)
{
fz_text_span *span = soup->spans[span_num];
int new_line = 1;
float distance = 0;
float spacing = 0;
soup->spans[span_num] = NULL;
if (last_span)
{
/* If we have a last_span, we must have a last_line */
/* Do span and last_line share the same baseline? */
fz_point p, q, perp_r;
float dot;
float size = fz_matrix_expansion(&span->transform);
#ifdef DEBUG_SPANS
{
printf("Comparing: \"");
dump_span(last_span);
printf("\" and \"");
dump_span(span);
printf("\"\n");
}
#endif
p.x = last_line->spans[0]->max.x - last_line->spans[0]->min.x;
p.y = last_line->spans[0]->max.y - last_line->spans[0]->min.y;
normalise(&p);
q.x = span->max.x - span->min.x;
q.y = span->max.y - span->min.y;
normalise(&q);
#ifdef DEBUG_SPANS
printf("last_span=%g %g -> %g %g = %g %g\n", last_span->min.x, last_span->min.y, last_span->max.x, last_span->max.y, p.x, p.y);
printf("span =%g %g -> %g %g = %g %g\n", span->min.x, span->min.y, span->max.x, span->max.y, q.x, q.y);
#endif
perp_r.y = last_line->spans[0]->min.x - span->min.x;
perp_r.x = -(last_line->spans[0]->min.y - span->min.y);
/* Check if p and q are parallel. If so, then this
* line is parallel with the last one. */
dot = p.x * q.x + p.y * q.y;
if (fabsf(dot) > 0.9995)
{
/* If we take the dot product of normalised(p) and
* perp(r), we get the perpendicular distance from
* one line to the next (assuming they are parallel). */
distance = p.x * perp_r.x + p.y * perp_r.y;
/* We allow 'small' distances of baseline changes
* to cope with super/subscript. FIXME: We should
* gather subscript/superscript information here. */
new_line = (fabsf(distance) > size * LINE_DIST);
}
else
{
new_line = 1;
distance = 0;
}
if (!new_line)
{
fz_point delta;
delta.x = span->min.x - last_span->max.x;
delta.y = span->min.y - last_span->max.y;
spacing = (p.x * delta.x + p.y * delta.y);
spacing = fabsf(spacing);
/* Only allow changes in baseline (subscript/superscript etc)
* when the spacing is small. */
if (spacing * fabsf(distance) > size * LINE_DIST && fabsf(distance) > size * 0.1f)
{
new_line = 1;
distance = 0;
spacing = 0;
}
else
{
spacing /= size * SPACE_DIST;
/* Apply the same logic here as when we're adding chars to build spans. */
if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
spacing = 1;
}
}
#ifdef DEBUG_SPANS
printf("dot=%g new_line=%d distance=%g size=%g spacing=%g\n", dot, new_line, distance, size, spacing);
#endif
}
span->spacing = spacing;
last_line = push_span(ctx, tdev, span, new_line, distance);
last_span = span;
}
}
fz_text_sheet *
fz_new_text_sheet(fz_context *ctx)
{
fz_text_sheet *sheet = fz_malloc(ctx, sizeof *sheet);
sheet->maxid = 0;
sheet->style = NULL;
return sheet;
}
void
fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet)
{
fz_text_style *style = sheet->style;
while (style)
{
fz_text_style *next = style->next;
fz_drop_font(ctx, style->font);
fz_free(ctx, style);
style = next;
}
fz_free(ctx, sheet);
}
static fz_text_style *
fz_lookup_text_style_imp(fz_context *ctx, fz_text_sheet *sheet,
float size, fz_font *font, int wmode, int script)
{
fz_text_style *style;
for (style = sheet->style; style; style = style->next)
{
if (style->font == font &&
style->size == size &&
style->wmode == wmode &&
style->script == script) /* FIXME: others */
{
return style;
}
}
/* Better make a new one and add it to our list */
style = fz_malloc(ctx, sizeof *style);
style->id = sheet->maxid++;
style->font = fz_keep_font(ctx, font);
style->size = size;
style->wmode = wmode;
style->script = script;
style->next = sheet->style;
sheet->style = style;
return style;
}
static fz_text_style *
fz_lookup_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, const fz_matrix *ctm,
fz_colorspace *colorspace, float *color, float alpha, fz_stroke_state *stroke)
{
float size = 1.0f;
fz_font *font = text ? text->font : NULL;
int wmode = text ? text->wmode : 0;
if (ctm && text)
{
fz_matrix tm = text->trm;
fz_matrix trm;
tm.e = 0;
tm.f = 0;
fz_concat(&trm, &tm, ctm);
size = fz_matrix_expansion(&trm);
}
return fz_lookup_text_style_imp(ctx, sheet, size, font, wmode, 0);
}
fz_text_page *
fz_new_text_page(fz_context *ctx, const fz_rect *mediabox)
{
fz_text_page *page = fz_malloc(ctx, sizeof(*page));
page->mediabox = *mediabox;
page->len = 0;
page->cap = 0;
page->blocks = NULL;
return page;
}
static void
fz_free_text_line_contents(fz_context *ctx, fz_text_line *line)
{
int span_num;
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
fz_free(ctx, span->text);
fz_free(ctx, span);
}
fz_free(ctx, line->spans);
}
void
fz_free_text_page(fz_context *ctx, fz_text_page *page)
{
fz_text_block *block;
fz_text_line *line;
for (block = page->blocks; block < page->blocks + page->len; block++)
{
for (line = block->lines; line < block->lines + block->len; line++)
fz_free_text_line_contents(ctx, line);
fz_free(ctx, block->lines);
}
fz_free(ctx, page->blocks);
fz_free(ctx, page);
}
static fz_text_span *
fz_new_text_span(fz_context *ctx, const fz_point *p, int wmode, const fz_matrix *trm)
{
fz_text_span *span = fz_malloc_struct(ctx, fz_text_span);
span->ascender_max = 0;
span->descender_min = 0;
span->cap = 0;
span->len = 0;
span->min = *p;
span->max = *p;
span->wmode = wmode;
span->transform.a = trm->a;
span->transform.b = trm->b;
span->transform.c = trm->c;
span->transform.d = trm->d;
span->transform.e = 0;
span->transform.f = 0;
span->text = NULL;
return span;
}
static void
add_char_to_span(fz_context *ctx, fz_text_span *span, int c, fz_point *p, fz_point *max, fz_text_style *style)
{
if (span->len == span->cap)
{
int newcap = (span->cap ? span->cap * 2 : 16);
span->text = fz_resize_array(ctx, span->text, newcap, sizeof(fz_text_char));
span->cap = newcap;
span->bbox = fz_empty_rect;
}
span->max = *max;
if (style->ascender > span->ascender_max)
span->ascender_max = style->ascender;
if (style->descender < span->descender_min)
span->descender_min = style->descender;
span->text[span->len].c = c;
span->text[span->len].p = *p;
span->text[span->len].style = style;
span->len++;
}
static void
fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_matrix *trm, float adv, int wmode)
{
int can_append = 1;
int add_space = 0;
fz_point dir, ndir, p, q;
float size;
fz_point delta;
float spacing = 0;
float base_offset = 0;
if (wmode == 0)
{
dir.x = 1;
dir.y = 0;
}
else
{
dir.x = 0;
dir.y = 1;
}
fz_transform_vector(&dir, trm);
ndir = dir;
normalise(&ndir);
/* dir = direction vector for motion. ndir = normalised(dir) */
size = fz_matrix_expansion(trm);
if (dev->cur_span == NULL ||
trm->a != dev->cur_span->transform.a || trm->b != dev->cur_span->transform.b ||
trm->c != dev->cur_span->transform.c || trm->d != dev->cur_span->transform.d)
{
/* If the matrix has changed (or if we don't have a span at
* all), then we can't append. */
#ifdef DEBUG_SPANS
printf("Transform changed\n");
#endif
can_append = 0;
}
else
{
/* Calculate how far we've moved since the end of the current
* span. */
delta.x = trm->e - dev->cur_span->max.x;
delta.y = trm->f - dev->cur_span->max.y;
/* The transform has not changed, so we know we're in the same
* direction. Calculate 2 distances; how far off the previous
* baseline we are, together with how far along the baseline
* we are from the expected position. */
spacing = ndir.x * delta.x + ndir.y * delta.y;
base_offset = -ndir.y * delta.x + ndir.x * delta.y;
spacing /= size * SPACE_DIST;
spacing = fabsf(spacing);
if (fabsf(base_offset) < size * 0.1)
{
/* Only a small amount off the baseline - we'll take this */
if (spacing < 1.0)
{
/* Motion is in line, and small. */
}
else if (spacing >= 1 && spacing < (SPACE_MAX_DIST/SPACE_DIST))
{
/* Motion is in line, but large enough
* to warrant us adding a space */
if (dev->lastchar != ' ' && wmode == 0)
add_space = 1;
}
else
{
/* Motion is in line, but too large - split to a new span */
can_append = 0;
}
}
else
{
can_append = 0;
spacing = 0;
}
}
#ifdef DEBUG_SPANS
printf("%c%c append=%d space=%d size=%g spacing=%g base_offset=%g\n", dev->lastchar, c, can_append, add_space, size, spacing, base_offset);
#endif
p.x = trm->e;
p.y = trm->f;
if (can_append == 0)
{
/* Start a new span */
add_span_to_soup(dev->spans, dev->cur_span);
dev->cur_span = NULL;
dev->cur_span = fz_new_text_span(ctx, &p, wmode, trm);
dev->cur_span->spacing = 0;
}
if (add_space)
{
q.x = - 0.2f;
q.y = 0;
fz_transform_point(&q, trm);
add_char_to_span(ctx, dev->cur_span, ' ', &p, &q, style);
}
/* Advance the matrix */
q.x = trm->e += adv * dir.x;
q.y = trm->f += adv * dir.y;
add_char_to_span(ctx, dev->cur_span, c, &p, &q, style);
}
static void
fz_add_text_char(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_matrix *trm, float adv, int wmode)
{
switch (c)
{
case -1: /* ignore when one unicode character maps to multiple glyphs */
break;
case 0xFB00: /* ff */
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/2, wmode);
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/2, wmode);
break;
case 0xFB01: /* fi */
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/2, wmode);
fz_add_text_char_imp(ctx, dev, style, 'i', trm, adv/2, wmode);
break;
case 0xFB02: /* fl */
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/2, wmode);
fz_add_text_char_imp(ctx, dev, style, 'l', trm, adv/2, wmode);
break;
case 0xFB03: /* ffi */
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/3, wmode);
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/3, wmode);
fz_add_text_char_imp(ctx, dev, style, 'i', trm, adv/3, wmode);
break;
case 0xFB04: /* ffl */
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/3, wmode);
fz_add_text_char_imp(ctx, dev, style, 'f', trm, adv/3, wmode);
fz_add_text_char_imp(ctx, dev, style, 'l', trm, adv/3, wmode);
break;
case 0xFB05: /* long st */
case 0xFB06: /* st */
fz_add_text_char_imp(ctx, dev, style, 's', trm, adv/2, wmode);
fz_add_text_char_imp(ctx, dev, style, 't', trm, adv/2, wmode);
break;
default:
fz_add_text_char_imp(ctx, dev, style, c, trm, adv, wmode);
break;
}
}
static void
fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, const fz_matrix *ctm, fz_text_style *style)
{
fz_font *font = text->font;
FT_Face face = font->ft_face;
fz_matrix tm = text->trm;
fz_matrix trm;
float adv;
float ascender = 1;
float descender = 0;
int multi;
int i, j, err;
if (text->len == 0)
return;
if (font->ft_face)
{
fz_lock(ctx, FZ_LOCK_FREETYPE);
err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
if (err)
fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
ascender = (float)face->ascender / face->units_per_EM;
descender = (float)face->descender / face->units_per_EM;
fz_unlock(ctx, FZ_LOCK_FREETYPE);
}
else if (font->t3procs && !fz_is_empty_rect(&font->bbox))
{
ascender = font->bbox.y1;
descender = font->bbox.y0;
}
style->ascender = ascender;
style->descender = descender;
tm.e = 0;
tm.f = 0;
fz_concat(&trm, &tm, ctm);
for (i = 0; i < text->len; i++)
{
/* Calculate new pen location and delta */
tm.e = text->items[i].x;
tm.f = text->items[i].y;
fz_concat(&trm, &tm, ctm);
/* Calculate bounding box and new pen position based on font metrics */
if (font->ft_face)
{
FT_Fixed ftadv = 0;
int mask = FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING | FT_LOAD_IGNORE_TRANSFORM;
/* TODO: freetype returns broken vertical metrics */
/* if (text->wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; */
fz_lock(ctx, FZ_LOCK_FREETYPE);
err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
if (err)
fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
FT_Get_Advance(font->ft_face, text->items[i].gid, mask, &ftadv);
adv = ftadv / 65536.0f;
fz_unlock(ctx, FZ_LOCK_FREETYPE);
}
else
{
adv = font->t3widths[text->items[i].gid];
}
/* Check for one glyph to many char mapping */
for (j = i + 1; j < text->len; j++)
if (text->items[j].gid >= 0)
break;
multi = j - i;
if (multi == 1)
{
fz_add_text_char(ctx, dev, style, text->items[i].ucs, &trm, adv, text->wmode);
}
else
{
for (j = 0; j < multi; j++)
{
fz_add_text_char(ctx, dev, style, text->items[i + j].ucs, &trm, adv/multi, text->wmode);
}
i += j - 1;
}
dev->lastchar = text->items[i].ucs;
}
}
static void
fz_text_fill_text(fz_device *dev, fz_text *text, const fz_matrix *ctm,
fz_colorspace *colorspace, float *color, float alpha)
{
fz_text_device *tdev = dev->user;
fz_text_style *style;
style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, colorspace, color, alpha, NULL);
fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, const fz_matrix *ctm,
fz_colorspace *colorspace, float *color, float alpha)
{
fz_text_device *tdev = dev->user;
fz_text_style *style;
style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, colorspace, color, alpha, stroke);
fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_clip_text(fz_device *dev, fz_text *text, const fz_matrix *ctm, int accumulate)
{
fz_text_device *tdev = dev->user;
fz_text_style *style;
style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, NULL);
fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_clip_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, const fz_matrix *ctm)
{
fz_text_device *tdev = dev->user;
fz_text_style *style;
style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, stroke);
fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_ignore_text(fz_device *dev, fz_text *text, const fz_matrix *ctm)
{
fz_text_device *tdev = dev->user;
fz_text_style *style;
style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, NULL);
fz_text_extract(dev->ctx, tdev, text, ctm, style);
}
static void
fz_text_free_user(fz_device *dev)
{
fz_context *ctx = dev->ctx;
fz_text_device *tdev = dev->user;
add_span_to_soup(tdev->spans, tdev->cur_span);
tdev->cur_span = NULL;
strain_soup(ctx, tdev);
free_span_soup(tdev->spans);
/* TODO: smart sorting of blocks in reading order */
/* TODO: unicode NFC normalization */
/* TODO: bidi logical reordering */
fz_free(dev->ctx, tdev);
}
fz_device *
fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
{
fz_device *dev;
fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device);
tdev->sheet = sheet;
tdev->page = page;
tdev->spans = new_span_soup(ctx);
tdev->cur_span = NULL;
tdev->lastchar = ' ';
dev = fz_new_device(ctx, tdev);
dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE;
dev->free_user = fz_text_free_user;
dev->fill_text = fz_text_fill_text;
dev->stroke_text = fz_text_stroke_text;
dev->clip_text = fz_text_clip_text;
dev->clip_stroke_text = fz_text_clip_stroke_text;
dev->ignore_text = fz_text_ignore_text;
return dev;
}
/* XML, HTML and plain-text output */
static int font_is_bold(fz_font *font)
{
FT_Face face = font->ft_face;
if (face && (face->style_flags & FT_STYLE_FLAG_BOLD))
return 1;
if (strstr(font->name, "Bold"))
return 1;
return 0;
}
static int font_is_italic(fz_font *font)
{
FT_Face face = font->ft_face;
if (face && (face->style_flags & FT_STYLE_FLAG_ITALIC))
return 1;
if (strstr(font->name, "Italic") || strstr(font->name, "Oblique"))
return 1;
return 0;
}
static void
fz_print_style_begin(fz_output *out, fz_text_style *style)
{
int script = style->script;
fz_printf(out, "", style->id);
while (script-- > 0)
fz_printf(out, "");
while (++script < 0)
fz_printf(out, "");
}
static void
fz_print_style_end(fz_output *out, fz_text_style *style)
{
int script = style->script;
while (script-- > 0)
fz_printf(out, "");
while (++script < 0)
fz_printf(out, "");
fz_printf(out, "");
}
static void
fz_print_style(fz_output *out, fz_text_style *style)
{
char *s = strchr(style->font->name, '+');
s = s ? s + 1 : style->font->name;
fz_printf(out, "span.s%d{font-family:\"%s\";font-size:%gpt;",
style->id, s, style->size);
if (font_is_italic(style->font))
fz_printf(out, "font-style:italic;");
if (font_is_bold(style->font))
fz_printf(out, "font-weight:bold;");
fz_printf(out, "}\n");
}
void
fz_print_text_sheet(fz_context *ctx, fz_output *out, fz_text_sheet *sheet)
{
fz_text_style *style;
for (style = sheet->style; style; style = style->next)
fz_print_style(out, style);
}
void
fz_print_text_page_html(fz_context *ctx, fz_output *out, fz_text_page *page)
{
int block_n, line_n, span_n, ch_n;
fz_text_style *style = NULL;
fz_text_block *block;
fz_text_line *line;
void *last_region = NULL;
fz_printf(out, "\n");
for (block_n = 0; block_n < page->len; block_n++)
{
block = &page->blocks[block_n];
fz_printf(out, "
\n");
for (line_n = 0; line_n < block->len; line_n++)
{
int lastcol=-1;
line = &block->lines[line_n];
style = NULL;
if (line->region != last_region)
{
if (last_region)
fz_printf(out, "
");
fz_printf(out, "
");
last_region = line->region;
}
fz_printf(out, "
region)
fz_printf(out, " region=\"%x\"", line->region);
#endif
fz_printf(out, ">");
for (span_n = 0; span_n < line->len; span_n++)
{
fz_text_span *span = line->spans[span_n];
float size = fz_matrix_expansion(&span->transform);
float base_offset = span->base_offset / size;
if (lastcol != span->column)
{
if (lastcol >= 0)
{
fz_printf(out, "
");
}
/* If we skipped any columns then output some spacer spans */
while (lastcol < span->column-1)
{
fz_printf(out, "
");
lastcol++;
}
lastcol++;
/* Now output the span to contain this entire column */
fz_printf(out, "
len; sn++)
{
if (line->spans[sn]->column != lastcol)
break;
}
fz_printf(out, "width:%g%%;align:%s", span->column_width, (span->align == 0 ? "left" : (span->align == 1 ? "center" : "right")));
}
if (span->indent > 1)
fz_printf(out, ";padding-left:1em;text-indent:-1em");
if (span->indent < -1)
fz_printf(out, ";text-indent:1em");
fz_printf(out, "\">");
}
#ifdef DEBUG_INTERNALS
fz_printf(out, "column)
fz_printf(out, " col=\"%x\"", span->column);
fz_printf(out, ">");
#endif
if (span->spacing >= 1)
fz_printf(out, " ");
if (base_offset > SUBSCRIPT_OFFSET)
fz_printf(out, "");
else if (base_offset < SUPERSCRIPT_OFFSET)
fz_printf(out, "");
for (ch_n = 0; ch_n < span->len; ch_n++)
{
fz_text_char *ch = &span->text[ch_n];
if (style != ch->style)
{
if (style)
fz_print_style_end(out, style);
fz_print_style_begin(out, ch->style);
style = ch->style;
}
if (ch->c == '<')
fz_printf(out, "<");
else if (ch->c == '>')
fz_printf(out, ">");
else if (ch->c == '&')
fz_printf(out, "&");
else if (ch->c >= 32 && ch->c <= 127)
fz_printf(out, "%c", ch->c);
else
fz_printf(out, "%x;", ch->c);
}
if (style)
{
fz_print_style_end(out, style);
style = NULL;
}
if (base_offset > SUBSCRIPT_OFFSET)
fz_printf(out, "");
else if (base_offset < SUPERSCRIPT_OFFSET)
fz_printf(out, "");
#ifdef DEBUG_INTERNALS
fz_printf(out, "");
#endif
}
/* Close our floating span */
fz_printf(out, "
");
#ifdef DEBUG_INTERNALS
#endif
/* Close the line */
fz_printf(out, "
");
fz_printf(out, "\n");
}
/* Close the metaline */
fz_printf(out, "
");
last_region = NULL;
fz_printf(out, "
\n");
}
fz_printf(out, "\n");
}
void
fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page)
{
fz_text_block *block;
fz_text_line *line;
char *s;
fz_printf(out, "\n");
for (block = page->blocks; block < page->blocks + page->len; block++)
{
fz_printf(out, "\n",
block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
for (line = block->lines; line < block->lines + block->len; line++)
{
int span_num;
fz_printf(out, "\n",
line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
fz_text_style *style = NULL;
int char_num;
for (char_num = 0; char_num < span->len; char_num++)
{
fz_text_char *ch = &span->text[char_num];
if (ch->style != style)
{
if (style)
{
fz_printf(out, "\n");
}
style = ch->style;
s = strchr(style->font->name, '+');
s = s ? s + 1 : style->font->name;
fz_printf(out, "\n",
span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
s, style->size);
}
{
fz_rect rect;
fz_text_char_bbox(&rect, span, char_num);
fz_printf(out, "p.x, ch->p.y);
}
switch (ch->c)
{
case '<': fz_printf(out, "<"); break;
case '>': fz_printf(out, ">"); break;
case '&': fz_printf(out, "&"); break;
case '"': fz_printf(out, """); break;
case '\'': fz_printf(out, "'"); break;
default:
if (ch->c >= 32 && ch->c <= 127)
fz_printf(out, "%c", ch->c);
else
fz_printf(out, "%x;", ch->c);
break;
}
fz_printf(out, "\"/>\n");
}
if (style)
fz_printf(out, "\n");
}
fz_printf(out, "\n");
}
fz_printf(out, "\n");
}
fz_printf(out, "\n");
}
void
fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page)
{
fz_text_block *block;
fz_text_line *line;
fz_text_char *ch;
char utf[10];
int i, n;
for (block = page->blocks; block < page->blocks + page->len; block++)
{
for (line = block->lines; line < block->lines + block->len; line++)
{
int span_num;
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
for (ch = span->text; ch < span->text + span->len; ch++)
{
n = fz_runetochar(utf, ch->c);
for (i = 0; i < n; i++)
fz_printf(out, "%c", utf[i]);
}
}
fz_printf(out, "\n");
}
fz_printf(out, "\n");
}
}
typedef struct line_height_s
{
float height;
int count;
fz_text_style *style;
} line_height;
typedef struct line_heights_s
{
fz_context *ctx;
int cap;
int len;
line_height *lh;
} line_heights;
static line_heights *
new_line_heights(fz_context *ctx)
{
line_heights *lh = fz_malloc_struct(ctx, line_heights);
lh->ctx = ctx;
return lh;
}
static void
free_line_heights(line_heights *lh)
{
if (!lh)
return;
fz_free(lh->ctx, lh->lh);
fz_free(lh->ctx, lh);
}
static void
insert_line_height(line_heights *lh, fz_text_style *style, float height)
{
int i;
#ifdef DEBUG_LINE_HEIGHTS
printf("style=%x height=%g\n", style, height);
#endif
/* If we have one already, add it in */
for (i=0; i < lh->cap; i++)
{
/* Match if we are within 5% */
if (lh->lh[i].style == style && lh->lh[i].height * 0.95 <= height && lh->lh[i].height * 1.05 >= height)
{
/* Ensure that the average height is correct */
lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1);
lh->lh[i].count++;
return;
}
}
/* Otherwise extend (if required) and add it */
if (lh->cap == lh->len)
{
int newcap = (lh->cap ? lh->cap * 2 : 4);
lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
lh->cap = newcap;
}
lh->lh[lh->len].count = 1;
lh->lh[lh->len].height = height;
lh->lh[lh->len].style = style;
lh->len++;
}
static void
cull_line_heights(line_heights *lh)
{
int i, j, k;
#ifdef DEBUG_LINE_HEIGHTS
printf("Before culling:\n");
for (i = 0; i < lh->len; i++)
{
fz_text_style *style = lh->lh[i].style;
printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
}
#endif
for (i = 0; i < lh->len; i++)
{
fz_text_style *style = lh->lh[i].style;
int count = lh->lh[i].count;
int max = i;
/* Find the max for this style */
for (j = i+1; j < lh->len; j++)
{
if (lh->lh[j].style == style && lh->lh[j].count > count)
{
max = j;
count = lh->lh[j].count;
}
}
/* Destroy all the ones other than the max */
if (max != i)
{
lh->lh[i].count = count;
lh->lh[i].height = lh->lh[max].height;
lh->lh[max].count = 0;
}
j = i+1;
for (k = j; k < lh->len; k++)
{
if (lh->lh[k].style != style)
lh->lh[j++] = lh->lh[k];
}
lh->len = j;
}
#ifdef DEBUG_LINE_HEIGHTS
printf("After culling:\n");
for (i = 0; i < lh->len; i++)
{
fz_text_style *style = lh->lh[i].style;
printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
}
#endif
}
static float
line_height_for_style(line_heights *lh, fz_text_style *style)
{
int i;
for (i=0; i < lh->len; i++)
{
if (lh->lh[i].style == style)
return lh->lh[i].height;
}
return 0.0; /* Never reached */
}
static void
split_block(fz_context *ctx, fz_text_page *page, int block_num, int linenum)
{
int split_len;
if (page->len == page->cap)
{
int new_cap = fz_maxi(16, page->cap * 2);
page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
page->cap = new_cap;
}
memmove(page->blocks+block_num+1, page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks));
page->len++;
split_len = page->blocks[block_num].len - linenum;
page->blocks[block_num+1].bbox = page->blocks[block_num].bbox; /* FIXME! */
page->blocks[block_num+1].cap = 0;
page->blocks[block_num+1].len = 0;
page->blocks[block_num+1].lines = NULL;
page->blocks[block_num+1].lines = fz_malloc_array(ctx, split_len, sizeof(fz_text_line));
page->blocks[block_num+1].cap = page->blocks[block_num+1].len;
page->blocks[block_num+1].len = split_len;
page->blocks[block_num].len = linenum;
memcpy(page->blocks[block_num+1].lines, page->blocks[block_num].lines + linenum, split_len * sizeof(fz_text_line));
page->blocks[block_num+1].lines[0].distance = 0;
}
static inline int
is_unicode_wspace(int c)
{
return (c == 9 || /* TAB */
c == 0x0a || /* HT */
c == 0x0b || /* LF */
c == 0x0c || /* VT */
c == 0x0d || /* FF */
c == 0x20 || /* CR */
c == 0x85 || /* NEL */
c == 0xA0 || /* No break space */
c == 0x1680 || /* Ogham space mark */
c == 0x180E || /* Mongolian Vowel Separator */
c == 0x2000 || /* En quad */
c == 0x2001 || /* Em quad */
c == 0x2002 || /* En space */
c == 0x2003 || /* Em space */
c == 0x2004 || /* Three-per-Em space */
c == 0x2005 || /* Four-per-Em space */
c == 0x2006 || /* Five-per-Em space */
c == 0x2007 || /* Figure space */
c == 0x2008 || /* Punctuation space */
c == 0x2009 || /* Thin space */
c == 0x200A || /* Hair space */
c == 0x2028 || /* Line separator */
c == 0x2029 || /* Paragraph separator */
c == 0x202F || /* Narrow no-break space */
c == 0x205F || /* Medium mathematical space */
c == 0x3000); /* Ideographic space */
}
static inline int
is_unicode_bullet(int c)
{
/* The last 2 aren't strictly bullets, but will do for our usage here */
return (c == 0x2022 || /* Bullet */
c == 0x2023 || /* Triangular bullet */
c == 0x25e6 || /* White bullet */
c == 0x2043 || /* Hyphen bullet */
c == 0x2219 || /* Bullet operator */
c == 149 || /* Ascii bullet */
c == '*');
}
static inline int
is_number(int c)
{
return ((c >= '0' && c <= '9') ||
(c == '.'));
}
static inline int
is_latin_char(int c)
{
return ((c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z'));
}
static inline int
is_roman(int c)
{
return (c == 'i' || c == 'I' ||
c == 'v' || c == 'V' ||
c == 'x' || c == 'X' ||
c == 'l' || c == 'L' ||
c == 'c' || c == 'C' ||
c == 'm' || c == 'M');
}
static int
is_list_entry(fz_text_span *span, int *char_num_ptr, int span_num)
{
int char_num;
fz_text_char *chr;
/* First, skip over any whitespace */
for (char_num = 0; char_num < span->len; char_num++)
{
chr = &span->text[char_num];
if (!is_unicode_wspace(chr->c))
break;
}
*char_num_ptr = char_num;
if (span_num != 0 || char_num >= span->len)
return 0;
/* Now we check for various special cases, which we consider to mean
* that this is probably a list entry and therefore should always count
* as a separate paragraph (and hence not be entered in the line height
* table). */
chr = &span->text[char_num];
/* Is the first char on the line, a bullet point? */
if (is_unicode_bullet(chr->c))
return 1;
#ifdef SPOT_LINE_NUMBERS
/* Is the entire first span a number? Or does it start with a number
* followed by ) or : ? Allow to involve single latin chars too. */
if (is_number(chr->c) || is_latin_char(chr->c))
{
int cn = char_num;
int met_char = is_latin_char(chr->c);
for (cn = char_num+1; cn < span->len; cn++)
{
fz_text_char *chr2 = &span->text[cn];
if (is_latin_char(chr2->c) && !met_char)
{
met_char = 1;
continue;
}
met_char = 0;
if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c))
break;
else if (chr2->c == ')' || chr2->c == ':')
{
cn = span->len;
break;
}
}
if (cn == span->len)
return 1;
}
/* Is the entire first span a roman numeral? Or does it start with
* a roman numeral followed by ) or : ? */
if (is_roman(chr->c))
{
int cn = char_num;
for (cn = char_num+1; cn < span->len; cn++)
{
fz_text_char *chr2 = &span->text[cn];
if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c))
break;
else if (chr2->c == ')' || chr2->c == ':')
{
cn = span->len;
break;
}
}
if (cn == span->len)
return 1;
}
#endif
return 0;
}
typedef struct region_masks_s region_masks;
typedef struct region_mask_s region_mask;
typedef struct region_s region;
struct region_s
{
float start;
float stop;
float ave_start;
float ave_stop;
int align;
float colw;
};
struct region_mask_s
{
fz_context *ctx;
int freq;
fz_point blv;
int cap;
int len;
float size;
region *mask;
};
struct region_masks_s
{
fz_context *ctx;
int cap;
int len;
region_mask **mask;
};
static region_masks *
new_region_masks(fz_context *ctx)
{
region_masks *rms = fz_malloc_struct(ctx, region_masks);
rms->ctx = ctx;
rms->cap = 0;
rms->len = 0;
rms->mask = NULL;
return rms;
}
static void
free_region_mask(region_mask *rm)
{
if (!rm)
return;
fz_free(rm->ctx, rm->mask);
fz_free(rm->ctx, rm);
}
static void
free_region_masks(region_masks *rms)
{
int i;
if (!rms)
return;
for (i=0; i < rms->len; i++)
{
free_region_mask(rms->mask[i]);
}
fz_free(rms->ctx, rms->mask);
fz_free(rms->ctx, rms);
}
static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score)
{
int i1, i2;
int count = 0;
*score = 0;
if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
return 0;
for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
{
if (rm1->mask[i1].stop < rm2->mask[i2].start)
{
/* rm1's region is entirely before rm2's */
*score += rm1->mask[i1].stop - rm1->mask[i1].start;
i1++;
}
else if (rm1->mask[i1].start > rm2->mask[i2].stop)
{
/* rm2's region is entirely before rm1's */
*score += rm2->mask[i2].stop - rm2->mask[i2].start;
i2++;
}
else
{
float lscore, rscore;
if (rm1->mask[i1].start < rm2->mask[i2].start)
{
if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start)
return 0; /* Not compatible */
lscore = rm2->mask[i2].start - rm1->mask[i1].start;
}
else
{
if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start)
return 0; /* Not compatible */
lscore = rm1->mask[i1].start - rm2->mask[i2].start;
}
if (rm1->mask[i1].stop > rm2->mask[i2].stop)
{
if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop)
return 0; /* Not compatible */
rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
}
else
{
if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop)
return 0; /* Not compatible */
rscore = rm2->mask[i2].stop - rm1->mask[i1].stop;
}
/* In order to allow a region to merge, either the
* left, the right, or the centre must agree */
if (lscore < 1)
{
if (rscore < 1)
{
rscore = 0;
}
lscore = 0;
}
else if (rscore < 1)
{
rscore = 0;
}
else
{
/* Neither Left or right agree. Does the centre? */
float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop;
float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop;
if (fabsf(ave1-ave2) > 1)
{
/* Nothing agrees, so don't merge */
return 0;
}
lscore = 0;
rscore = 0;
}
*score += lscore + rscore;
/* These two regions could be merged */
i1++;
i2++;
}
count++;
}
count += rm1->len-i1 + rm2->len-i2;
return count;
}
static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score)
{
int i1, i2;
int close = 1;
*score = 0;
if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
return 0;
for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
{
if (rm1->mask[i1].stop < rm2->mask[i2].start)
{
/* rm1's region is entirely before rm2's */
*score += rm1->mask[i1].stop - rm1->mask[i1].start;
i1++;
}
else if (rm1->mask[i1].start > rm2->mask[i2].stop)
{
/* Not compatible */
return 0;
}
else
{
float lscore, rscore;
if (rm1->mask[i1].start > rm2->mask[i2].start)
{
/* Not compatible */
return 0;
}
if (rm1->mask[i1].stop < rm2->mask[i2].stop)
{
/* Not compatible */
return 0;
}
lscore = rm2->mask[i2].start - rm1->mask[i1].start;
rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
if (lscore < 1)
{
if (rscore < 1)
close++;
close++;
}
else if (rscore < 1)
close++;
else if (fabsf(lscore - rscore) < 1)
{
lscore = fabsf(lscore-rscore);
rscore = 0;
close++;
}
*score += lscore + rscore;
i1++;
i2++;
}
}
if (i1 < rm1->len)
{
/* Still more to go in rm1 */
if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop)
return 0;
}
else if (i2 < rm2->len)
{
/* Still more to go in rm2 */
if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop)
return 0;
}
return close;
}
static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen)
{
int o, i1, i2;
/* First, ensure that rm1 is long enough */
if (rm1->cap < newlen)
{
int newcap = rm1->cap ? rm1->cap : 2;
do
{
newcap *= 2;
}
while (newcap < newlen);
rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask));
rm1->cap = newcap;
}
/* Now run backwards along rm1, filling it out with the merged regions */
for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--)
{
/* So we read from i1 and i2 and store in o */
if (i1 < 0)
{
/* Just copy i2 */
rm1->mask[o] = rm2->mask[i2];
i2--;
}
else if (i2 < 0)
{
/* Just copy i1 */
rm1->mask[o] = rm1->mask[i1];
i1--;
}
else if (rm1->mask[i1].stop < rm2->mask[i2].start)
{
/* rm1's region is entirely before rm2's - copy rm2's */
rm1->mask[o] = rm2->mask[i2];
i2--;
}
else if (rm2->mask[i2].stop < rm1->mask[i1].start)
{
/* rm2's region is entirely before rm1's - copy rm1's */
rm1->mask[o] = rm1->mask[i1];
i1--;
}
else
{
/* We must be merging */
rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq);
rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq);
rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start);
rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop);
i1--;
i2--;
}
}
rm1->freq += rm2->freq;
rm1->len = newlen;
}
static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_text_line *line, region_mask *prev_match)
{
int i;
float best_score = 9999999;
float score;
int best = -1;
int best_count = 0;
/* If the 'previous match' matches, use it regardless. */
if (prev_match && region_mask_matches(prev_match, rm, &score))
{
return prev_match;
}
/* Run through and find the 'most compatible' region mask. We are
* guaranteed that there will always be at least one compatible one!
*/
for (i=0; i < rms->len; i++)
{
int count = region_mask_matches(rms->mask[i], rm, &score);
if (count > best_count || (count == best_count && (score < best_score || best == -1)))
{
best = i;
best_score = score;
best_count = count;
}
}
assert(best >= 0 && best < rms->len);
/* So we have the matching mask. */
return rms->mask[best];
}
#ifdef DEBUG_MASKS
static void
dump_region_mask(const region_mask *rm)
{
int j;
for (j = 0; j < rm->len; j++)
{
printf("%g->%g ", rm->mask[j].start, rm->mask[j].stop);
}
printf("* %d\n", rm->freq);
}
static void
dump_region_masks(const region_masks *rms)
{
int i;
for (i = 0; i < rms->len; i++)
{
region_mask *rm = rms->mask[i];
dump_region_mask(rm);
}
}
#endif
static void region_masks_add(region_masks *rms, region_mask *rm)
{
/* Add rm to rms */
if (rms->len == rms->cap)
{
int newcap = (rms->cap ? rms->cap * 2 : 4);
rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask));
rms->cap = newcap;
}
rms->mask[rms->len] = rm;
rms->len++;
}
static void region_masks_sort(region_masks *rms)
{
int i, j;
/* First calculate sizes */
for (i=0; i < rms->len; i++)
{
region_mask *rm = rms->mask[i];
float size = 0;
for (j=0; j < rm->len; j++)
{
size += rm->mask[j].stop - rm->mask[j].start;
}
rm->size = size;
}
/* Now, sort on size */
/* FIXME: bubble sort - use heapsort for efficiency */
for (i=0; i < rms->len-1; i++)
{
for (j=i+1; j < rms->len; j++)
{
if (rms->mask[i]->size < rms->mask[j]->size)
{
region_mask *tmp = rms->mask[i];
rms->mask[i] = rms->mask[j];
rms->mask[j] = tmp;
}
}
}
}
static void region_masks_merge(region_masks *rms, region_mask *rm)
{
int i;
float best_score = 9999999;
float score;
int best = -1;
int best_count = 0;
#ifdef DEBUG_MASKS
printf("\nAdding:\n");
dump_region_mask(rm);
printf("To:\n");
dump_region_masks(rms);
#endif
for (i=0; i < rms->len; i++)
{
int count = region_masks_mergeable(rms->mask[i], rm, &score);
if (count && (score < best_score || best == -1))
{
best = i;
best_count = count;
best_score = score;
}
}
if (best != -1)
{
region_mask_merge(rms->mask[best], rm, best_count);
#ifdef DEBUG_MASKS
printf("Merges to give:\n");
dump_region_masks(rms);
#endif
free_region_mask(rm);
return;
}
region_masks_add(rms, rm);
#ifdef DEBUG_MASKS
printf("Adding new one to give:\n");
dump_region_masks(rms);
#endif
}
static region_mask *
new_region_mask(fz_context *ctx, const fz_point *blv)
{
region_mask *rm = fz_malloc_struct(ctx, region_mask);
rm->ctx = ctx;
rm->freq = 1;
rm->blv = *blv;
rm->cap = 0;
rm->len = 0;
rm->mask = NULL;
return rm;
}
static void
region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end)
{
/* We project min and max down onto the blv */
float s = min->x * rm->blv.x + min->y * rm->blv.y;
float e = max->x * rm->blv.x + max->y * rm->blv.y;
if (s > e)
{
*start = e;
*end = s;
}
else
{
*start = s;
*end = e;
}
}
static void
region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max)
{
float start, end;
int i, j;
region_mask_project(rm, min, max, &start, &end);
/* Now add start/end into our region list. Typically we will be adding
* to the end of the region list, so search from there backwards. */
for (i = rm->len; i > 0;)
{
if (start > rm->mask[i-1].stop)
break;
i--;
}
/* So we know that our interval can only affect list items >= i.
* We know that start is after our previous end. */
if (i == rm->len || end < rm->mask[i].start)
{
/* Insert new one. No overlap. No merging */
if (rm->len == rm->cap)
{
int newcap = (rm->cap ? rm->cap * 2 : 4);
rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask));
rm->cap = newcap;
}
if (rm->len > i)
memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask));
rm->mask[i].ave_start = start;
rm->mask[i].ave_stop = end;
rm->mask[i].start = start;
rm->mask[i].stop = end;
rm->len++;
}
else
{
/* Extend current one down. */
rm->mask[i].ave_start = start;
rm->mask[i].start = start;
if (rm->mask[i].stop < end)
{
rm->mask[i].stop = end;
rm->mask[i].ave_stop = end;
/* Our region may now extend upwards too far */
i++;
j = i;
while (j < rm->len && rm->mask[j].start <= end)
{
rm->mask[i-1].stop = end = rm->mask[j].stop;
j++;
}
if (i != j)
{
/* Move everything from j down to i */
while (j < rm->len)
{
rm->mask[i++] = rm->mask[j++];
}
}
rm->len -= j-i;
}
}
}
static int
region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_)
{
float start, end, left, right;
int i;
region_mask_project(rm, min, max, &start, &end);
for (i = 0; i < rm->len; i++)
{
/* The use of MY_EPSILON here is because we might be matching
* start/end values calculated with slightly different blv's */
if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end)
break;
}
if (i >= rm->len)
{
*align = 0;
*colw = 0;
return 0;
}
left = start - rm->mask[i].start;
right = rm->mask[i].stop - end;
if (left < 1 && right < 1)
*align = rm->mask[i].align;
else if (left*2 <= right)
*align = 0; /* Left */
else if (right * 2 < left)
*align = 2; /* Right */
else
*align = 1;
*left_ = left;
*colw = rm->mask[i].colw;
return i;
}
static void
region_mask_alignment(region_mask *rm)
{
int i;
float width = 0;
for (i = 0; i < rm->len; i++)
{
width += rm->mask[i].stop - rm->mask[i].start;
}
for (i = 0; i < rm->len; i++)
{
region *r = &rm->mask[i];
float left = r->ave_start - r->start;
float right = r->stop - r->ave_stop;
if (left*2 <= right)
r->align = 0; /* Left */
else if (right * 2 < left)
r->align = 2; /* Right */
else
r->align = 1;
r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width;
}
}
static void
region_masks_alignment(region_masks *rms)
{
int i;
for (i = 0; i < rms->len; i++)
{
region_mask_alignment(rms->mask[i]);
}
}
static int
is_unicode_hyphen(int c)
{
/* We omit 0x2011 (Non breaking hyphen) and 0x2043 (Hyphen Bullet)
* from this list. */
return (c == '-' ||
c == 0x2010 || /* Hyphen */
c == 0x002d || /* Hyphen-Minus */
c == 0x00ad || /* Soft hyphen */
c == 0x058a || /* Armenian Hyphen */
c == 0x1400 || /* Canadian Syllabive Hyphen */
c == 0x1806); /* Mongolian Todo soft hyphen */
}
static int
is_unicode_hyphenatable(int c)
{
/* This is a pretty ad-hoc collection. It may need tuning. */
return ((c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= 0x00c0 && c <= 0x00d6) ||
(c >= 0x00d8 && c <= 0x00f6) ||
(c >= 0x00f8 && c <= 0x02af) ||
(c >= 0x1d00 && c <= 0x1dbf) ||
(c >= 0x1e00 && c <= 0x1eff) ||
(c >= 0x2c60 && c <= 0x2c7f) ||
(c >= 0xa722 && c <= 0xa78e) ||
(c >= 0xa790 && c <= 0xa793) ||
(c >= 0xa7a8 && c <= 0xa7af) ||
(c >= 0xfb00 && c <= 0xfb07) ||
(c >= 0xff21 && c <= 0xff3a) ||
(c >= 0xff41 && c <= 0xff5a));
}
static void
dehyphenate(fz_text_span *s1, fz_text_span *s2)
{
int i;
for (i = s1->len-1; i > 0; i--)
if (!is_unicode_wspace(s1->text[i].c))
break;
/* Can't leave an empty span. */
if (i == 0)
return;
if (!is_unicode_hyphen(s1->text[i].c))
return;
if (!is_unicode_hyphenatable(s1->text[i-1].c))
return;
if (!is_unicode_hyphenatable(s2->text[0].c))
return;
s1->len = i;
s2->spacing = 0;
}
void
fz_text_analysis(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
{
fz_text_block *block;
fz_text_line *line;
line_heights *lh;
region_masks *rms;
int block_num;
/* Simple paragraph analysis; look for the most common 'inter line'
* spacing. This will be assumed to be our line spacing. Anything
* more than 25% wider than this will be assumed to be a paragraph
* space. */
/* Step 1: Gather the line height information */
lh = new_line_heights(ctx);
for (block = page->blocks; block < page->blocks + page->len; block++)
{
for (line = block->lines; line < block->lines + block->len; line++)
{
/* For every style in the line, add lineheight to the
* record for that style. FIXME: This is a nasty n^2
* algorithm at the moment. */
int span_num;
fz_text_style *style = NULL;
if (line->distance == 0)
continue;
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
int char_num;
if (is_list_entry(span, &char_num, span_num))
goto list_entry;
for (; char_num < span->len; char_num++)
{
fz_text_char *chr = &span->text[char_num];
/* Ignore any whitespace chars */
if (is_unicode_wspace(chr->c))
continue;
if (chr->style != style)
{
/* Have we had this style before? */
int match = 0;
int span_num2;
for (span_num2 = 0; span_num2 < span_num; span_num2++)
{
fz_text_span *span2 = line->spans[span_num2];
int char_num2;
for (char_num2 = 0; char_num2 < span2->len; char_num2++)
{
fz_text_char *chr2 = &span2->text[char_num2];
if (chr2->style == chr->style)
{
match = 1;
break;
}
}
}
if (char_num > 0 && match == 0)
{
fz_text_span *span2 = line->spans[span_num];
int char_num2;
for (char_num2 = 0; char_num2 < char_num; char_num2++)
{
fz_text_char *chr2 = &span2->text[char_num2];
if (chr2->style == chr->style)
{
match = 1;
break;
}
}
}
if (match == 0)
insert_line_height(lh, chr->style, line->distance);
style = chr->style;
}
}
list_entry:
{}
}
}
}
/* Step 2: Find the most popular line height for each style */
cull_line_heights(lh);
/* Step 3: Run through the blocks, breaking each block into two if
* the line height isn't right. */
for (block_num = 0; block_num < page->len; block_num++)
{
int line_num;
block = &page->blocks[block_num];
for (line_num = 0; line_num < block->len; line_num++)
{
/* For every style in the line, check to see if lineheight
* is correct for that style. FIXME: We check each style
* more than once, currently. */
int span_num;
int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */
fz_text_style *style = NULL;
line = &block->lines[line_num];
if (line->distance == 0)
continue;
#ifdef DEBUG_LINE_HEIGHTS
printf("line height=%g nspans=%d\n", line->distance, line->len);
#endif
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
int char_num;
if (is_list_entry(span, &char_num, span_num))
goto force_paragraph;
/* Now we do the rest of the line */
for (; char_num < span->len; char_num++)
{
fz_text_char *chr = &span->text[char_num];
/* Ignore any whitespace chars */
if (is_unicode_wspace(chr->c))
continue;
if (chr->style != style)
{
float proper_step = line_height_for_style(lh, chr->style);
if (proper_step * 0.95 <= line->distance && line->distance <= proper_step * 1.05)
{
ok = 1;
break;
}
style = chr->style;
}
}
if (ok)
break;
}
if (!ok)
{
force_paragraph:
split_block(ctx, page, block_num, line_num);
break;
}
}
}
free_line_heights(lh);
/* Simple line region analysis:
* For each line:
* form a list of 'start/stop' points (henceforth a 'region mask')
* find the normalised baseline vector for the line.
* Store the region mask and baseline vector.
* Collate lines that have compatible region masks and identical
* baseline vectors.
* If the collated masks are column-like, then split into columns.
* Otherwise split into tables.
*/
rms = new_region_masks(ctx);
/* Step 1: Form the region masks and store them into a list with the
* normalised baseline vectors. */
for (block = page->blocks; block < page->blocks + page->len; block++)
{
for (line = block->lines; line < block->lines + block->len; line++)
{
fz_point blv;
region_mask *rm;
int span_num;
#ifdef DEBUG_MASKS
printf("Line: ");
dump_line(line);
#endif
blv = line->spans[0]->max;
blv.x -= line->spans[0]->min.x;
blv.y -= line->spans[0]->min.y;
normalise(&blv);
rm = new_region_mask(ctx, &blv);
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
fz_point *region_min = &span->min;
fz_point *region_max = &span->max;
/* Treat adjacent spans as one big region */
while (span_num+1 < line->len && line->spans[span_num+1]->spacing < 1.5)
{
span_num++;
span = line->spans[span_num];
region_max = &span->max;
}
region_mask_add(rm, region_min, region_max);
}
#ifdef DEBUG_MASKS
dump_region_mask(rm);
#endif
region_masks_add(rms, rm);
}
}
/* Step 2: Sort the region_masks by size of masked region */
region_masks_sort(rms);
#ifdef DEBUG_MASKS
printf("Sorted list of regions:\n");
dump_region_masks(rms);
#endif
/* Step 3: Merge the region masks where possible (large ones first) */
{
int i;
region_masks *rms2;
rms2 = new_region_masks(ctx);
for (i=0; i < rms->len; i++)
{
region_mask *rm = rms->mask[i];
rms->mask[i] = NULL;
region_masks_merge(rms2, rm);
}
free_region_masks(rms);
rms = rms2;
}
#ifdef DEBUG_MASKS
printf("Merged list of regions:\n");
dump_region_masks(rms);
#endif
/* Step 4: Figure out alignment */
region_masks_alignment(rms);
/* Step 5: At this point, we should probably look at the region masks
* to try to guess which ones represent columns on the page. With our
* current code, we could only get blocks of lines that span 2 or more
* columns if the PDF producer wrote text out horizontally across 2
* or more columns, and we've never seen that (yet!). So we skip this
* step for now. */
/* Step 6: Run through the lines again, deciding which ones fit into
* which region mask. */
{
region_mask *prev_match = NULL;
for (block = page->blocks; block < page->blocks + page->len; block++)
{
for (line = block->lines; line < block->lines + block->len; line++)
{
fz_point blv;
region_mask *rm;
int span_num;
region_mask *match;
blv = line->spans[0]->max;
blv.x -= line->spans[0]->min.x;
blv.y -= line->spans[0]->min.y;
normalise(&blv);
#ifdef DEBUG_MASKS
dump_line(line);
#endif
rm = new_region_mask(ctx, &blv);
for (span_num = 0; span_num < line->len; span_num++)
{
fz_text_span *span = line->spans[span_num];
fz_point *region_min = &span->min;
fz_point *region_max = &span->max;
/* Treat adjacent spans as one big region */
while (span_num+1 < line->len && line->spans[span_num+1]->spacing < 1.5)
{
span_num++;
span = line->spans[span_num];
region_max = &span->max;
}
region_mask_add(rm, region_min, region_max);
}
#ifdef DEBUG_MASKS
printf("Mask: ");
dump_region_mask(rm);
#endif
match = region_masks_match(rms, rm, line, prev_match);
prev_match = match;
#ifdef DEBUG_MASKS
printf("Matches: ");
dump_region_mask(match);
#endif
free_region_mask(rm);
for (span_num = 0; span_num < line->len; )
{
fz_text_span *span = line->spans[span_num];
fz_point *region_min = &span->min;
fz_point *region_max = &span->max;
int sn;
int col, align;
float colw, left;
/* Treat adjacent spans as one big region */
#ifdef DEBUG_ALIGN
dump_span(line->spans[span_num]);
#endif
for (sn = span_num+1; sn < line->len && line->spans[sn]->spacing < 1.5; sn++)
{
region_max = &line->spans[sn]->max;
#ifdef DEBUG_ALIGN
dump_span(line->spans[sn]);
#endif
}
col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
#ifdef DEBUG_ALIGN
printf(" = col%d colw=%g align=%d\n", col, colw, align);
#endif
do
{
line->spans[span_num]->column = col;
line->spans[span_num]->align = align;
line->spans[span_num]->indent = left;
line->spans[span_num]->column_width = colw;
span_num++;
}
while (span_num < sn);
}
line->region = match;
}
}
free_region_masks(rms);
}
/* Step 7: Collate lines within a block that share the same region
* mask. */
for (block = page->blocks; block < page->blocks + page->len; block++)
{
int line_num;
int prev_line_num;
int last_from = -1;
/* First merge lines. This may leave empty lines behind. */
for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
{
fz_text_line *prev_line;
line = &block->lines[line_num];
if (line->len == 0)
continue;
prev_line = &block->lines[prev_line_num];
if (prev_line->region == line->region)
{
int in1, in2, newlen, i, col;
float indent;
/* We only merge lines if the second line
* only uses 1 of the columns. */
col = line->spans[0]->column;
/* Copy the left value for the first span
* in the first column in this line forward
* for all the rest of the spans in the same
* column. */
indent = line->spans[0]->indent;
for (i = 1; i < line->len; i++)
{
if (col != line->spans[i]->column)
break;
line->spans[i]->indent = indent;
}
if (i != line->len)
{
prev_line_num = line_num;
continue;
}
/* Merge line into prev_line */
newlen = prev_line->len + line->len;
if (newlen > prev_line->cap)
{
int newcap = prev_line->cap ? prev_line->cap : 2;
do
{
newcap *= 2;
}
while (newcap < newlen);
prev_line->spans = fz_resize_array(ctx, prev_line->spans, newcap, sizeof(*prev_line->spans));
prev_line->cap = newcap;
}
in1 = prev_line->len-1;
in2 = line->len-1;
prev_line->len = newlen;
for (; in1 >= 0 || in2 >= 0; )
{
newlen--;
if (in1 < 0 || (in2 >= 0 && line->spans[in2]->column >= prev_line->spans[in1]->column))
{
prev_line->spans[newlen] = line->spans[in2];
in2--;
last_from = 1;
}
else
{
prev_line->spans[newlen] = prev_line->spans[in1];
in1--;
if (last_from == 1)
{
prev_line->spans[newlen+1]->spacing = 1;
dehyphenate(prev_line->spans[newlen], prev_line->spans[newlen+1]);
last_from = 0;
}
}
}
/* Leave line empty */
line->len = 0;
}
else
prev_line_num = line_num;
}
/* Now get rid of the empty lines */
for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
{
line = &block->lines[line_num];
if (line->len == 0)
fz_free(ctx, line->spans);
else
block->lines[prev_line_num++] = *line;
}
block->len = prev_line_num;
/* Now try to spot indents */
for (line_num = 0; line_num < block->len; line_num++)
{
int span_num, sn, col;
line = &block->lines[line_num];
/* Run through the spans... */
span_num = 0;
{
float indent = 0;
/* For each set of spans that share the same
* column... */
col = line->spans[span_num]->column;
#ifdef DEBUG_INDENTS
printf("Indent %g: ", line->spans[span_num]->indent);
dump_span(line->spans[span_num]);
printf("\n");
#endif
/* find the average indent of all but the first.. */
for (sn = span_num+1; sn < line->len && line->spans[sn]->column == col; sn++)
{
#ifdef DEBUG_INDENTS
printf("Indent %g: ", line->spans[sn]->indent);
dump_span(line->spans[sn]);
printf("\n");
#endif
indent += line->spans[sn]->indent;
line->spans[sn]->indent = 0;
}
if (sn > span_num+1)
indent /= sn-(span_num+1);
/* And compare this indent with the first one... */
#ifdef DEBUG_INDENTS
printf("Average indent %g ", indent);
#endif
indent -= line->spans[span_num]->indent;
#ifdef DEBUG_INDENTS
printf("delta %g ", indent);
#endif
if (fabsf(indent) < 1)
{
/* No indent worth speaking of */
indent = 0;
}
#ifdef DEBUG_INDENTS
printf("recorded %g\n", indent);
#endif
line->spans[span_num]->indent = indent;
span_num = sn;
}
for (; span_num < line->len; span_num++)
{
line->spans[span_num]->indent = 0;
}
}
}
}