Rearrange source files.

author: Tor Andersson <tor.andersson@artifex.com> 2013-06-19 15:29:44 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2013-06-20 16:45:35 +0200
commit: 0a927854a10e1e6b9770a81e2e1d9f3093631757 (patch)
tree: 3d65d820d9fdba2d0d394d99c36290c851b78ca0 /source/fitz/stext-paragraph.c
parent: 1ae8f19179c5f0f8c6352b3c7855465325d5449a (diff)
download: mupdf-0a927854a10e1e6b9770a81e2e1d9f3093631757.tar.xz
1 files changed, 1500 insertions, 0 deletions
diff --git a/source/fitz/stext-paragraph.c b/source/fitz/stext-paragraph.c
new file mode 100644
index 00000000..51062938
--- /dev/null
+++ b/source/fitz/stext-paragraph.c
@@ -0,0 +1,1500 @@
+#include "mupdf/fitz.h"
+
+/* Assemble span soup into blocks and lines. */
+
+#define MY_EPSILON 0.001f
+
+#undef DEBUG_LINE_HEIGHTS
+#undef DEBUG_MASKS
+#undef DEBUG_ALIGN
+#undef DEBUG_INDENTS
+
+#undef SPOT_LINE_NUMBERS
+
+typedef struct line_height_s
+{
+	float height;
+	int count;
+	fz_text_style *style;
+} line_height;
+
+typedef struct line_heights_s
+{
+	fz_context *ctx;
+	int cap;
+	int len;
+	line_height *lh;
+} line_heights;
+
+static line_heights *
+new_line_heights(fz_context *ctx)
+{
+	line_heights *lh = fz_malloc_struct(ctx, line_heights);
+	lh->ctx = ctx;
+	return lh;
+}
+
+static void
+free_line_heights(line_heights *lh)
+{
+	if (!lh)
+		return;
+	fz_free(lh->ctx, lh->lh);
+	fz_free(lh->ctx, lh);
+}
+
+static void
+insert_line_height(line_heights *lh, fz_text_style *style, float height)
+{
+	int i;
+
+#ifdef DEBUG_LINE_HEIGHTS
+	printf("style=%x height=%g\n", style, height);
+#endif
+
+	/* If we have one already, add it in */
+	for (i=0; i < lh->len; i++)
+	{
+		/* Match if we are within 5% */
+		if (lh->lh[i].style == style && lh->lh[i].height * 0.95 <= height && lh->lh[i].height * 1.05 >= height)
+		{
+			/* Ensure that the average height is correct */
+			lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1);
+			lh->lh[i].count++;
+			return;
+		}
+	}
+
+	/* Otherwise extend (if required) and add it */
+	if (lh->cap == lh->len)
+	{
+		int newcap = (lh->cap ? lh->cap * 2 : 4);
+		lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
+		lh->cap = newcap;
+	}
+
+	lh->lh[lh->len].count = 1;
+	lh->lh[lh->len].height = height;
+	lh->lh[lh->len].style = style;
+	lh->len++;
+}
+
+static void
+cull_line_heights(line_heights *lh)
+{
+	int i, j, k;
+
+#ifdef DEBUG_LINE_HEIGHTS
+	printf("Before culling:\n");
+	for (i = 0; i < lh->len; i++)
+	{
+		fz_text_style *style = lh->lh[i].style;
+		printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
+	}
+#endif
+	for (i = 0; i < lh->len; i++)
+	{
+		fz_text_style *style = lh->lh[i].style;
+		int count = lh->lh[i].count;
+		int max = i;
+
+		/* Find the max for this style */
+		for (j = i+1; j < lh->len; j++)
+		{
+			if (lh->lh[j].style == style && lh->lh[j].count > count)
+			{
+				max = j;
+				count = lh->lh[j].count;
+			}
+		}
+
+		/* Destroy all the ones other than the max */
+		if (max != i)
+		{
+			lh->lh[i].count = count;
+			lh->lh[i].height = lh->lh[max].height;
+			lh->lh[max].count = 0;
+		}
+		j = i+1;
+		for (k = j; k < lh->len; k++)
+		{
+			if (lh->lh[k].style != style)
+				lh->lh[j++] = lh->lh[k];
+		}
+		lh->len = j;
+	}
+#ifdef DEBUG_LINE_HEIGHTS
+	printf("After culling:\n");
+	for (i = 0; i < lh->len; i++)
+	{
+		fz_text_style *style = lh->lh[i].style;
+		printf("style=%x height=%g count=%d\n", style, lh->lh[i].height, lh->lh[i].count);
+	}
+#endif
+}
+
+static float
+line_height_for_style(line_heights *lh, fz_text_style *style)
+{
+	int i;
+
+	for (i=0; i < lh->len; i++)
+	{
+		if (lh->lh[i].style == style)
+			return lh->lh[i].height;
+	}
+	return 0.0; /* Never reached */
+}
+
+static void
+split_block(fz_context *ctx, fz_text_page *page, int block_num, int linenum)
+{
+	int split_len;
+	fz_text_block *block, *block2;
+
+	if (page->len == page->cap)
+	{
+		int new_cap = fz_maxi(16, page->cap * 2);
+		page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
+		page->cap = new_cap;
+	}
+
+	memmove(page->blocks+block_num+1, page->blocks+block_num, (page->len - block_num)*sizeof(*page->blocks));
+	page->len++;
+
+	block2 = fz_malloc_struct(ctx, fz_text_block);
+	block = page->blocks[block_num].u.text;
+
+	page->blocks[block_num+1].type = FZ_PAGE_BLOCK_TEXT;
+	page->blocks[block_num+1].u.text = block2;
+	split_len = block->len - linenum;
+	block2->bbox = block->bbox; /* FIXME! */
+	block2->cap = 0;
+	block2->len = 0;
+	block2->lines = NULL;
+	block2->lines = fz_malloc_array(ctx, split_len, sizeof(fz_text_line));
+	block2->cap = block2->len;
+	block2->len = split_len;
+	block->len = linenum;
+	memcpy(block2->lines, block->lines + linenum, split_len * sizeof(fz_text_line));
+	block2->lines[0].distance = 0;
+}
+
+static inline int
+is_unicode_wspace(int c)
+{
+	return (c == 9 || /* TAB */
+		c == 0x0a || /* HT */
+		c == 0x0b || /* LF */
+		c == 0x0c || /* VT */
+		c == 0x0d || /* FF */
+		c == 0x20 || /* CR */
+		c == 0x85 || /* NEL */
+		c == 0xA0 || /* No break space */
+		c == 0x1680 || /* Ogham space mark */
+		c == 0x180E || /* Mongolian Vowel Separator */
+		c == 0x2000 || /* En quad */
+		c == 0x2001 || /* Em quad */
+		c == 0x2002 || /* En space */
+		c == 0x2003 || /* Em space */
+		c == 0x2004 || /* Three-per-Em space */
+		c == 0x2005 || /* Four-per-Em space */
+		c == 0x2006 || /* Five-per-Em space */
+		c == 0x2007 || /* Figure space */
+		c == 0x2008 || /* Punctuation space */
+		c == 0x2009 || /* Thin space */
+		c == 0x200A || /* Hair space */
+		c == 0x2028 || /* Line separator */
+		c == 0x2029 || /* Paragraph separator */
+		c == 0x202F || /* Narrow no-break space */
+		c == 0x205F || /* Medium mathematical space */
+		c == 0x3000); /* Ideographic space */
+}
+
+static inline int
+is_unicode_bullet(int c)
+{
+	/* The last 2 aren't strictly bullets, but will do for our usage here */
+	return (c == 0x2022 || /* Bullet */
+		c == 0x2023 || /* Triangular bullet */
+		c == 0x25e6 || /* White bullet */
+		c == 0x2043 || /* Hyphen bullet */
+		c == 0x2219 || /* Bullet operator */
+		c == 149 || /* Ascii bullet */
+		c == '*');
+}
+
+static inline int
+is_number(int c)
+{
+	return ((c >= '0' && c <= '9') ||
+		(c == '.'));
+}
+
+static inline int
+is_latin_char(int c)
+{
+	return ((c >= 'A' && c <= 'Z') ||
+		(c >= 'a' && c <= 'z'));
+}
+
+static inline int
+is_roman(int c)
+{
+	return (c == 'i' || c == 'I' ||
+		c == 'v' || c == 'V' ||
+		c == 'x' || c == 'X' ||
+		c == 'l' || c == 'L' ||
+		c == 'c' || c == 'C' ||
+		c == 'm' || c == 'M');
+}
+
+static int
+is_list_entry(fz_text_line *line, fz_text_span *span, int *char_num_ptr)
+{
+	int char_num;
+	fz_text_char *chr;
+
+	/* First, skip over any whitespace */
+	for (char_num = 0; char_num < span->len; char_num++)
+	{
+		chr = &span->text[char_num];
+		if (!is_unicode_wspace(chr->c))
+			break;
+	}
+	*char_num_ptr = char_num;
+
+	if (span != line->first_span || char_num >= span->len)
+		return 0;
+
+	/* Now we check for various special cases, which we consider to mean
+	 * that this is probably a list entry and therefore should always count
+	 * as a separate paragraph (and hence not be entered in the line height
+	 * table). */
+	chr = &span->text[char_num];
+
+	/* Is the first char on the line, a bullet point? */
+	if (is_unicode_bullet(chr->c))
+		return 1;
+
+#ifdef SPOT_LINE_NUMBERS
+	/* Is the entire first span a number? Or does it start with a number
+	 * followed by ) or : ? Allow to involve single latin chars too. */
+	if (is_number(chr->c) || is_latin_char(chr->c))
+	{
+		int cn = char_num;
+		int met_char = is_latin_char(chr->c);
+		for (cn = char_num+1; cn < span->len; cn++)
+		{
+			fz_text_char *chr2 = &span->text[cn];
+
+			if (is_latin_char(chr2->c) && !met_char)
+			{
+				met_char = 1;
+				continue;
+			}
+			met_char = 0;
+			if (!is_number(chr2->c) && !is_unicode_wspace(chr2->c))
+				break;
+			else if (chr2->c == ')' || chr2->c == ':')
+			{
+				cn = span->len;
+				break;
+			}
+		}
+		if (cn == span->len)
+			return 1;
+	}
+
+	/* Is the entire first span a roman numeral? Or does it start with
+	 * a roman numeral followed by ) or : ? */
+	if (is_roman(chr->c))
+	{
+		int cn = char_num;
+		for (cn = char_num+1; cn < span->len; cn++)
+		{
+			fz_text_char *chr2 = &span->text[cn];
+
+			if (!is_roman(chr2->c) && !is_unicode_wspace(chr2->c))
+				break;
+			else if (chr2->c == ')' || chr2->c == ':')
+			{
+				cn = span->len;
+				break;
+			}
+		}
+		if (cn == span->len)
+			return 1;
+	}
+#endif
+	return 0;
+}
+
+typedef struct region_masks_s region_masks;
+
+typedef struct region_mask_s region_mask;
+
+typedef struct region_s region;
+
+struct region_s
+{
+	float start;
+	float stop;
+	float ave_start;
+	float ave_stop;
+	int align;
+	float colw;
+};
+
+struct region_mask_s
+{
+	fz_context *ctx;
+	int freq;
+	fz_point blv;
+	int cap;
+	int len;
+	float size;
+	region *mask;
+};
+
+struct region_masks_s
+{
+	fz_context *ctx;
+	int cap;
+	int len;
+	region_mask **mask;
+};
+
+static region_masks *
+new_region_masks(fz_context *ctx)
+{
+	region_masks *rms = fz_malloc_struct(ctx, region_masks);
+	rms->ctx = ctx;
+	rms->cap = 0;
+	rms->len = 0;
+	rms->mask = NULL;
+	return rms;
+}
+
+static void
+free_region_mask(region_mask *rm)
+{
+	if (!rm)
+		return;
+	fz_free(rm->ctx, rm->mask);
+	fz_free(rm->ctx, rm);
+}
+
+static void
+free_region_masks(region_masks *rms)
+{
+	int i;
+
+	if (!rms)
+		return;
+	for (i=0; i < rms->len; i++)
+	{
+		free_region_mask(rms->mask[i]);
+	}
+	fz_free(rms->ctx, rms->mask);
+	fz_free(rms->ctx, rms);
+}
+
+static int region_masks_mergeable(const region_mask *rm1, const region_mask *rm2, float *score)
+{
+	int i1, i2;
+	int count = 0;
+
+	*score = 0;
+	if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
+		return 0;
+
+	for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
+	{
+		if (rm1->mask[i1].stop < rm2->mask[i2].start)
+		{
+			/* rm1's region is entirely before rm2's */
+			*score += rm1->mask[i1].stop - rm1->mask[i1].start;
+			i1++;
+		}
+		else if (rm1->mask[i1].start > rm2->mask[i2].stop)
+		{
+			/* rm2's region is entirely before rm1's */
+			*score += rm2->mask[i2].stop - rm2->mask[i2].start;
+			i2++;
+		}
+		else
+		{
+			float lscore, rscore;
+			if (rm1->mask[i1].start < rm2->mask[i2].start)
+			{
+				if (i2 > 0 && rm2->mask[i2-1].stop >= rm1->mask[i1].start)
+					return 0; /* Not compatible */
+				lscore = rm2->mask[i2].start - rm1->mask[i1].start;
+			}
+			else
+			{
+				if (i1 > 0 && rm1->mask[i1-1].stop >= rm2->mask[i2].start)
+					return 0; /* Not compatible */
+				lscore = rm1->mask[i1].start - rm2->mask[i2].start;
+			}
+			if (rm1->mask[i1].stop > rm2->mask[i2].stop)
+			{
+				if (i2+1 < rm2->len && rm2->mask[i2+1].start <= rm1->mask[i1].stop)
+					return 0; /* Not compatible */
+				rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
+			}
+			else
+			{
+				if (i1+1 < rm1->len && rm1->mask[i1+1].start <= rm2->mask[i2].stop)
+					return 0; /* Not compatible */
+				rscore = rm2->mask[i2].stop - rm1->mask[i1].stop;
+			}
+			/* In order to allow a region to merge, either the
+			 * left, the right, or the centre must agree */
+			if (lscore < 1)
+			{
+				if (rscore < 1)
+				{
+					rscore = 0;
+				}
+				lscore = 0;
+			}
+			else if (rscore < 1)
+			{
+				rscore = 0;
+			}
+			else
+			{
+				/* Neither Left or right agree. Does the centre? */
+				float ave1 = rm1->mask[i1].start + rm1->mask[i1].stop;
+				float ave2 = rm2->mask[i2].start + rm2->mask[i2].stop;
+				if (fabsf(ave1-ave2) > 1)
+				{
+					/* Nothing agrees, so don't merge */
+					return 0;
+				}
+				lscore = 0;
+				rscore = 0;
+			}
+			*score += lscore + rscore;
+			/* These two regions could be merged */
+			i1++;
+			i2++;
+		}
+		count++;
+	}
+	count += rm1->len-i1 + rm2->len-i2;
+	return count;
+}
+
+static int region_mask_matches(const region_mask *rm1, const region_mask *rm2, float *score)
+{
+	int i1, i2;
+	int close = 1;
+
+	*score = 0;
+	if (fabsf(rm1->blv.x-rm2->blv.x) >= MY_EPSILON || fabsf(rm1->blv.y-rm2->blv.y) >= MY_EPSILON)
+		return 0;
+
+	for (i1 = 0, i2 = 0; i1 < rm1->len && i2 < rm2->len; )
+	{
+		if (rm1->mask[i1].stop < rm2->mask[i2].start)
+		{
+			/* rm1's region is entirely before rm2's */
+			*score += rm1->mask[i1].stop - rm1->mask[i1].start;
+			i1++;
+		}
+		else if (rm1->mask[i1].start > rm2->mask[i2].stop)
+		{
+			/* Not compatible */
+			return 0;
+		}
+		else
+		{
+			float lscore, rscore;
+			if (rm1->mask[i1].start > rm2->mask[i2].start)
+			{
+				/* Not compatible */
+				return 0;
+			}
+			if (rm1->mask[i1].stop < rm2->mask[i2].stop)
+			{
+				/* Not compatible */
+				return 0;
+			}
+			lscore = rm2->mask[i2].start - rm1->mask[i1].start;
+			rscore = rm1->mask[i1].stop - rm2->mask[i2].stop;
+			if (lscore < 1)
+			{
+				if (rscore < 1)
+					close++;
+				close++;
+			}
+			else if (rscore < 1)
+				close++;
+			else if (fabsf(lscore - rscore) < 1)
+			{
+				lscore = fabsf(lscore-rscore);
+				rscore = 0;
+				close++;
+			}
+			*score += lscore + rscore;
+			i1++;
+			i2++;
+		}
+	}
+	if (i1 < rm1->len)
+	{
+		/* Still more to go in rm1 */
+		if (rm1->mask[i1].start < rm2->mask[rm2->len-1].stop)
+			return 0;
+	}
+	else if (i2 < rm2->len)
+	{
+		/* Still more to go in rm2 */
+		if (rm2->mask[i2].start < rm1->mask[rm1->len-1].stop)
+			return 0;
+	}
+
+	return close;
+}
+
+static void region_mask_merge(region_mask *rm1, const region_mask *rm2, int newlen)
+{
+	int o, i1, i2;
+
+	/* First, ensure that rm1 is long enough */
+	if (rm1->cap < newlen)
+	{
+		int newcap = rm1->cap ? rm1->cap : 2;
+		do
+		{
+			newcap *= 2;
+		}
+		while (newcap < newlen);
+		rm1->mask = fz_resize_array(rm1->ctx, rm1->mask, newcap, sizeof(*rm1->mask));
+		rm1->cap = newcap;
+	}
+
+	/* Now run backwards along rm1, filling it out with the merged regions */
+	for (o = newlen-1, i1 = rm1->len-1, i2 = rm2->len-1; o >= 0; o--)
+	{
+		/* So we read from i1 and i2 and store in o */
+		if (i1 < 0)
+		{
+			/* Just copy i2 */
+			rm1->mask[o] = rm2->mask[i2];
+			i2--;
+		}
+		else if (i2 < 0)
+		{
+			/* Just copy i1 */
+			rm1->mask[o] = rm1->mask[i1];
+			i1--;
+		}
+		else if (rm1->mask[i1].stop < rm2->mask[i2].start)
+		{
+			/* rm1's region is entirely before rm2's - copy rm2's */
+			rm1->mask[o] = rm2->mask[i2];
+			i2--;
+		}
+		else if (rm2->mask[i2].stop < rm1->mask[i1].start)
+		{
+			/* rm2's region is entirely before rm1's - copy rm1's */
+			rm1->mask[o] = rm1->mask[i1];
+			i1--;
+		}
+		else
+		{
+			/* We must be merging */
+			rm1->mask[o].ave_start = (rm1->mask[i1].start * rm1->freq + rm2->mask[i2].start * rm2->freq)/(rm1->freq + rm2->freq);
+			rm1->mask[o].ave_stop = (rm1->mask[i1].stop * rm1->freq + rm2->mask[i2].stop * rm2->freq)/(rm1->freq + rm2->freq);
+			rm1->mask[o].start = fz_min(rm1->mask[i1].start, rm2->mask[i2].start);
+			rm1->mask[o].stop = fz_max(rm1->mask[i1].stop, rm2->mask[i2].stop);
+			i1--;
+			i2--;
+		}
+	}
+	rm1->freq += rm2->freq;
+	rm1->len = newlen;
+}
+
+static region_mask *region_masks_match(const region_masks *rms, const region_mask *rm, fz_text_line *line, region_mask *prev_match)
+{
+	int i;
+	float best_score = 9999999;
+	float score;
+	int best = -1;
+	int best_count = 0;
+
+	/* If the 'previous match' matches, use it regardless. */
+	if (prev_match && region_mask_matches(prev_match, rm, &score))
+	{
+		return prev_match;
+	}
+
+	/* Run through and find the 'most compatible' region mask. We are
+	 * guaranteed that there will always be at least one compatible one!
+	 */
+	for (i=0; i < rms->len; i++)
+	{
+		int count = region_mask_matches(rms->mask[i], rm, &score);
+		if (count > best_count || (count == best_count && (score < best_score || best == -1)))
+		{
+			best = i;
+			best_score = score;
+			best_count = count;
+		}
+	}
+	assert(best >= 0 && best < rms->len);
+
+	/* So we have the matching mask. */
+	return rms->mask[best];
+}
+
+#ifdef DEBUG_MASKS
+static void
+dump_region_mask(const region_mask *rm)
+{
+	int j;
+	for (j = 0; j < rm->len; j++)
+	{
+		printf("%g->%g ", rm->mask[j].start, rm->mask[j].stop);
+	}
+	printf("* %d\n", rm->freq);
+}
+
+static void
+dump_region_masks(const region_masks *rms)
+{
+	int i;
+
+	for (i = 0; i < rms->len; i++)
+	{
+		region_mask *rm = rms->mask[i];
+		dump_region_mask(rm);
+	}
+}
+#endif
+
+static void region_masks_add(region_masks *rms, region_mask *rm)
+{
+	/* Add rm to rms */
+	if (rms->len == rms->cap)
+	{
+		int newcap = (rms->cap ? rms->cap * 2 : 4);
+		rms->mask = fz_resize_array(rms->ctx, rms->mask, newcap, sizeof(*rms->mask));
+		rms->cap = newcap;
+	}
+	rms->mask[rms->len] = rm;
+	rms->len++;
+}
+
+static void region_masks_sort(region_masks *rms)
+{
+	int i, j;
+
+	/* First calculate sizes */
+	for (i=0; i < rms->len; i++)
+	{
+		region_mask *rm = rms->mask[i];
+		float size = 0;
+		for (j=0; j < rm->len; j++)
+		{
+			size += rm->mask[j].stop - rm->mask[j].start;
+		}
+		rm->size = size;
+	}
+
+	/* Now, sort on size */
+	/* FIXME: bubble sort - use heapsort for efficiency */
+	for (i=0; i < rms->len-1; i++)
+	{
+		for (j=i+1; j < rms->len; j++)
+		{
+			if (rms->mask[i]->size < rms->mask[j]->size)
+			{
+				region_mask *tmp = rms->mask[i];
+				rms->mask[i] = rms->mask[j];
+				rms->mask[j] = tmp;
+			}
+		}
+	}
+}
+
+static void region_masks_merge(region_masks *rms, region_mask *rm)
+{
+	int i;
+	float best_score = 9999999;
+	float score;
+	int best = -1;
+	int best_count = 0;
+
+#ifdef DEBUG_MASKS
+	printf("\nAdding:\n");
+	dump_region_mask(rm);
+	printf("To:\n");
+	dump_region_masks(rms);
+#endif
+	for (i=0; i < rms->len; i++)
+	{
+		int count = region_masks_mergeable(rms->mask[i], rm, &score);
+		if (count && (score < best_score || best == -1))
+		{
+			best = i;
+			best_count = count;
+			best_score = score;
+		}
+	}
+	if (best != -1)
+	{
+		region_mask_merge(rms->mask[best], rm, best_count);
+#ifdef DEBUG_MASKS
+		printf("Merges to give:\n");
+		dump_region_masks(rms);
+#endif
+		free_region_mask(rm);
+		return;
+	}
+	region_masks_add(rms, rm);
+#ifdef DEBUG_MASKS
+	printf("Adding new one to give:\n");
+	dump_region_masks(rms);
+#endif
+}
+
+static region_mask *
+new_region_mask(fz_context *ctx, const fz_point *blv)
+{
+	region_mask *rm = fz_malloc_struct(ctx, region_mask);
+	rm->ctx = ctx;
+	rm->freq = 1;
+	rm->blv = *blv;
+	rm->cap = 0;
+	rm->len = 0;
+	rm->mask = NULL;
+	return rm;
+}
+
+static void
+region_mask_project(const region_mask *rm, const fz_point *min, const fz_point *max, float *start, float *end)
+{
+	/* We project min and max down onto the blv */
+	float s = min->x * rm->blv.x + min->y * rm->blv.y;
+	float e = max->x * rm->blv.x + max->y * rm->blv.y;
+	if (s > e)
+	{
+		*start = e;
+		*end = s;
+	}
+	else
+	{
+		*start = s;
+		*end = e;
+	}
+}
+
+static void
+region_mask_add(region_mask *rm, const fz_point *min, const fz_point *max)
+{
+	float start, end;
+	int i, j;
+
+	region_mask_project(rm, min, max, &start, &end);
+
+	/* Now add start/end into our region list. Typically we will be adding
+	 * to the end of the region list, so search from there backwards. */
+	for (i = rm->len; i > 0;)
+	{
+		if (start > rm->mask[i-1].stop)
+			break;
+		i--;
+	}
+	/* So we know that our interval can only affect list items >= i.
+	 * We know that start is after our previous end. */
+	if (i == rm->len || end < rm->mask[i].start)
+	{
+		/* Insert new one. No overlap. No merging */
+		if (rm->len == rm->cap)
+		{
+			int newcap = (rm->cap ? rm->cap * 2 : 4);
+			rm->mask = fz_resize_array(rm->ctx, rm->mask, newcap, sizeof(*rm->mask));
+			rm->cap = newcap;
+		}
+		if (rm->len > i)
+			memmove(&rm->mask[i+1], &rm->mask[i], (rm->len - i) * sizeof(*rm->mask));
+		rm->mask[i].ave_start = start;
+		rm->mask[i].ave_stop = end;
+		rm->mask[i].start = start;
+		rm->mask[i].stop = end;
+		rm->len++;
+	}
+	else
+	{
+		/* Extend current one down. */
+		rm->mask[i].ave_start = start;
+		rm->mask[i].start = start;
+		if (rm->mask[i].stop < end)
+		{
+			rm->mask[i].stop = end;
+			rm->mask[i].ave_stop = end;
+			/* Our region may now extend upwards too far */
+			i++;
+			j = i;
+			while (j < rm->len && rm->mask[j].start <= end)
+			{
+				rm->mask[i-1].stop = end = rm->mask[j].stop;
+				j++;
+			}
+			if (i != j)
+			{
+				/* Move everything from j down to i */
+				while (j < rm->len)
+				{
+					rm->mask[i++] = rm->mask[j++];
+				}
+			}
+			rm->len -= j-i;
+		}
+	}
+}
+
+static int
+region_mask_column(region_mask *rm, const fz_point *min, const fz_point *max, int *align, float *colw, float *left_)
+{
+	float start, end, left, right;
+	int i;
+
+	region_mask_project(rm, min, max, &start, &end);
+
+	for (i = 0; i < rm->len; i++)
+	{
+		/* The use of MY_EPSILON here is because we might be matching
+		 * start/end values calculated with slightly different blv's */
+		if (rm->mask[i].start - MY_EPSILON <= start && rm->mask[i].stop + MY_EPSILON >= end)
+			break;
+	}
+	if (i >= rm->len)
+	{
+		*align = 0;
+		*colw = 0;
+		return 0;
+	}
+	left = start - rm->mask[i].start;
+	right = rm->mask[i].stop - end;
+	if (left < 1 && right < 1)
+		*align = rm->mask[i].align;
+	else if (left*2 <= right)
+		*align = 0; /* Left */
+	else if (right * 2 < left)
+		*align = 2; /* Right */
+	else
+		*align = 1;
+	*left_ = left;
+	*colw = rm->mask[i].colw;
+	return i;
+}
+
+static void
+region_mask_alignment(region_mask *rm)
+{
+	int i;
+	float width = 0;
+
+	for (i = 0; i < rm->len; i++)
+	{
+		width += rm->mask[i].stop - rm->mask[i].start;
+	}
+	for (i = 0; i < rm->len; i++)
+	{
+		region *r = &rm->mask[i];
+		float left = r->ave_start - r->start;
+		float right = r->stop - r->ave_stop;
+		if (left*2 <= right)
+			r->align = 0; /* Left */
+		else if (right * 2 < left)
+			r->align = 2; /* Right */
+		else
+			r->align = 1;
+		r->colw = 100 * (rm->mask[i].stop - rm->mask[i].start) / width;
+	}
+}
+
+static void
+region_masks_alignment(region_masks *rms)
+{
+	int i;
+
+	for (i = 0; i < rms->len; i++)
+	{
+		region_mask_alignment(rms->mask[i]);
+	}
+}
+
+static int
+is_unicode_hyphen(int c)
+{
+	/* We omit 0x2011 (Non breaking hyphen) and 0x2043 (Hyphen Bullet)
+	 * from this list. */
+	return (c == '-' ||
+		c == 0x2010 || /* Hyphen */
+		c == 0x002d || /* Hyphen-Minus */
+		c == 0x00ad || /* Soft hyphen */
+		c == 0x058a || /* Armenian Hyphen */
+		c == 0x1400 || /* Canadian Syllabive Hyphen */
+		c == 0x1806); /* Mongolian Todo soft hyphen */
+}
+
+static int
+is_unicode_hyphenatable(int c)
+{
+	/* This is a pretty ad-hoc collection. It may need tuning. */
+	return ((c >= 'A' && c <= 'Z') ||
+		(c >= 'a' && c <= 'z') ||
+		(c >= 0x00c0 && c <= 0x00d6) ||
+		(c >= 0x00d8 && c <= 0x00f6) ||
+		(c >= 0x00f8 && c <= 0x02af) ||
+		(c >= 0x1d00 && c <= 0x1dbf) ||
+		(c >= 0x1e00 && c <= 0x1eff) ||
+		(c >= 0x2c60 && c <= 0x2c7f) ||
+		(c >= 0xa722 && c <= 0xa78e) ||
+		(c >= 0xa790 && c <= 0xa793) ||
+		(c >= 0xa7a8 && c <= 0xa7af) ||
+		(c >= 0xfb00 && c <= 0xfb07) ||
+		(c >= 0xff21 && c <= 0xff3a) ||
+		(c >= 0xff41 && c <= 0xff5a));
+}
+
+static void
+dehyphenate(fz_text_span *s1, fz_text_span *s2)
+{
+	int i;
+
+	for (i = s1->len-1; i > 0; i--)
+		if (!is_unicode_wspace(s1->text[i].c))
+			break;
+	/* Can't leave an empty span. */
+	if (i == 0)
+		return;
+
+	if (!is_unicode_hyphen(s1->text[i].c))
+		return;
+	if (!is_unicode_hyphenatable(s1->text[i-1].c))
+		return;
+	if (!is_unicode_hyphenatable(s2->text[0].c))
+		return;
+	s1->len = i;
+	s2->spacing = 0;
+}
+
+void
+fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
+{
+	fz_text_line *line;
+	fz_text_span *span;
+	line_heights *lh;
+	region_masks *rms;
+	int block_num;
+
+	/* Simple paragraph analysis; look for the most common 'inter line'
+	 * spacing. This will be assumed to be our line spacing. Anything
+	 * more than 25% wider than this will be assumed to be a paragraph
+	 * space. */
+
+	/* Step 1: Gather the line height information */
+	lh = new_line_heights(ctx);
+	for (block_num = 0; block_num < page->len; block_num++)
+	{
+		fz_text_block *block;
+
+		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+			continue;
+		block = page->blocks[block_num].u.text;
+
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			/* For every style in the line, add lineheight to the
+			 * record for that style. FIXME: This is a nasty n^2
+			 * algorithm at the moment. */
+			fz_text_style *style = NULL;
+
+			if (line->distance == 0)
+				continue;
+
+			for (span = line->first_span; span; span = span->next)
+			{
+				int char_num;
+
+				if (is_list_entry(line, span, &char_num))
+					goto list_entry;
+
+				for (; char_num < span->len; char_num++)
+				{
+					fz_text_char *chr = &span->text[char_num];
+
+					/* Ignore any whitespace chars */
+					if (is_unicode_wspace(chr->c))
+						continue;
+
+					if (chr->style != style)
+					{
+						/* Have we had this style before? */
+						int match = 0;
+						fz_text_span *span2;
+						for (span2 = line->first_span; span2; span2 = span2->next)
+						{
+							int char_num2;
+							for (char_num2 = 0; char_num2 < span2->len; char_num2++)
+							{
+								fz_text_char *chr2 = &span2->text[char_num2];
+								if (chr2->style == chr->style)
+								{
+									match = 1;
+									break;
+								}
+							}
+						}
+						if (char_num > 0 && match == 0)
+						{
+							fz_text_span *span2 = span;
+							int char_num2;
+							for (char_num2 = 0; char_num2 < char_num; char_num2++)
+							{
+								fz_text_char *chr2 = &span2->text[char_num2];
+								if (chr2->style == chr->style)
+								{
+									match = 1;
+									break;
+								}
+							}
+						}
+						if (match == 0)
+							insert_line_height(lh, chr->style, line->distance);
+						style = chr->style;
+					}
+				}
+list_entry:
+				{}
+			}
+		}
+	}
+
+	/* Step 2: Find the most popular line height for each style */
+	cull_line_heights(lh);
+
+	/* Step 3: Run through the blocks, breaking each block into two if
+	 * the line height isn't right. */
+	for (block_num = 0; block_num < page->len; block_num++)
+	{
+		int line_num;
+		fz_text_block *block;
+
+		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+			continue;
+		block = page->blocks[block_num].u.text;
+
+		for (line_num = 0; line_num < block->len; line_num++)
+		{
+			/* For every style in the line, check to see if lineheight
+			 * is correct for that style. FIXME: We check each style
+			 * more than once, currently. */
+			int ok = 0; /* -1 = early exit, split now. 0 = split. 1 = don't split. */
+			fz_text_style *style = NULL;
+			line = &block->lines[line_num];
+
+			if (line->distance == 0)
+				continue;
+
+#ifdef DEBUG_LINE_HEIGHTS
+			printf("line height=%g nspans=%d\n", line->distance, line->len);
+#endif
+			for (span = line->first_span; span; span = span->next)
+			{
+				int char_num;
+
+				if (is_list_entry(line, span, &char_num))
+					goto force_paragraph;
+
+				/* Now we do the rest of the line */
+				for (; char_num < span->len; char_num++)
+				{
+					fz_text_char *chr = &span->text[char_num];
+
+					/* Ignore any whitespace chars */
+					if (is_unicode_wspace(chr->c))
+						continue;
+
+					if (chr->style != style)
+					{
+						float proper_step = line_height_for_style(lh, chr->style);
+						if (proper_step * 0.95 <= line->distance && line->distance <= proper_step * 1.05)
+						{
+							ok = 1;
+							break;
+						}
+						style = chr->style;
+					}
+				}
+				if (ok)
+					break;
+			}
+			if (!ok)
+			{
+force_paragraph:
+				split_block(ctx, page, block_num, line_num);
+				break;
+			}
+		}
+	}
+	free_line_heights(lh);
+
+	/* Simple line region analysis:
+	 * For each line:
+	 *    form a list of 'start/stop' points (henceforth a 'region mask')
+	 *    find the normalised baseline vector for the line.
+	 *    Store the region mask and baseline vector.
+	 * Collate lines that have compatible region masks and identical
+	 * baseline vectors.
+	 * If the collated masks are column-like, then split into columns.
+	 * Otherwise split into tables.
+	 */
+	rms = new_region_masks(ctx);
+
+	/* Step 1: Form the region masks and store them into a list with the
+	 * normalised baseline vectors. */
+	for (block_num = 0; block_num < page->len; block_num++)
+	{
+		fz_text_block *block;
+
+		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+			continue;
+		block = page->blocks[block_num].u.text;
+
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			fz_point blv;
+			region_mask *rm;
+
+#ifdef DEBUG_MASKS
+			printf("Line: ");
+			dump_line(line);
+#endif
+			blv = line->first_span->max;
+			blv.x -= line->first_span->min.x;
+			blv.y -= line->first_span->min.y;
+			fz_normalize_vector(&blv);
+
+			rm = new_region_mask(ctx, &blv);
+			for (span = line->first_span; span; span = span->next)
+			{
+				fz_point *region_min = &span->min;
+				fz_point *region_max = &span->max;
+
+				/* Treat adjacent spans as one big region */
+				while (span->next && span->next->spacing < 1.5)
+				{
+					span = span->next;
+					region_max = &span->max;
+				}
+
+				region_mask_add(rm, region_min, region_max);
+			}
+#ifdef DEBUG_MASKS
+			dump_region_mask(rm);
+#endif
+			region_masks_add(rms, rm);
+		}
+	}
+
+	/* Step 2: Sort the region_masks by size of masked region */
+	region_masks_sort(rms);
+
+#ifdef DEBUG_MASKS
+	printf("Sorted list of regions:\n");
+	dump_region_masks(rms);
+#endif
+	/* Step 3: Merge the region masks where possible (large ones first) */
+	{
+		int i;
+		region_masks *rms2;
+		rms2 = new_region_masks(ctx);
+		for (i=0; i < rms->len; i++)
+		{
+			region_mask *rm = rms->mask[i];
+			rms->mask[i] = NULL;
+			region_masks_merge(rms2, rm);
+		}
+		free_region_masks(rms);
+		rms = rms2;
+	}
+
+#ifdef DEBUG_MASKS
+	printf("Merged list of regions:\n");
+	dump_region_masks(rms);
+#endif
+
+	/* Step 4: Figure out alignment */
+	region_masks_alignment(rms);
+
+	/* Step 5: At this point, we should probably look at the region masks
+	 * to try to guess which ones represent columns on the page. With our
+	 * current code, we could only get blocks of lines that span 2 or more
+	 * columns if the PDF producer wrote text out horizontally across 2
+	 * or more columns, and we've never seen that (yet!). So we skip this
+	 * step for now. */
+
+	/* Step 6: Run through the lines again, deciding which ones fit into
+	 * which region mask. */
+	{
+	region_mask *prev_match = NULL;
+	for (block_num = 0; block_num < page->len; block_num++)
+	{
+		fz_text_block *block;
+
+		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+			continue;
+		block = page->blocks[block_num].u.text;
+
+		for (line = block->lines; line < block->lines + block->len; line++)
+		{
+			fz_point blv;
+			region_mask *rm;
+			region_mask *match;
+
+			blv = line->first_span->max;
+			blv.x -= line->first_span->min.x;
+			blv.y -= line->first_span->min.y;
+			fz_normalize_vector(&blv);
+
+#ifdef DEBUG_MASKS
+			dump_line(line);
+#endif
+			rm = new_region_mask(ctx, &blv);
+			for (span = line->first_span; span; span = span->next)
+			{
+				fz_point *region_min = &span->min;
+				fz_point *region_max = &span->max;
+
+				/* Treat adjacent spans as one big region */
+				while (span->next && span->next->spacing < 1.5)
+				{
+					span = span->next;
+					region_max = &span->max;
+				}
+
+				region_mask_add(rm, region_min, region_max);
+			}
+#ifdef DEBUG_MASKS
+			printf("Mask: ");
+			dump_region_mask(rm);
+#endif
+			match = region_masks_match(rms, rm, line, prev_match);
+			prev_match = match;
+#ifdef DEBUG_MASKS
+			printf("Matches: ");
+			dump_region_mask(match);
+#endif
+			free_region_mask(rm);
+			span = line->first_span;
+			while (span)
+			{
+				fz_point *region_min = &span->min;
+				fz_point *region_max = &span->max;
+				fz_text_span *sn;
+				int col, align;
+				float colw, left;
+
+				/* Treat adjacent spans as one big region */
+#ifdef DEBUG_ALIGN
+				dump_span(span);
+#endif
+				for (sn = span->next; sn && sn->spacing < 1.5; sn = sn->next)
+				{
+					region_max = &sn->max;
+#ifdef DEBUG_ALIGN
+					dump_span(sn);
+#endif
+				}
+				col = region_mask_column(match, region_min, region_max, &align, &colw, &left);
+#ifdef DEBUG_ALIGN
+				printf(" = col%d colw=%g align=%d\n", col, colw, align);
+#endif
+				do
+				{
+					span->column = col;
+					span->align = align;
+					span->indent = left;
+					span->column_width = colw;
+					span = span->next;
+				}
+				while (span != sn);
+
+				if (span)
+					span = span->next;
+			}
+			line->region = match;
+		}
+	}
+	free_region_masks(rms);
+	}
+
+	/* Step 7: Collate lines within a block that share the same region
+	 * mask. */
+	for (block_num = 0; block_num < page->len; block_num++)
+	{
+		int line_num;
+		int prev_line_num;
+
+		fz_text_block *block;
+
+		if (page->blocks[block_num].type != FZ_PAGE_BLOCK_TEXT)
+			continue;
+		block = page->blocks[block_num].u.text;
+
+		/* First merge lines. This may leave empty lines behind. */
+		for (prev_line_num = 0, line_num = 1; line_num < block->len; line_num++)
+		{
+			fz_text_line *prev_line;
+			line = &block->lines[line_num];
+			if (!line->first_span)
+				continue;
+			prev_line = &block->lines[prev_line_num];
+			if (prev_line->region == line->region)
+			{
+				/* We only merge lines if the second line
+				 * only uses 1 of the columns. */
+				int col = line->first_span->column;
+				/* Copy the left value for the first span
+				 * in the first column in this line forward
+				 * for all the rest of the spans in the same
+				 * column. */
+				float indent = line->first_span->indent;
+				for (span = line->first_span->next; span; span = span->next)
+				{
+					if (col != span->column)
+						break;
+					span->indent = indent;
+				}
+				if (span)
+				{
+					prev_line_num = line_num;
+					continue;
+				}
+
+				/* Merge line into prev_line */
+				{
+					fz_text_span **prev_line_span = &prev_line->first_span;
+					int try_dehyphen = -1;
+					fz_text_span *prev_span = NULL;
+					span = line->first_span;
+					while (span)
+					{
+						/* Skip forwards through the original
+						 * line, until we find a place where
+						 * span should go. */
+						if ((*prev_line_span)->column <= span->column)
+						{
+							/* The current span we are considering
+							 * in prev_line is earlier than span.
+							 * Just skip forwards in prev_line. */
+							prev_span = (*prev_line_span);
+							prev_line_span = &prev_span->next;
+							try_dehyphen = span->column;
+						}
+						else
+						{
+							/* We want to copy span into prev_line. */
+							fz_text_span *next = (*prev_line_span)->next;
+
+							if (prev_line_span == &prev_line->first_span)
+								prev_line->first_span = span;
+							if (next == NULL)
+								prev_line->last_span = span;
+							if (try_dehyphen == span->column)
+								dehyphenate(prev_span, span);
+							try_dehyphen = -1;
+							prev_span = *prev_line_span = span;
+							span = span->next;
+							(*prev_line_span)->next = next;
+							prev_line_span = &span->next;
+						}
+					}
+					while (span || *prev_line_span);
+					line->first_span = NULL;
+					line->last_span = NULL;
+				}
+			}
+			else
+				prev_line_num = line_num;
+		}
+
+		/* Now get rid of the empty lines */
+		for (prev_line_num = 0, line_num = 0; line_num < block->len; line_num++)
+		{
+			line = &block->lines[line_num];
+			if (line->first_span)
+				block->lines[prev_line_num++] = *line;
+		}
+		block->len = prev_line_num;
+
+		/* Now try to spot indents */
+		for (line_num = 0; line_num < block->len; line_num++)
+		{
+			fz_text_span *span_num, *sn;
+			int col, count;
+			line = &block->lines[line_num];
+
+			/* Run through the spans... */
+			span_num = line->first_span;
+			{
+				float indent = 0;
+				/* For each set of spans that share the same
+				 * column... */
+				col = span_num->column;
+#ifdef DEBUG_INDENTS
+				printf("Indent %g: ", span_num->indent);
+				dump_span(span_num);
+				printf("\n");
+#endif
+
+				/* find the average indent of all but the first.. */
+				for (sn = span_num->next, count = 0; sn && sn->column == col; sn = sn->next, count++)
+				{
+#ifdef DEBUG_INDENTS
+					printf("Indent %g: ", sn->indent);
+					dump_span(sn);
+				printf("\n");
+#endif
+					indent += sn->indent;
+					sn->indent = 0;
+				}
+				if (sn != span_num->next)
+					indent /= count;
+
+				/* And compare this indent with the first one... */
+#ifdef DEBUG_INDENTS
+				printf("Average indent %g ", indent);
+#endif
+				indent -= span_num->indent;
+#ifdef DEBUG_INDENTS
+				printf("delta %g ", indent);
+#endif
+				if (fabsf(indent) < 1)
+				{
+					/* No indent worth speaking of */
+					indent = 0;
+				}
+#ifdef DEBUG_INDENTS
+				printf("recorded %g\n", indent);
+#endif
+				span_num->indent = indent;
+				span_num = sn;
+			}
+			for (; span_num; span_num = span_num->next)
+			{
+				span_num->indent = 0;
+			}
+		}
+	}
+}
author	Tor Andersson <tor.andersson@artifex.com>	2013-06-19 15:29:44 +0200
committer	Tor Andersson <tor.andersson@artifex.com>	2013-06-20 16:45:35 +0200
commit	0a927854a10e1e6b9770a81e2e1d9f3093631757 (patch)
tree	3d65d820d9fdba2d0d394d99c36290c851b78ca0 /source/fitz/stext-paragraph.c
parent	1ae8f19179c5f0f8c6352b3c7855465325d5449a (diff)
download	mupdf-0a927854a10e1e6b9770a81e2e1d9f3093631757.tar.xz