summaryrefslogtreecommitdiff
path: root/source/html
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2016-02-23 22:02:24 +0100
committerTor Andersson <tor.andersson@artifex.com>2016-02-24 13:25:27 +0100
commita76adb3dce6360d08aa5eeb7a23da50c3af81c10 (patch)
treeea65ffeee18874d57eee411ad3ebb7559cb2a1a5 /source/html
parent67c252d77a1665164c3b3d217d556f19ef879a99 (diff)
downloadmupdf-a76adb3dce6360d08aa5eeb7a23da50c3af81c10.tar.xz
html: Rewrite handling of whitespace collapsing.
Diffstat (limited to 'source/html')
-rw-r--r--source/html/html-layout.c148
1 files changed, 101 insertions, 47 deletions
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index f4a0024a..c738ff21 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -65,6 +65,18 @@ static int iswhite(int c)
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
+static int is_all_white(const char *s)
+{
+ while (*s)
+ {
+ if (!iswhite(*s))
+ return 0;
+ ++s;
+ }
+ return 1;
+}
+
+
/* TODO: pool allocator for flow nodes */
/* TODO: store text by pointing to a giant buffer */
@@ -94,10 +106,10 @@ static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html *top, fz_c
return flow;
}
-static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_style *style, int expand)
+static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_style *style)
{
fz_html_flow *flow = add_flow(ctx, pool, top, style, FLOW_SPACE);
- flow->expand = !!expand;
+ flow->expand = 1;
}
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_style *style)
@@ -196,7 +208,24 @@ static int not_at_eol(int cat, int c)
return 0;
}
-static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const char *text)
+static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_css_style *style, int *emit_white, int *at_bol)
+{
+ static const char *space = " ";
+ int bsp = style->white_space & WS_ALLOW_BREAK_SPACE;
+ if (*emit_white)
+ {
+ if (!*at_bol)
+ {
+ if (bsp)
+ add_flow_space(ctx, pool, flow, style);
+ else
+ add_flow_word(ctx, pool, flow, style, space, space+1);
+ }
+ *emit_white = 0;
+ }
+}
+
+static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const char *text, int *emit_white, int *at_bol)
{
fz_html *flow;
@@ -204,6 +233,8 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
int bsp = box->style.white_space & WS_ALLOW_BREAK_SPACE;
int bnl = box->style.white_space & WS_FORCE_BREAK_NEWLINE;
+ static const char *space = " ";
+
flow = box;
while (flow->type != BOX_FLOW)
flow = flow->up;
@@ -217,23 +248,37 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
else
text += 1;
add_flow_break(ctx, pool, flow, &box->style);
+ *at_bol = 1;
}
else if (iswhite(*text))
{
- const char *mark = text++;
if (collapse)
- while (iswhite(*text))
- ++text;
- /* TODO: tabs */
- if (bsp)
- add_flow_space(ctx, pool, flow, &box->style, 1);
+ {
+ if (bnl)
+ while (*text == ' ' || *text == '\t')
+ ++text;
+ else
+ while (iswhite(*text))
+ ++text;
+ *emit_white = 1;
+ }
else
- add_flow_word(ctx, pool, flow, &box->style, mark, text);
+ {
+ // TODO: tabs
+ if (bsp)
+ add_flow_space(ctx, pool, flow, &box->style);
+ else
+ add_flow_word(ctx, pool, flow, &box->style, space, space+1);
+ ++text;
+ }
}
else
{
const char *prev, *mark = text;
int c, addsbreak = 0;
+
+ flush_space(ctx, pool, flow, &box->style, emit_white, at_bol);
+
while (*text && !iswhite(*text))
{
/* TODO: Unicode Line Breaking Algorithm (UAX #14) */
@@ -263,11 +308,13 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
}
if (mark != text)
add_flow_word(ctx, pool, flow, &box->style, mark, text);
+
+ *at_bol = 0;
}
}
}
-static void generate_image(fz_context *ctx, fz_pool *pool, fz_archive *zip, const char *base_uri, fz_html *box, const char *src)
+static void generate_image(fz_context *ctx, fz_pool *pool, fz_archive *zip, const char *base_uri, fz_html *box, const char *src, int *emit_white, int *at_bol)
{
fz_image *img = NULL;
fz_buffer *buf = NULL;
@@ -286,6 +333,8 @@ static void generate_image(fz_context *ctx, fz_pool *pool, fz_archive *zip, cons
fz_var(buf);
fz_var(img);
+ flush_space(ctx, pool, flow, &box->style, emit_white, at_bol);
+
fz_try(ctx)
{
buf = fz_read_archive_entry(ctx, zip, path);
@@ -305,6 +354,8 @@ static void generate_image(fz_context *ctx, fz_pool *pool, fz_archive *zip, cons
fz_warn(ctx, "html: cannot add image src='%s'", src);
add_flow_word(ctx, pool, flow, &box->style, alt, alt + 7);
}
+
+ *at_bol = 0;
}
static void init_box(fz_context *ctx, fz_html *box)
@@ -411,7 +462,7 @@ static fz_html *insert_break_box(fz_context *ctx, fz_html *box, fz_html *top)
return top;
}
-static void insert_inline_box(fz_context *ctx, fz_pool *pool, fz_html *box, fz_html *top)
+static void insert_inline_box(fz_context *ctx, fz_pool *pool, fz_html *box, fz_html *top, int *at_bol)
{
if (top->type == BOX_BLOCK)
{
@@ -425,6 +476,7 @@ static void insert_inline_box(fz_context *ctx, fz_pool *pool, fz_html *box, fz_h
flow->is_first_flow = !top->last;
insert_box(ctx, flow, BOX_FLOW, top);
insert_box(ctx, box, BOX_INLINE, flow);
+ *at_bol = 1;
}
}
else if (top->type == BOX_FLOW)
@@ -439,7 +491,8 @@ static void insert_inline_box(fz_context *ctx, fz_pool *pool, fz_html *box, fz_h
static void generate_boxes(fz_context *ctx, fz_pool *pool,
fz_html_font_set *set, fz_archive *zip, const char *base_uri,
- fz_xml *node, fz_html *top, fz_css_rule *rule, fz_css_match *up_match, int list_counter)
+ fz_xml *node, fz_html *top, fz_css_rule *rule, fz_css_match *up_match, int list_counter,
+ int *emit_white, int *at_bol)
{
fz_css_match match;
fz_html *box;
@@ -473,6 +526,7 @@ static void generate_boxes(fz_context *ctx, fz_pool *pool,
fz_apply_css_style(ctx, set, &box->style, &match);
top = insert_break_box(ctx, box, top);
}
+ *at_bol = 1;
}
else if (!strcmp(tag, "img"))
@@ -482,8 +536,8 @@ static void generate_boxes(fz_context *ctx, fz_pool *pool,
{
box = new_box(ctx, pool);
fz_apply_css_style(ctx, set, &box->style, &match);
- insert_inline_box(ctx, pool, box, top);
- generate_image(ctx, pool, zip, base_uri, box, src);
+ insert_inline_box(ctx, pool, box, top, at_bol);
+ generate_image(ctx, pool, zip, base_uri, box, src, emit_white, at_bol);
}
}
@@ -503,7 +557,7 @@ static void generate_boxes(fz_context *ctx, fz_pool *pool,
}
else if (display == DIS_INLINE)
{
- insert_inline_box(ctx, pool, box, top);
+ insert_inline_box(ctx, pool, box, top, at_bol);
}
else
{
@@ -516,28 +570,35 @@ static void generate_boxes(fz_context *ctx, fz_pool *pool,
int child_counter = list_counter;
if (!strcmp(tag, "ul") || !strcmp(tag, "ol"))
child_counter = 0;
- generate_boxes(ctx, pool, set, zip, base_uri, fz_xml_down(node), box, rule, &match, child_counter);
+ generate_boxes(ctx, pool, set, zip, base_uri, fz_xml_down(node), box, rule, &match, child_counter, emit_white, at_bol);
}
-
- // TODO: remove empty flow boxes
}
}
else
{
- if (top->type != BOX_INLINE)
+ const char *text = fz_xml_text(node);
+ int collapse = top->style.white_space & WS_COLLAPSE;
+ if (collapse && is_all_white(text))
{
- /* Create anonymous inline box, with the same style as the top block box. */
- box = new_box(ctx, pool);
- insert_inline_box(ctx, pool, box, top);
- box->style = top->style;
- /* Make sure not to recursively multiply font sizes. */
- box->style.font_size.value = 1;
- box->style.font_size.unit = N_SCALE;
- generate_text(ctx, pool, box, fz_xml_text(node));
+ *emit_white = 1;
}
else
{
- generate_text(ctx, pool, top, fz_xml_text(node));
+ if (top->type != BOX_INLINE)
+ {
+ /* Create anonymous inline box, with the same style as the top block box. */
+ box = new_box(ctx, pool);
+ insert_inline_box(ctx, pool, box, top, at_bol);
+ box->style = top->style;
+ /* Make sure not to recursively multiply font sizes. */
+ box->style.font_size.value = 1;
+ box->style.font_size.unit = N_SCALE;
+ generate_text(ctx, pool, box, text, emit_white, at_bol);
+ }
+ else
+ {
+ generate_text(ctx, pool, top, text, emit_white, at_bol);
+ }
}
}
@@ -894,12 +955,7 @@ static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, f
}
}
- /* start by skipping whitespace and (soft) breaks at the beginning of tags */
node = box->flow_head;
- if (node->type == FLOW_BREAK)
- node = node->next;
- while (node && (node->type == FLOW_SPACE || node->type == FLOW_SBREAK))
- node = node->next;
candidate = NULL;
candidate_w = 0;
@@ -1372,14 +1428,7 @@ static fz_html_flow *find_list_mark_anchor(fz_context *ctx, fz_html *box)
while (box)
{
if (box->type == BOX_FLOW)
- {
- fz_html_flow *flow = box->flow_head;
- if (flow && flow->type == FLOW_BREAK)
- flow = flow->next;
- while (flow && (flow->type == FLOW_SPACE || flow->type == FLOW_SBREAK))
- flow = flow->next;
- return flow;
- }
+ return box->flow_head;
box = box->down;
}
return NULL;
@@ -1655,10 +1704,10 @@ fz_print_html_flow(fz_context *ctx, fz_html_flow *flow, fz_html_flow *end)
switch (flow->type)
{
case FLOW_WORD: printf("%s", flow->content.text); break;
- case FLOW_SPACE: if (!flow->breaks_line) printf(" "); break;
- case FLOW_SBREAK: if (!flow->breaks_line) printf("\\n"); break;
- case FLOW_BREAK: printf("\\n"); break;
- case FLOW_IMAGE: printf("[image]"); break;
+ case FLOW_SPACE: printf("_"); break;
+ case FLOW_SBREAK: printf("%%"); break;
+ case FLOW_BREAK: printf("!"); break;
+ case FLOW_IMAGE: printf("<img>"); break;
}
flow = flow->next;
}
@@ -1678,6 +1727,9 @@ fz_print_html(fz_context *ctx, fz_html *box, int pstyle, int level)
case BOX_INLINE: printf("inline"); break;
}
+ if (box->list_item)
+ printf(" list=%d", box->list_item);
+
if (box->down || box->flow_head)
printf(" {\n");
else
@@ -1944,6 +1996,8 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha
fz_css_rule *css;
fz_css_match match;
fz_html *box;
+ int emit_white = 0;
+ int at_bol = 1;
xml = fz_parse_xml(ctx, buf->data, buf->len, 1);
@@ -1966,7 +2020,7 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha
fz_match_css_at_page(ctx, &match, css);
fz_apply_css_style(ctx, set, &box->style, &match);
- generate_boxes(ctx, pool, set, zip, base_uri, xml, box, css, &match, 0);
+ generate_boxes(ctx, pool, set, zip, base_uri, xml, box, css, &match, 0, &emit_white, &at_bol);
fz_drop_css(ctx, css);
fz_drop_xml(ctx, xml);