summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2016-03-08 17:27:25 +0100
committerTor Andersson <tor.andersson@artifex.com>2016-03-14 17:08:29 +0100
commit9f737805b3ce0bfe2ed9a5d64e3962d95faca0fd (patch)
tree7c4fcf75daccf86e9f03ae17be78d5f4b0e3a0cb
parent3ab2ad89c00326dcbaf33749125432e8691b68a0 (diff)
downloadmupdf-9f737805b3ce0bfe2ed9a5d64e3962d95faca0fd.tar.xz
html: Use pair break table from UAX #14 to add soft breaks.
Note: does not handle the case where certain classes of characters prohibit breaking at spaces between them.
-rw-r--r--source/html/html-layout.c128
1 files changed, 70 insertions, 58 deletions
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index c274460b..99e0632a 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -69,6 +69,7 @@ struct genstate
fz_css_rule *css;
int at_bol;
int emit_white;
+ int last_brk_cls;
};
static int iswhite(int c)
@@ -177,46 +178,6 @@ static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *fl
return new_flow;
}
-static int iscjk(int c)
-{
- if (c >= 0x3000 && c <= 0x303F) return 1; /* CJK Symbols and Punctuation */
- if (c >= 0x3200 && c <= 0x9FFF) return 1; /* CJK Blocks */
- if (c >= 0xFF00 && c <= 0xFFEF) return 1; /* Halfwidth and Fullwidth Forms */
- return 0;
-}
-
-static int not_at_bol(int cat, int c)
-{
- if (cat == UCDN_GENERAL_CATEGORY_PF) return 1;
- if (cat == UCDN_GENERAL_CATEGORY_PE) return 1;
- if (c == ')' || c == 0xFF09) return 1;
- if (c == ']' || c == 0xFF3D) return 1;
- if (c == '}' || c == 0xFF5D) return 1;
- if (c == '>' || c == 0xFF1E) return 1;
- if (c == ',' || c == 0xFF0C || c == 0x3001) return 1;
- if (c == '.' || c == 0xFF0E || c == 0x3002) return 1;
- if (c == ':' || c == 0xFF1A) return 1;
- if (c == ';' || c == 0xFF1B) return 1;
- if (c == '?' || c == 0xFF1F) return 1;
- if (c == '!' || c == 0xFF01) return 1;
- if (c == '%' || c == 0xFF05) return 1;
- return 0;
-}
-
-static int not_at_eol(int cat, int c)
-{
- if (cat == UCDN_GENERAL_CATEGORY_PI) return 1;
- if (cat == UCDN_GENERAL_CATEGORY_PS) return 1;
- if (c == '(' || c == 0xFF08) return 1;
- if (c == '[' || c == 0xFF3B) return 1;
- if (c == '{' || c == 0xFF5B) return 1;
- if (c == '<' || c == 0xFF1C) return 1;
- if (c == '$' || c == 0xFF04) return 1;
- if (c >= 0xFFE0 || c == 0xFFE1) return 1; /* cent, pound */
- if (c == 0xFFE5 || c == 0xFFE6) return 1; /* yen, won */
- return 0;
-}
-
static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_css_style *style, struct genstate *g)
{
static const char *space = " ";
@@ -234,6 +195,42 @@ static void flush_space(fz_context *ctx, fz_pool *pool, fz_html *flow, fz_css_st
}
}
+/* pair-wise lookup table for UAX#14 linebreaks */
+static const char *pairbrk[29] =
+{
+/* OCCQGNESIPPNAHIIHBBBZCWHHJJJR */
+/* PLPULSXYSROULLDNYAB2WMJ23LVTI */
+ "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
+ "_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
+ "_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
+ "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
+ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
+ "_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
+ "_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
+ "_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
+ "_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
+ "%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
+ "%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
+ "%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
+ "%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
+ "%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
+ "_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
+ "_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
+ "_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
+ "_^^%_%^^^_______%%__^^^______", /* BA break after */
+ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
+ "_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
+ "____________________^________", /* ZW zero width space */
+ "%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
+ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
+ "_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
+ "_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/voewl/trailing syllable */
+ "_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
+ "_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
+ "_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
+ "_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
+};
+
static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const char *text, struct genstate *g)
{
fz_html *flow;
@@ -280,17 +277,20 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
add_flow_word(ctx, pool, flow, &box->style, space, space+1);
++text;
}
+ g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
}
else
{
const char *prev, *mark = text;
- int c, addsbreak = 0;
+ int c;
flush_space(ctx, pool, flow, &box->style, g);
+ if (g->at_bol)
+ g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;
+
while (*text && !iswhite(*text))
{
- /* TODO: Unicode Line Breaking Algorithm (UAX #14) */
prev = text;
text += fz_chartorune(&c, text);
if (c == 0xAD) /* soft hyphen */
@@ -299,20 +299,30 @@ static void generate_text(fz_context *ctx, fz_pool *pool, fz_html *box, const ch
add_flow_word(ctx, pool, flow, &box->style, mark, prev);
add_flow_shyphen(ctx, pool, flow, &box->style);
mark = text;
+ g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
}
- else if (iscjk(c))
+ else if (bsp) /* allow soft breaks */
{
- int cat = ucdn_get_general_category(c);
- if (addsbreak && !not_at_bol(cat, c))
- add_flow_sbreak(ctx, pool, flow, &box->style);
- add_flow_word(ctx, pool, flow, &box->style, mark, text);
- if (!not_at_eol(cat, c))
- addsbreak = 1;
- mark = text;
- }
- else
- {
- addsbreak = 0;
+ int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
+ if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
+ {
+ int brk = pairbrk[g->last_brk_cls][this_brk_cls];
+
+ /* we handle spaces elsewhere, so ignore these classes */
+ if (brk == '@') brk = '^';
+ if (brk == '#') brk = '^';
+ if (brk == '%') brk = '^';
+
+ if (brk == '_')
+ {
+ if (mark != prev)
+ add_flow_word(ctx, pool, flow, &box->style, mark, prev);
+ add_flow_sbreak(ctx, pool, flow, &box->style);
+ mark = prev;
+ }
+
+ g->last_brk_cls = this_brk_cls;
+ }
}
}
if (mark != text)
@@ -1025,7 +1035,8 @@ static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, f
/* If the broken node fits, remember it. */
/* Also remember it if we have no other candidate and need to break in desperation. */
- if (line_w + break_w <= box->w || !candidate) {
+ if (line_w + break_w <= box->w || !candidate)
+ {
candidate = node;
candidate_w = line_w + break_w;
}
@@ -1746,9 +1757,10 @@ fz_print_html_flow(fz_context *ctx, fz_html_flow *flow, fz_html_flow *end)
switch (flow->type)
{
case FLOW_WORD: printf("%s", flow->content.text); break;
- case FLOW_SPACE: printf("_"); break;
- case FLOW_SBREAK: printf("%%"); break;
- case FLOW_BREAK: printf("!"); break;
+ case FLOW_SPACE: printf("[ ]"); break;
+ case FLOW_SBREAK: printf("[%%]"); break;
+ case FLOW_SHYPHEN: printf("[-]"); break;
+ case FLOW_BREAK: printf("[!]"); break;
case FLOW_IMAGE: printf("<img>"); break;
}
flow = flow->next;