diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2015-09-30 14:36:59 +0200 |
---|---|---|
committer | Robin Watts <robin.watts@artifex.com> | 2015-10-02 14:03:47 +0100 |
commit | 0dda0e13f481d1243fb88a576fc41fe9a7deac62 (patch) | |
tree | a45a7bc288aa3f28f5d6ff6f3d8867c6c4ce0b07 /source | |
parent | 6354c1a1f08d838d30019ba401367cebdc1a0f54 (diff) | |
download | mupdf-0dda0e13f481d1243fb88a576fc41fe9a7deac62.tar.xz |
epub: Use CJK line breaking rules.
A far cry from the full Unicode Line Breaking Algorithm, but it solves
the main issues with CJK line breaking.
Diffstat (limited to 'source')
-rw-r--r-- | source/html/html-layout.c | 64 |
1 files changed, 61 insertions, 3 deletions
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index 717bb1ea..3ee6808b 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -113,6 +113,45 @@ static void add_flow_image(fz_context *ctx, fz_html *top, fz_css_style *style, f
 	add_flow_glue(ctx, top, style, "", 0);
 }
 
+static int iscjk(int c)
+{
+	if (c >= 0x3200 && c <= 0x9FFF) return 1; /* CJK Blocks */
+	if (c >= 0xFF00 && c <= 0xFFEF) return 1; /* Halfwidth and Fullwidth Forms */
+	return 0;
+}
+
+static int not_at_bol(int cat, int c)
+{
+	if (cat == UCDN_GENERAL_CATEGORY_PF) return 1;
+	if (cat == UCDN_GENERAL_CATEGORY_PE) return 1;
+	if (c == ')' || c == 0xFF09) return 1;
+	if (c == ']' || c == 0xFF3D) return 1;
+	if (c == '}' || c == 0xFF5D) return 1;
+	if (c == '>' || c == 0xFF1E) return 1;
+	if (c == ',' || c == 0xFF0C) return 1;
+	if (c == '.' || c == 0xFF0E) return 1;
+	if (c == ':' || c == 0xFF1A) return 1;
+	if (c == ';' || c == 0xFF1B) return 1;
+	if (c == '?' || c == 0xFF1F) return 1;
+	if (c == '!' || c == 0xFF01) return 1;
+	if (c == '%' || c == 0xFF05) return 1;
+	return 0;
+}
+
+static int not_at_eol(int cat, int c)
+{
+	if (cat == UCDN_GENERAL_CATEGORY_PI) return 1;
+	if (cat == UCDN_GENERAL_CATEGORY_PS) return 1;
+	if (c == '(' || c == 0xFF08) return 1;
+	if (c == '[' || c == 0xFF3B) return 1;
+	if (c == '{' || c == 0xFF5B) return 1;
+	if (c == '<' || c == 0xFF1C) return 1;
+	if (c == '$' || c == 0xFF04) return 1;
+	if (c == 0xFFE0 || c == 0xFFE1) return 1; /* fullwidth cent, pound (exact match, not a range) */
+	if (c == 0xFFE5 || c == 0xFFE6) return 1; /* fullwidth yen, won */
+	return 0;
+}
+
 static void generate_text(fz_context *ctx, fz_html *box, const char *text)
 {
 	fz_html *flow;
@@ -149,10 +188,29 @@ static void generate_text(fz_context *ctx, fz_html *box, const char *text)
 		}
 		else
 		{
-			const char *mark = text++;
+			const char *mark = text;
+			int c, addglue = 0;
 			while (*text && !iswhite(*text))
-				++text;
-			add_flow_word(ctx, flow, &box->style, mark, text);
+			{
+				/* TODO: Unicode Line Breaking Algorithm (UAX #14) */
+				text += fz_chartorune(&c, text);
+				if (iscjk(c))
+				{
+					int cat = ucdn_get_general_category(c);
+					if (addglue && !not_at_bol(cat, c))
+						add_flow_glue(ctx, flow, &box->style, "", 0);
+					add_flow_word(ctx, flow, &box->style, mark, text);
+					if (!not_at_eol(cat, c))
+						addglue = 1;
+					mark = text;
+				}
+				else
+				{
+					addglue = 0;
+				}
+			}
+			if (mark != text)
+				add_flow_word(ctx, flow, &box->style, mark, text);
 		}
 	}
 }