summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2016-01-18 16:31:52 +0000
committerRobin Watts <robin.watts@artifex.com>2016-01-18 19:25:12 +0000
commit5e608c0649ece27029484f388c672bed98af6e34 (patch)
treef702ff385c5c1804b215a288b8d9f8215acc2f9d /source
parent8876142a36c76d242f5a1a73dd66fa0430847ddd (diff)
downloadmupdf-5e608c0649ece27029484f388c672bed98af6e34.tar.xz
Process HTML text for directionality.
After we parse HTML text from an ebook, run it through the Unicode bidirectional algorithm to determine the directionality of each fragment. This may require splitting fragments. Currently we don't do anything with this information.
Diffstat (limited to 'source')
-rw-r--r--source/fitz/bidi.c222
-rw-r--r--source/html/html-layout.c195
2 files changed, 199 insertions, 218 deletions
diff --git a/source/fitz/bidi.c b/source/fitz/bidi.c
index a3bbbc1d..59d1578a 100644
--- a/source/fitz/bidi.c
+++ b/source/fitz/bidi.c
@@ -35,7 +35,6 @@
#include "mupdf/fitz.h"
#include "bidi-impl.h" /* standard bidi code interface */
-#include "bidi-internal.h"
/*
* Macros...
@@ -548,7 +547,6 @@ Bidi_CharType Bidi_classFromChN(uint16_t ch)
}
-static
int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len)
{
const uint16_t *end = str + len;
@@ -650,32 +648,6 @@ uint16_t Bidi_mirrorChar(const uint16_t u)
return UNICODE_EOS;
}
-#ifdef BIDI_LINE_AT_A_TIME
-/** Subsitutes characters with their mirrored equivalents, e.g. swaps
- * '(' with ')'. Implements rule L4 of the uint16_t Bidirectional
- * algorithm. Mirrored characters are found using a binary search through
- * the lookup table bidiOptMirrorList.
- */
-static
-void Bidi_mirror(uint16_t *text,
- const Bidi_Level *poslevel,
- int len)
-{
- int i;
- uint16_t out;
-
- for (i = 0; i < len; i ++)
- {
- if (ODD(poslevel[i]) == 0)
- continue;
- out = Bidi_mirrorChar((const uint16_t)text[i]);
- if(out != UNICODE_EOS)
- text[i] = out;
- }
-}
-#endif
-
-
/** Searches a RTL fragment for a mirror character
* When it finds one it creates a separate fragment for the
* character and the surrounding fragments. It passes the mirrored
@@ -766,11 +738,15 @@ void Bidi_classifyCharacters(const uint16_t *text,
);
}
fprintf(stderr, "\nTypes: ");
+#endif
for (i = 0; i < len; i++)
{
types[i] = Bidi_classFromChN(text[i]);
+#ifdef DEBUG_BIDI_VERBOSE
fprintf(stderr, "%c", charFromTypes[(int)types[i]]);
+#endif
}
+#ifdef DEBUG_BIDI_VERBOSE
fprintf(stderr, "\n");
#endif
}
@@ -1117,193 +1093,3 @@ void Bidi_fragmentText(fz_context *ctx,
fz_rethrow(ctx);
}
}
-
-
-
-static void newFragCb(const uint16_t *fragment,
- size_t fragmentLen,
- int rightToLeft,
- uint16_t mirror,
- void *arg)
-{
- size_t fragOffset;
- int isRtlNumber = FALSE;
- Bidi_ProcessLine_fragData *fragData = arg;
-
- assert(fragData != NULL);
-
- fragOffset = fragment - fragData->entireText;
-
- if((fragOffset != 0) &&
- Bidi_isEuropeanNumber(fragment,fragmentLen))
- {
- /* fragment contains digits only */
- isRtlNumber = TRUE;
- }
- fragData->callersCallback(fragOffset,
- fragmentLen,
- rightToLeft,
- mirror,
- fragData->callersData,
- isRtlNumber);
-}
-
-
-
-/* Currently this creates a text representation of all the objects
- * on a line, then passes it to Bidi_fragmentText. In future, we
- * should create the levels array directly from the objects supplied.
- */
-Bidi_Direction Bidi_processLine(fz_context *ctx,
- Bidi_PL_NextObj_Callback nextObjCb,
- Bidi_PL_Fragment_Callback fragmentCb,
- void *callerData,
- int bidiFlag,
- int *more)
-{
- uint16_t *entireText = NULL;
- size_t entireLength = 0;
- size_t lengthIncrement = 100;
- size_t freeSpace = 0;
- const uint16_t *objText = NULL;
- size_t objLength = 0;
- Bidi_Direction explicitDirection = Bidi_Neutral;
-
- fz_var(entireText);
-
- fz_try(ctx)
- {
- do
- {
- /* Call the client-supplied function to get the next object. */
- nextObjCb(callerData, &objText, &objLength, more, &explicitDirection);
- if (objText == NULL || objLength == 0)
- {
- /* End of sequence, or caller is giving us rubbish. */
- break;
- }
-
- if (freeSpace < objLength)
- {
- /* We need a bigger buffer. */
- uint16_t *newBuffer = NULL;
- size_t increment = (objLength < lengthIncrement) ?
- lengthIncrement : objLength;
- /* Accommodate the string terminator that ustrncat will write */
- if (entireText == NULL)
- {
- /* First Time. Add extra byte */
- entireLength += increment + 1;
- }
- else
- {
- entireLength += increment;
- }
- newBuffer = fz_resize_array(ctx, entireText, entireLength, sizeof(uint16_t));
- if (entireText == NULL)
- {
- /* First time. Pretend buffer holds an empty string. */
- newBuffer[0] = UNICODE_EOS;
- }
- entireText = newBuffer;
- freeSpace += increment;
- }
- ustrncat (entireText, objText, objLength);
-
- freeSpace -= objLength;
- }
- while (objText != NULL);
-
- if (entireText != NULL)
- {
- size_t i;
- size_t length = entireLength - 1 - freeSpace;
- Bidi_ProcessLine_fragData fragData;
- /* replace TAB character with SPACE for html Documents */
- if((bidiFlag & Bidi_replaceTab) != 0)
- {
- for (i = 0; i < length; i++)
- {
- if (entireText[i]=='\t')
- {
- entireText[i] = ' ';
- }
- }
- }
-
- fragData.entireText = entireText;
- fragData.callersCallback = fragmentCb;
- fragData.callersData = callerData;
- Bidi_fragmentText(ctx,
- entireText,
- length, &explicitDirection,
- newFragCb, &fragData,
- bidiFlag);
- }
- }
- fz_always(ctx)
- {
- fz_free(ctx, entireText);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
-
- return explicitDirection;
-}
-
-
-#ifdef BIDI_LINE_AT_A_TIME
-/* This changes the given text by reordering and possibly mirroring
- * characters. Suitable only for a single line of text.
- */
-void Bidi_processText(fz_context *ctx,
- uint16_t *text,
- Bidi_Direction baseDirection,
- Bidi_Direction outputDirection,
- int len)
-{
- Bidi_Level *levels;
-
- if (text == NULL || len == 0)
- {
- return;
- }
-
- levels = createLevels(ctx, text, (size_t)len, &baseDirection, TRUE, 0);
-
- assert (levels != NULL);
-
- /* resolve mirrored characters, e.g. swap '(' with ')' */
- Bidi_mirror(text, levels, len);
-
- (void)Bidi_reorder(baseDirection, text, levels, len);
-
- /* reverse the string */
- if (outputDirection == Bidi_RightToLeft)
- ustrnreverse(text, len);
-
-#ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "Done: ");
- {
- int i;
- for (i = 0; i < len; i++)
- {
- /* So that we can actually sort of read the debug string, any
- * non-ascii characters are replaced with a 1-digit hash
- * value from 0-9, making non-english characters appear
- * as numbers
- */
- fprintf(stderr, "%c", (text[i] <= 127 && text[i ]>= 32)?
- text[i]
- :(char)((text[i] % 9) + 48)
- );
- }
- fprintf(stderr, "\n\n");
- }
-#endif
-
- fz_free(ctx, levels);
-}
-#endif
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index ad3dd621..5bfcac85 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -116,6 +116,33 @@ static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_
add_flow_glue(ctx, pool, top, style, "", 0);
}
+/*
+	Split a text flow node in two after 'offset' characters (decoded
+	UTF-8 runes, not bytes).
+
+	flow: The node to split. Its text is truncated in place to the
+	first 'offset' characters.
+
+	offset: Number of characters to leave in 'flow'.
+
+	Returns the node holding the text from 'offset' onwards. When
+	offset is 0 nothing is split and 'flow' itself is returned;
+	otherwise a new node is allocated from 'pool', initialised as a
+	copy of 'flow', given its own copy of the trailing text, and
+	linked in directly after 'flow'.
+*/
+static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
+{
+	fz_html_flow *new_flow;
+	char *tail;
+	size_t tail_len;
+
+	if (offset == 0)
+		return flow;
+
+	/* Step over 'offset' characters to find the byte position of the
+	 * split, stopping early if the text is shorter than that. */
+	tail = flow->text;
+	while (*tail && offset)
+	{
+		int rune;
+		tail += fz_chartorune(&rune, tail);
+		offset--;
+	}
+	tail_len = strlen(tail);
+
+	/* Do every allocation before touching the list or the original
+	 * text, so that a failed allocation leaves 'flow' untouched rather
+	 * than half split. */
+	new_flow = fz_pool_alloc(ctx, pool, sizeof *new_flow);
+	*new_flow = *flow; /* Also copies flow->next. */
+	new_flow->text = fz_pool_alloc(ctx, pool, tail_len + 1);
+	memcpy(new_flow->text, tail, tail_len + 1);
+
+	/* Commit: truncate the original text and link in the new node. */
+	*tail = 0;
+	flow->next = new_flow;
+	return new_flow;
+}
+
+
static int iscjk(int c)
{
if (c >= 0x3200 && c <= 0x9FFF) return 1; /* CJK Blocks */
@@ -1347,6 +1374,172 @@ fz_layout_html(fz_context *ctx, fz_html *box, float w, float h, float em)
layout_block(ctx, box, &page_box, em, h, 0);
}
+typedef struct /* Growable scratch buffer of 16 bit code units, reset and reused for each flow. */
+{
+ uint16_t *buffer; /* Storage; NULL until it is first grown. */
+ size_t max; /* Capacity of 'buffer', counted in uint16_t elements. */
+ size_t len; /* Number of elements of 'buffer' currently in use. */
+} uni_buf;
+
+typedef struct /* State handed to newFragCb via the callback argument of Bidi_fragmentText. */
+{
+ fz_context *ctx; /* Context passed on to split_flow. */
+ fz_pool *pool; /* Pool passed on to split_flow for allocating new flow nodes. */
+ fz_html_flow *flow; /* Next flow node that has not yet been given a direction. */
+ uni_buf *buffer; /* Buffer the reported fragments point into; used to work out fragment offsets. */
+} bidi_data;
+
+/*
+	Count the characters (decoded runes, not bytes) in a NUL
+	terminated UTF-8 string.
+*/
+static size_t utf8len(const char *text)
+{
+	size_t count;
+	int rune;
+
+	/* Each fz_chartorune call consumes one encoded character. */
+	for (count = 0; *text != 0; count++)
+		text += fz_chartorune(&rune, text);
+
+	return count;
+}
+
+/*
+	Fragment callback for Bidi_fragmentText: record the direction of
+	one fragment on the flow nodes that it covers.
+
+	fragment, fragmentLen: The fragment, as a pointer into (and a
+	length in characters within) the buffer referenced by arg.
+
+	rightToLeft: Direction reported for the fragment; stored as
+	block_r2l on every node covered.
+
+	mirror: Unused here.
+
+	arg: The bidi_data for this run. data->flow must be the node at
+	which the fragment starts; on return it has been advanced past
+	the nodes covered by the fragment.
+
+	A text node that extends beyond the end of the fragment is split
+	(see split_flow) so that every node lies in exactly one fragment.
+*/
+static void newFragCb(const uint16_t *fragment,
+			size_t fragmentLen,
+			int rightToLeft,
+			uint16_t mirror,
+			void *arg)
+{
+	bidi_data *data = (bidi_data *)arg;
+	fz_html_flow *node = data->flow;
+	int charDirR2L = rightToLeft;
+
+	(void)mirror;
+
+	/* A digits-only fragment that is not at the very start of the
+	 * buffer keeps the block direction, but its characters are marked
+	 * left to right. */
+	if (fragment != data->buffer->buffer &&
+		Bidi_isEuropeanNumber(fragment, fragmentLen))
+	{
+		charDirR2L = 0;
+	}
+
+	/* Stop if the list runs out early rather than dereferencing NULL. */
+	while (fragmentLen > 0 && node != NULL)
+	{
+		size_t len;
+
+		if (node->type == FLOW_GLUE)
+		{
+			/* Glue was gathered into the buffer as a single space. */
+			len = 1;
+		}
+		else
+		{
+			/* Must be text */
+			len = utf8len(node->text);
+			if (len > fragmentLen)
+			{
+				/* The node straddles the end of the fragment: split
+				 * it, leaving just the covered part in 'node'. */
+				(void)split_flow(data->ctx, data->pool, node, fragmentLen);
+				len = utf8len(node->text);
+			}
+		}
+
+		/* This flow box is entirely contained within this fragment. */
+		node->block_r2l = rightToLeft;
+		node->char_r2l = charDirR2L;
+		node = node->next;
+		fragmentLen -= len;
+	}
+
+	data->flow = node;
+}
+
+/*
+	Determine the directionality of every word and glue node in a
+	flow list.
+
+	The list is processed as a series of runs of consecutive
+	FLOW_WORD/FLOW_GLUE nodes, each run ending at a FLOW_BREAK,
+	FLOW_IMAGE (or any other node type), or at the end of the list.
+	The text of each run is expanded into 'buffer' (glue counting as
+	a single space) and passed to Bidi_fragmentText, which reports
+	the fragments to newFragCb.
+
+	pool: Pool used if flow nodes need to be split.
+
+	buffer: Scratch buffer, grown as required. Its previous contents
+	are discarded. The caller retains ownership of the storage.
+
+	flow: Head of the flow list to process.
+
+	NOTE(review): runes are narrowed to uint16_t as they are stored,
+	so characters above U+FFFF will be classified as some unrelated
+	16 bit value - confirm whether that is acceptable.
+*/
+static void
+detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_flow *flow)
+{
+	fz_html_flow *end = flow;
+	bidi_data data;
+
+	while (end)
+	{
+		fz_html_flow *start = end;
+		Bidi_Direction baseDir = -1;
+
+		/* Stage 1: Gather the text of this run into a single buffer */
+		buffer->len = 0;
+		while (end)
+		{
+			size_t len = 0;
+			const char *text = "";
+			int broken = 0;
+
+			switch (end->type)
+			{
+			case FLOW_WORD:
+				len = utf8len(end->text);
+				text = end->text;
+				break;
+			case FLOW_GLUE:
+				len = 1;
+				text = " ";
+				break;
+			case FLOW_BREAK:
+			case FLOW_IMAGE:
+			default:
+				/* Anything that is not text ends the run. */
+				broken = 1;
+				break;
+			}
+
+			if (broken)
+				break;
+
+			/* Make sure the buffer is large enough. A single doubling
+			 * is not always sufficient, so keep doubling until the
+			 * whole of this node fits. */
+			if (buffer->len + len > buffer->max)
+			{
+				size_t newmax = buffer->max;
+				if (newmax == 0)
+					newmax = 128; /* Sensible small default */
+				while (newmax < buffer->len + len)
+					newmax *= 2;
+				buffer->buffer = fz_resize_array(ctx, buffer->buffer, newmax, sizeof(uint16_t));
+				buffer->max = newmax;
+			}
+
+			/* Expand the utf8 text into Unicode and store it in the buffer */
+			while (*text)
+			{
+				int rune;
+				text += fz_chartorune(&rune, text);
+				buffer->buffer[buffer->len++] = (uint16_t)rune;
+			}
+
+			end = end->next;
+		}
+
+		/* Stage 2: Detect directionality for the run. An empty run
+		 * (for example two consecutive breaks) has nothing to
+		 * classify, and the buffer may not even be allocated yet. */
+		if (buffer->len > 0)
+		{
+			data.ctx = ctx;
+			data.pool = pool;
+			data.flow = start;
+			data.buffer = buffer;
+			Bidi_fragmentText(ctx,
+				buffer->buffer,
+				buffer->len,
+				&baseDir,
+				newFragCb, &data,
+				0 /* Flags */);
+		}
+
+		/* Step over the node that ended the run, so that the text
+		 * following it is processed as well. */
+		if (end)
+			end = end->next;
+	}
+}
+
+/*
+	Run directionality detection over a box, all of its following
+	siblings, and (recursively) all of their descendants.
+
+	buffer: Scratch buffer shared by every flow that is processed.
+
+	box: First box to process; may be NULL.
+*/
+static void
+detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html *box)
+{
+	fz_html *node;
+
+	/* Siblings are walked iteratively; children by recursion. */
+	for (node = box; node != NULL; node = node->next)
+	{
+		if (node->flow_head != NULL)
+			detect_flow_directionality(ctx, pool, buffer, node->flow_head);
+
+		detect_box_directionality(ctx, pool, buffer, node->down);
+	}
+}
+
+/*
+	Detect the directionality of all the text in a box tree.
+
+	pool: Pool used if flow nodes need to be split.
+
+	box: Root of the box tree to process.
+
+	The scratch buffer used during detection is owned by this
+	function and is freed before it returns, whether detection
+	completes or throws.
+*/
+static void
+detect_directionality(fz_context *ctx, fz_pool *pool, fz_html *box)
+{
+	uni_buf buffer = { NULL, 0, 0 };
+
+	fz_try(ctx)
+	{
+		detect_box_directionality(ctx, pool, &buffer, box);
+	}
+	fz_always(ctx)
+	{
+		/* The buffer is grown with fz_resize_array, not taken from
+		 * the pool, so it has to be released explicitly. */
+		fz_free(ctx, buffer.buffer);
+	}
+	fz_catch(ctx)
+	{
+		fz_rethrow(ctx);
+	}
+}
+
+
fz_html *
fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
{
@@ -1380,5 +1573,7 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha
fz_drop_css(ctx, css);
fz_drop_xml(ctx, xml);
+ detect_directionality(ctx, pool, box);
+
return box;
}