summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2016-01-18 16:31:52 +0000
committerRobin Watts <robin.watts@artifex.com>2016-01-18 19:25:12 +0000
commit5e608c0649ece27029484f388c672bed98af6e34 (patch)
treef702ff385c5c1804b215a288b8d9f8215acc2f9d
parent8876142a36c76d242f5a1a73dd66fa0430847ddd (diff)
downloadmupdf-5e608c0649ece27029484f388c672bed98af6e34.tar.xz
Process HTML text for directionality.
After we parse html text from an ebook run it through the unicode bidirectional algorithm to determine the directionality of each fragment. This may require splitting of fragments. Currently we don't do anything with this information.
-rw-r--r--include/mupdf/fitz/bidi.h55
-rw-r--r--include/mupdf/html.h21
-rw-r--r--source/fitz/bidi.c222
-rw-r--r--source/html/html-layout.c195
4 files changed, 275 insertions, 218 deletions
diff --git a/include/mupdf/fitz/bidi.h b/include/mupdf/fitz/bidi.h
index 0b5b9553..978696ef 100644
--- a/include/mupdf/fitz/bidi.h
+++ b/include/mupdf/fitz/bidi.h
@@ -199,6 +199,61 @@ Bidi_Direction Bidi_processLine(fz_context *ctx,
int *more);
+int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len);
+
+/**
+ * returns a character's mirrored equivalent
+ *
+ * @param u Unicode character to process
+ */
+uint16_t Bidi_mirrorChar(const uint16_t u);
+
+
+
+/**
+ * Prototype for callback function supplied to Bidi_fragmentText.
+ *
+ * @param fragment first character in fragment
+ * @param fragmentLen number of characters in fragment
+ * @param rightToLeft true if fragment is right-to-left
+ * @param mirror The mirror code of the fragment if it exists
+ * @param arg data from caller of Bidi_fragmentText
+ */
+typedef void (Bidi_Fragment_Callback)(const uint16_t *fragment,
+ size_t fragmentLen,
+ int rightToLeft,
+ uint16_t mirror,
+ void *arg);
+
+
+
+/**
+ * Partitions the given Unicode sequence into one or more unidirectional
+ * fragments and invokes the given callback function for each fragment.
+ *
+ * For example, if directionality of text is:
+ * 0123456789
+ * rrlllrrrrr,
+ * we'll invoke callback with:
+ * &text[0], length == 2, rightToLeft == true
+ * &text[2], length == 3, rightToLeft == false
+ * &text[5], length == 5, rightToLeft == true.
+ *
+ * @param[in] text start of Unicode sequence
+ * @param[in] textlen number of Unicodes to analyse
+ * @param[in] baseDir direction of paragraph (specify Bidi_Neutral
+ * to force auto-detection)
+ * @param[in] callback function to be called for each fragment
+ * @param[in] arg data to be passed to the callback function
+ * @param[in] bidiFlag flag to be passed to the callback function
+ */
+void Bidi_fragmentText(fz_context *ctx,
+ const uint16_t *text,
+ size_t textlen,
+ Bidi_Direction *baseDir,
+ Bidi_Fragment_Callback callback,
+ void *arg,
+ int bidiFlag);
#endif /* BIDI_BIDI_H */
diff --git a/include/mupdf/html.h b/include/mupdf/html.h
index 830c95f8..3955aebc 100644
--- a/include/mupdf/html.h
+++ b/include/mupdf/html.h
@@ -188,10 +188,31 @@ enum
FLOW_IMAGE = 3
};
+/* We have to recognise the distinction between render direction
+ * and layout direction. For most strings render direction and
+ * logical direction are the same.
+ *
+ * Char direction determines whether a string 'ABC' appears as
+ * ABC or CBA.
+ *
+ * Block direction determines how fragments are attached together.
+ * 'ABC' and 'DEF' with r2l char and block directions will
+ * appear as 'FEDCBA'. With l2r char and block it will appear
+ * as 'ABCDEF'.
+ *
+ * The reason for the distinction is that we can have logical
+ * strings like 'ABC0123DEF', where 'ABC' and 'DEF' are in r2l
+ * scripts. The bidirectional code breaks this down into 3 fragments
+ * 'ABC' '0123' 'DEF', where all three are r2l, but digits need to
+ * be rendered left to right. i.e. the desired result is:
+ * FED0123CBA, rather than FED3210CBA.
+ */
struct fz_html_flow_s
{
unsigned int type : 2;
unsigned int expand : 1;
+ unsigned int char_r2l : 1;
+ unsigned int block_r2l : 1;
float x, y, w, h, em;
fz_css_style *style;
char *text;
diff --git a/source/fitz/bidi.c b/source/fitz/bidi.c
index a3bbbc1d..59d1578a 100644
--- a/source/fitz/bidi.c
+++ b/source/fitz/bidi.c
@@ -35,7 +35,6 @@
#include "mupdf/fitz.h"
#include "bidi-impl.h" /* standard bidi code interface */
-#include "bidi-internal.h"
/*
* Macros...
@@ -548,7 +547,6 @@ Bidi_CharType Bidi_classFromChN(uint16_t ch)
}
-static
int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len)
{
const uint16_t *end = str + len;
@@ -650,32 +648,6 @@ uint16_t Bidi_mirrorChar(const uint16_t u)
return UNICODE_EOS;
}
-#ifdef BIDI_LINE_AT_A_TIME
-/** Subsitutes characters with their mirrored equivalents, e.g. swaps
- * '(' with ')'. Implements rule L4 of the uint16_t Bidirectional
- * algorithm. Mirrored characters are found using a binary search through
- * the lookup table bidiOptMirrorList.
- */
-static
-void Bidi_mirror(uint16_t *text,
- const Bidi_Level *poslevel,
- int len)
-{
- int i;
- uint16_t out;
-
- for (i = 0; i < len; i ++)
- {
- if (ODD(poslevel[i]) == 0)
- continue;
- out = Bidi_mirrorChar((const uint16_t)text[i]);
- if(out != UNICODE_EOS)
- text[i] = out;
- }
-}
-#endif
-
-
/** Searches a RTL fragment for a mirror character
* When it finds one it creates a separate fragment for the
* character and the surrounding fragments. It passes the mirrored
@@ -766,11 +738,15 @@ void Bidi_classifyCharacters(const uint16_t *text,
);
}
fprintf(stderr, "\nTypes: ");
+#endif
for (i = 0; i < len; i++)
{
types[i] = Bidi_classFromChN(text[i]);
+#ifdef DEBUG_BIDI_VERBOSE
fprintf(stderr, "%c", charFromTypes[(int)types[i]]);
+#endif
}
+#ifdef DEBUG_BIDI_VERBOSE
fprintf(stderr, "\n");
#endif
}
@@ -1117,193 +1093,3 @@ void Bidi_fragmentText(fz_context *ctx,
fz_rethrow(ctx);
}
}
-
-
-
-static void newFragCb(const uint16_t *fragment,
- size_t fragmentLen,
- int rightToLeft,
- uint16_t mirror,
- void *arg)
-{
- size_t fragOffset;
- int isRtlNumber = FALSE;
- Bidi_ProcessLine_fragData *fragData = arg;
-
- assert(fragData != NULL);
-
- fragOffset = fragment - fragData->entireText;
-
- if((fragOffset != 0) &&
- Bidi_isEuropeanNumber(fragment,fragmentLen))
- {
- /* fragment contains digits only */
- isRtlNumber = TRUE;
- }
- fragData->callersCallback(fragOffset,
- fragmentLen,
- rightToLeft,
- mirror,
- fragData->callersData,
- isRtlNumber);
-}
-
-
-
-/* Currently this creates a text representation of all the objects
- * on a line, then passes it to Bidi_fragmentText. In future, we
- * should create the levels array directly from the objects supplied.
- */
-Bidi_Direction Bidi_processLine(fz_context *ctx,
- Bidi_PL_NextObj_Callback nextObjCb,
- Bidi_PL_Fragment_Callback fragmentCb,
- void *callerData,
- int bidiFlag,
- int *more)
-{
- uint16_t *entireText = NULL;
- size_t entireLength = 0;
- size_t lengthIncrement = 100;
- size_t freeSpace = 0;
- const uint16_t *objText = NULL;
- size_t objLength = 0;
- Bidi_Direction explicitDirection = Bidi_Neutral;
-
- fz_var(entireText);
-
- fz_try(ctx)
- {
- do
- {
- /* Call the client-supplied function to get the next object. */
- nextObjCb(callerData, &objText, &objLength, more, &explicitDirection);
- if (objText == NULL || objLength == 0)
- {
- /* End of sequence, or caller is giving us rubbish. */
- break;
- }
-
- if (freeSpace < objLength)
- {
- /* We need a bigger buffer. */
- uint16_t *newBuffer = NULL;
- size_t increment = (objLength < lengthIncrement) ?
- lengthIncrement : objLength;
- /* Accommodate the string terminator that ustrncat will write */
- if (entireText == NULL)
- {
- /* First Time. Add extra byte */
- entireLength += increment + 1;
- }
- else
- {
- entireLength += increment;
- }
- newBuffer = fz_resize_array(ctx, entireText, entireLength, sizeof(uint16_t));
- if (entireText == NULL)
- {
- /* First time. Pretend buffer holds an empty string. */
- newBuffer[0] = UNICODE_EOS;
- }
- entireText = newBuffer;
- freeSpace += increment;
- }
- ustrncat (entireText, objText, objLength);
-
- freeSpace -= objLength;
- }
- while (objText != NULL);
-
- if (entireText != NULL)
- {
- size_t i;
- size_t length = entireLength - 1 - freeSpace;
- Bidi_ProcessLine_fragData fragData;
- /* replace TAB character with SPACE for html Documents */
- if((bidiFlag & Bidi_replaceTab) != 0)
- {
- for (i = 0; i < length; i++)
- {
- if (entireText[i]=='\t')
- {
- entireText[i] = ' ';
- }
- }
- }
-
- fragData.entireText = entireText;
- fragData.callersCallback = fragmentCb;
- fragData.callersData = callerData;
- Bidi_fragmentText(ctx,
- entireText,
- length, &explicitDirection,
- newFragCb, &fragData,
- bidiFlag);
- }
- }
- fz_always(ctx)
- {
- fz_free(ctx, entireText);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
-
- return explicitDirection;
-}
-
-
-#ifdef BIDI_LINE_AT_A_TIME
-/* This changes the given text by reordering and possibly mirroring
- * characters. Suitable only for a single line of text.
- */
-void Bidi_processText(fz_context *ctx,
- uint16_t *text,
- Bidi_Direction baseDirection,
- Bidi_Direction outputDirection,
- int len)
-{
- Bidi_Level *levels;
-
- if (text == NULL || len == 0)
- {
- return;
- }
-
- levels = createLevels(ctx, text, (size_t)len, &baseDirection, TRUE, 0);
-
- assert (levels != NULL);
-
- /* resolve mirrored characters, e.g. swap '(' with ')' */
- Bidi_mirror(text, levels, len);
-
- (void)Bidi_reorder(baseDirection, text, levels, len);
-
- /* reverse the string */
- if (outputDirection == Bidi_RightToLeft)
- ustrnreverse(text, len);
-
-#ifdef DEBUG_BIDI_VERBOSE
- fprintf(stderr, "Done: ");
- {
- int i;
- for (i = 0; i < len; i++)
- {
- /* So that we can actually sort of read the debug string, any
- * non-ascii characters are replaced with a 1-digit hash
- * value from 0-9, making non-english characters appear
- * as numbers
- */
- fprintf(stderr, "%c", (text[i] <= 127 && text[i ]>= 32)?
- text[i]
- :(char)((text[i] % 9) + 48)
- );
- }
- fprintf(stderr, "\n\n");
- }
-#endif
-
- fz_free(ctx, levels);
-}
-#endif
diff --git a/source/html/html-layout.c b/source/html/html-layout.c
index ad3dd621..5bfcac85 100644
--- a/source/html/html-layout.c
+++ b/source/html/html-layout.c
@@ -116,6 +116,33 @@ static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_
add_flow_glue(ctx, pool, top, style, "", 0);
}
+static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
+{
+ fz_html_flow *new_flow;
+ char *text;
+ size_t len;
+
+ if (offset == 0)
+ return flow;
+ new_flow = fz_pool_alloc(ctx, pool, sizeof *flow);
+ *new_flow = *flow;
+ new_flow->next = flow->next;
+ flow->next = new_flow;
+
+ text = flow->text;
+ while (*text && offset)
+ {
+ int rune;
+ text += fz_chartorune(&rune, text);
+ offset--;
+ }
+ len = strlen(text);
+ new_flow->text = fz_pool_alloc(ctx, pool, len+1);
+ strcpy(new_flow->text, text);
+ *text = 0;
+ return new_flow;
+}
+
static int iscjk(int c)
{
if (c >= 0x3200 && c <= 0x9FFF) return 1; /* CJK Blocks */
@@ -1347,6 +1374,172 @@ fz_layout_html(fz_context *ctx, fz_html *box, float w, float h, float em)
layout_block(ctx, box, &page_box, em, h, 0);
}
+typedef struct
+{
+ uint16_t *buffer;
+ size_t max;
+ size_t len;
+} uni_buf;
+
+typedef struct
+{
+ fz_context *ctx;
+ fz_pool *pool;
+ fz_html_flow *flow;
+ uni_buf *buffer;
+} bidi_data;
+
+static size_t utf8len(const char *text)
+{
+ size_t len = 0;
+
+ while (*text)
+ {
+ int rune;
+ text += fz_chartorune(&rune, text);
+ len++;
+ }
+ return len;
+}
+
+static void newFragCb(const uint16_t *fragment,
+ size_t fragmentLen,
+ int rightToLeft,
+ uint16_t mirror,
+ void *arg)
+{
+ bidi_data *data = (bidi_data *)arg;
+ size_t fragmentOffset = fragment - data->buffer->buffer;
+ int charDirR2L = rightToLeft;
+
+ if((fragmentOffset != 0) &&
+ Bidi_isEuropeanNumber(fragment, fragmentLen))
+ {
+ /* fragment contains digits only */
+ charDirR2L = 0;
+ }
+
+ /* We are guaranteed that fragmentOffset will be at the beginning
+ * of flow. */
+ while (fragmentLen > 0)
+ {
+ size_t len;
+
+ if (data->flow->type == FLOW_GLUE)
+ {
+ len = 1;
+ }
+ else
+ {
+ /* Must be text */
+ len = utf8len(data->flow->text);
+ if (len > fragmentLen)
+ {
+ /* We need to split this flow box */
+ (void)split_flow(data->ctx, data->pool, data->flow, fragmentLen);
+ len = utf8len(data->flow->text);
+ }
+ }
+
+ /* This flow box is entirely contained within this fragment. */
+ data->flow->block_r2l = rightToLeft;
+ data->flow->char_r2l = charDirR2L;
+ data->flow = data->flow->next;
+ fragmentOffset += len;
+ fragmentLen -= len;
+ }
+}
+
+static void
+detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_flow *flow)
+{
+ fz_html_flow *end = flow;
+ const char *text;
+ bidi_data data;
+ Bidi_Direction baseDir = -1;
+
+ /* Stage 1: Gather the text from the flow up into a single buffer */
+ buffer->len = 0;
+ while (end)
+ {
+ size_t len;
+ int broken = 0;
+
+ switch (end->type)
+ {
+ case FLOW_WORD:
+ len = utf8len(end->text);
+ text = end->text;
+ break;
+ case FLOW_GLUE:
+ len = 1;
+ text = " ";
+ break;
+ case FLOW_BREAK:
+ case FLOW_IMAGE:
+ broken = 1;
+ break;
+ }
+
+ if (broken)
+ break;
+
+ /* Make sure the buffer is large enough */
+ if (buffer->len + len > buffer->max)
+ {
+ size_t newmax = buffer->max * 2;
+ if (newmax == 0)
+ newmax = 128; /* Sensible small default */
+ buffer->buffer = fz_resize_array(ctx, buffer->buffer, newmax, sizeof(uint16_t));
+ buffer->max = newmax;
+ }
+
+ /* Expand the utf8 text into Unicode and store it in the buffer */
+ while (*text)
+ {
+ int rune;
+ text += fz_chartorune(&rune, text);
+ buffer->buffer[buffer->len++] = (uint16_t)rune;
+ }
+
+ end = end->next;
+ }
+
+ /* Detect directionality for the buffer */
+ data.ctx = ctx;
+ data.pool = pool;
+ data.flow = flow;
+ data.buffer = buffer;
+ Bidi_fragmentText(ctx,
+ buffer->buffer,
+ buffer->len,
+ &baseDir,
+ newFragCb, &data,
+ 0 /* Flags */);
+}
+
+static void
+detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html *box)
+{
+ while (box)
+ {
+ if (box->flow_head)
+ {
+ detect_flow_directionality(ctx, pool, buffer, box->flow_head);
+ }
+ detect_box_directionality(ctx, pool, buffer, box->down);
+ box = box->next;
+ }
+}
+
+static void
+detect_directionality(fz_context *ctx, fz_pool *pool, fz_html *box)
+{
+ uni_buf buffer = { NULL };
+
+ detect_box_directionality(ctx, pool, &buffer, box);
+}
+
fz_html *
fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
{
@@ -1380,5 +1573,7 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha
fz_drop_css(ctx, css);
fz_drop_xml(ctx, xml);
+ detect_directionality(ctx, pool, box);
+
return box;
}