diff options
-rw-r--r-- | include/mupdf/fitz/bidi.h | 55 | ||||
-rw-r--r-- | include/mupdf/html.h | 21 | ||||
-rw-r--r-- | source/fitz/bidi.c | 222 | ||||
-rw-r--r-- | source/html/html-layout.c | 195 |
4 files changed, 275 insertions, 218 deletions
diff --git a/include/mupdf/fitz/bidi.h b/include/mupdf/fitz/bidi.h index 0b5b9553..978696ef 100644 --- a/include/mupdf/fitz/bidi.h +++ b/include/mupdf/fitz/bidi.h @@ -199,6 +199,61 @@ Bidi_Direction Bidi_processLine(fz_context *ctx, int *more); +int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len); + +/** + * returns a character's mirrored equivalent + * + * @param u Unicode character to process + */ +uint16_t Bidi_mirrorChar(const uint16_t u); + + + +/** + * Prototype for callback function supplied to Bidi_fragmentText. + * + * @param fragment first character in fragment + * @param fragmentLen number of characters in fragment + * @param rightToLeft true if fragment is right-to-left + * @param mirror The mirror code of the fragment if it exists + * @param arg data from caller of Bidi_fragmentText + */ +typedef void (Bidi_Fragment_Callback)(const uint16_t *fragment, + size_t fragmentLen, + int rightToLeft, + uint16_t mirror, + void *arg); + + + +/** + * Partitions the given Unicode sequence into one or more unidirectional + * fragments and invokes the given callback function for each fragment. + * + * For example, if directionality of text is: + * 0123456789 + * rrlllrrrrr, + * we'll invoke callback with: + * &text[0], length == 2, rightToLeft == true + * &text[2], length == 3, rightToLeft == false + * &text[5], length == 5, rightToLeft == true. + * + * @param[in] text start of Unicode sequence + * @param[in] textlen number of Unicodes to analyse + * @param[in] baseDir direction of paragraph (specify Bidi_Neutral + * to force auto-detection) + * @param[in] callback function to be called for each fragment + * @param[in] arg data to be passed to the callback function + * @param[in] bidiFlag flag to be passed to the callback function + */ +void Bidi_fragmentText(fz_context *ctx, + const uint16_t *text, + size_t textlen, + Bidi_Direction *baseDir, + Bidi_Fragment_Callback callback, + void *arg, + int bidiFlag); #endif /* BIDI_BIDI_H */ diff --git a/include/mupdf/html.h b/include/mupdf/html.h index 830c95f8..3955aebc 100644 --- a/include/mupdf/html.h +++ b/include/mupdf/html.h @@ -188,10 +188,31 @@ enum FLOW_IMAGE = 3 }; +/* We have to recognise the distinction between render direction + * and layout direction. For most strings render direction and + * logical direction are the same. + * + * Char direction determines whether a string 'ABC' appears as + * ABC or CBA. + * + * Block direction determines how fragments are attached together. + * 'ABC' and 'DEF' with r2l char and block directions will + * appear as 'FEDCBA'. With l2r char and block it will appear + * as 'ABCDEF'. + * + * The reason for the distinction is that we can have logical + * strings like 'ABC0123DEF', where 'ABC' and 'DEF' are in r2l + * scripts. The bidirectional code breaks this down into 3 fragments + * 'ABC' '0123' 'DEF', where all three are r2l, but digits need to + * be rendered left to right. i.e. the desired result is: + * FED0123CBA, rather than FED3210CBA. + */ struct fz_html_flow_s { unsigned int type : 2; unsigned int expand : 1; + unsigned int char_r2l : 1; + unsigned int block_r2l : 1; float x, y, w, h, em; fz_css_style *style; char *text; diff --git a/source/fitz/bidi.c b/source/fitz/bidi.c index a3bbbc1d..59d1578a 100644 --- a/source/fitz/bidi.c +++ b/source/fitz/bidi.c @@ -35,7 +35,6 @@ #include "mupdf/fitz.h" #include "bidi-impl.h" /* standard bidi code interface */ -#include "bidi-internal.h" /* * Macros... @@ -548,7 +547,6 @@ Bidi_CharType Bidi_classFromChN(uint16_t ch) } -static int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len) { const uint16_t *end = str + len; @@ -650,32 +648,6 @@ uint16_t Bidi_mirrorChar(const uint16_t u) return UNICODE_EOS; } -#ifdef BIDI_LINE_AT_A_TIME -/** Subsitutes characters with their mirrored equivalents, e.g. swaps - * '(' with ')'. Implements rule L4 of the uint16_t Bidirectional - * algorithm. Mirrored characters are found using a binary search through - * the lookup table bidiOptMirrorList. - */ -static -void Bidi_mirror(uint16_t *text, - const Bidi_Level *poslevel, - int len) -{ - int i; - uint16_t out; - - for (i = 0; i < len; i ++) - { - if (ODD(poslevel[i]) == 0) - continue; - out = Bidi_mirrorChar((const uint16_t)text[i]); - if(out != UNICODE_EOS) - text[i] = out; - } -} -#endif - - /** Searches a RTL fragment for a mirror character * When it finds one it creates a separate fragment for the * character and the surrounding fragments. It passes the mirrored @@ -766,11 +738,15 @@ void Bidi_classifyCharacters(const uint16_t *text, ); } fprintf(stderr, "\nTypes: "); +#endif for (i = 0; i < len; i++) { types[i] = Bidi_classFromChN(text[i]); +#ifdef DEBUG_BIDI_VERBOSE fprintf(stderr, "%c", charFromTypes[(int)types[i]]); +#endif } +#ifdef DEBUG_BIDI_VERBOSE fprintf(stderr, "\n"); #endif } @@ -1117,193 +1093,3 @@ void Bidi_fragmentText(fz_context *ctx, fz_rethrow(ctx); } } - - - -static void newFragCb(const uint16_t *fragment, - size_t fragmentLen, - int rightToLeft, - uint16_t mirror, - void *arg) -{ - size_t fragOffset; - int isRtlNumber = FALSE; - Bidi_ProcessLine_fragData *fragData = arg; - - assert(fragData != NULL); - - fragOffset = fragment - fragData->entireText; - - if((fragOffset != 0) && - Bidi_isEuropeanNumber(fragment,fragmentLen)) - { - /* fragment contains digits only */ - isRtlNumber = TRUE; - } - fragData->callersCallback(fragOffset, - fragmentLen, - rightToLeft, - mirror, - fragData->callersData, - isRtlNumber); -} - - - -/* Currently this creates a text representation of all the objects - * on a line, then passes it to Bidi_fragmentText. In future, we - * should create the levels array directly from the objects supplied. - */ -Bidi_Direction Bidi_processLine(fz_context *ctx, - Bidi_PL_NextObj_Callback nextObjCb, - Bidi_PL_Fragment_Callback fragmentCb, - void *callerData, - int bidiFlag, - int *more) -{ - uint16_t *entireText = NULL; - size_t entireLength = 0; - size_t lengthIncrement = 100; - size_t freeSpace = 0; - const uint16_t *objText = NULL; - size_t objLength = 0; - Bidi_Direction explicitDirection = Bidi_Neutral; - - fz_var(entireText); - - fz_try(ctx) - { - do - { - /* Call the client-supplied function to get the next object. */ - nextObjCb(callerData, &objText, &objLength, more, &explicitDirection); - if (objText == NULL || objLength == 0) - { - /* End of sequence, or caller is giving us rubbish. */ - break; - } - - if (freeSpace < objLength) - { - /* We need a bigger buffer. */ - uint16_t *newBuffer = NULL; - size_t increment = (objLength < lengthIncrement) ? - lengthIncrement : objLength; - /* Accommodate the string terminator that ustrncat will write */ - if (entireText == NULL) - { - /* First Time. Add extra byte */ - entireLength += increment + 1; - } - else - { - entireLength += increment; - } - newBuffer = fz_resize_array(ctx, entireText, entireLength, sizeof(uint16_t)); - if (entireText == NULL) - { - /* First time. Pretend buffer holds an empty string. */ - newBuffer[0] = UNICODE_EOS; - } - entireText = newBuffer; - freeSpace += increment; - } - ustrncat (entireText, objText, objLength); - - freeSpace -= objLength; - } - while (objText != NULL); - - if (entireText != NULL) - { - size_t i; - size_t length = entireLength - 1 - freeSpace; - Bidi_ProcessLine_fragData fragData; - /* replace TAB character with SPACE for html Documents */ - if((bidiFlag & Bidi_replaceTab) != 0) - { - for (i = 0; i < length; i++) - { - if (entireText[i]=='\t') - { - entireText[i] = ' '; - } - } - } - - fragData.entireText = entireText; - fragData.callersCallback = fragmentCb; - fragData.callersData = callerData; - Bidi_fragmentText(ctx, - entireText, - length, &explicitDirection, - newFragCb, &fragData, - bidiFlag); - } - } - fz_always(ctx) - { - fz_free(ctx, entireText); - } - fz_catch(ctx) - { - fz_rethrow(ctx); - } - - return explicitDirection; -} - - -#ifdef BIDI_LINE_AT_A_TIME -/* This changes the given text by reordering and possibly mirroring - * characters. Suitable only for a single line of text. - */ -void Bidi_processText(fz_context *ctx, - uint16_t *text, - Bidi_Direction baseDirection, - Bidi_Direction outputDirection, - int len) -{ - Bidi_Level *levels; - - if (text == NULL || len == 0) - { - return; - } - - levels = createLevels(ctx, text, (size_t)len, &baseDirection, TRUE, 0); - - assert (levels != NULL); - - /* resolve mirrored characters, e.g. swap '(' with ')' */ - Bidi_mirror(text, levels, len); - - (void)Bidi_reorder(baseDirection, text, levels, len); - - /* reverse the string */ - if (outputDirection == Bidi_RightToLeft) - ustrnreverse(text, len); - -#ifdef DEBUG_BIDI_VERBOSE - fprintf(stderr, "Done: "); - { - int i; - for (i = 0; i < len; i++) - { - /* So that we can actually sort of read the debug string, any - * non-ascii characters are replaced with a 1-digit hash - * value from 0-9, making non-english characters appear - * as numbers - */ - fprintf(stderr, "%c", (text[i] <= 127 && text[i ]>= 32)? - text[i] - :(char)((text[i] % 9) + 48) - ); - } - fprintf(stderr, "\n\n"); - } -#endif - - fz_free(ctx, levels); -} -#endif diff --git a/source/html/html-layout.c b/source/html/html-layout.c index ad3dd621..5bfcac85 100644 --- a/source/html/html-layout.c +++ b/source/html/html-layout.c @@ -116,6 +116,33 @@ static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html *top, fz_css_ add_flow_glue(ctx, pool, top, style, "", 0); } +static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) +{ + fz_html_flow *new_flow; + char *text; + size_t len; + + if (offset == 0) + return flow; + new_flow = fz_pool_alloc(ctx, pool, sizeof *flow); + *new_flow = *flow; + new_flow->next = flow->next; + flow->next = new_flow; + + text = flow->text; + while (*text && offset) + { + int rune; + text += fz_chartorune(&rune, text); + offset--; + } + len = strlen(text); + new_flow->text = fz_pool_alloc(ctx, pool, len+1); + strcpy(new_flow->text, text); + *text = 0; + return new_flow; +} + static int iscjk(int c) { if (c >= 0x3200 && c <= 0x9FFF) return 1; /* CJK Blocks */ @@ -1347,6 +1374,172 @@ fz_layout_html(fz_context *ctx, fz_html *box, float w, float h, float em) layout_block(ctx, box, &page_box, em, h, 0); } +typedef struct +{ + uint16_t *buffer; + size_t max; + size_t len; +} uni_buf; + +typedef struct +{ + fz_context *ctx; + fz_pool *pool; + fz_html_flow *flow; + uni_buf *buffer; +} bidi_data; + +static size_t utf8len(const char *text) +{ + size_t len = 0; + + while (*text) + { + int rune; + text += fz_chartorune(&rune, text); + len++; + } + return len; +} + +static void newFragCb(const uint16_t *fragment, + size_t fragmentLen, + int rightToLeft, + uint16_t mirror, + void *arg) +{ + bidi_data *data = (bidi_data *)arg; + size_t fragmentOffset = fragment - data->buffer->buffer; + int charDirR2L = rightToLeft; + + if((fragmentOffset != 0) && + Bidi_isEuropeanNumber(fragment, fragmentLen)) + { + /* fragment contains digits only */ + charDirR2L = 0; + } + + /* We are guaranteed that fragmentOffset will be at the beginning + * of flow. */ + while (fragmentLen > 0) + { + size_t len; + + if (data->flow->type == FLOW_GLUE) + { + len = 1; + } + else + { + /* Must be text */ + len = utf8len(data->flow->text); + if (len > fragmentLen) + { + /* We need to split this flow box */ + (void)split_flow(data->ctx, data->pool, data->flow, fragmentLen); + len = utf8len(data->flow->text); + } + } + + /* This flow box is entirely contained within this fragment. */ + data->flow->block_r2l = rightToLeft; + data->flow->char_r2l = charDirR2L; + data->flow = data->flow->next; + fragmentOffset += len; + fragmentLen -= len; + } +} + +static void +detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_flow *flow) +{ + fz_html_flow *end = flow; + const char *text; + bidi_data data; + Bidi_Direction baseDir = -1; + + /* Stage 1: Gather the text from the flow up into a single buffer */ + buffer->len = 0; + while (end) + { + size_t len; + int broken = 0; + + switch (end->type) + { + case FLOW_WORD: + len = utf8len(end->text); + text = end->text; + break; + case FLOW_GLUE: + len = 1; + text = " "; + break; + case FLOW_BREAK: + case FLOW_IMAGE: + broken = 1; + break; + } + + if (broken) + break; + + /* Make sure the buffer is large enough */ + if (buffer->len + len > buffer->max) + { + size_t newmax = buffer->max * 2; + if (newmax == 0) + newmax = 128; /* Sensible small default */ + buffer->buffer = fz_resize_array(ctx, buffer->buffer, newmax, sizeof(uint16_t)); + buffer->max = newmax; + } + + /* Expand the utf8 text into Unicode and store it in the buffer */ + while (*text) + { + int rune; + text += fz_chartorune(&rune, text); + buffer->buffer[buffer->len++] = (uint16_t)rune; + } + + end = end->next; + } + + /* Detect directionality for the buffer */ + data.ctx = ctx; + data.pool = pool; + data.flow = flow; + data.buffer = buffer; + Bidi_fragmentText(ctx, + buffer->buffer, + buffer->len, + &baseDir, + newFragCb, &data, + 0 /* Flags */); +} + +static void +detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html *box) +{ + while (box) + { + if (box->flow_head) + { + detect_flow_directionality(ctx, pool, buffer, box->flow_head); + } + detect_box_directionality(ctx, pool, buffer, box->down); + box = box->next; + } +} + +static void +detect_directionality(fz_context *ctx, fz_pool *pool, fz_html *box) +{ + uni_buf buffer = { NULL }; + + detect_box_directionality(ctx, pool, &buffer, box); +} + fz_html * fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) { @@ -1380,5 +1573,7 @@ fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const cha fz_drop_css(ctx, css); fz_drop_xml(ctx, xml); + detect_directionality(ctx, pool, box); + return box; } |