summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2016-01-18 16:31:52 +0000
committerRobin Watts <robin.watts@artifex.com>2016-01-18 19:25:12 +0000
commit5e608c0649ece27029484f388c672bed98af6e34 (patch)
treef702ff385c5c1804b215a288b8d9f8215acc2f9d /include
parent8876142a36c76d242f5a1a73dd66fa0430847ddd (diff)
downloadmupdf-5e608c0649ece27029484f388c672bed98af6e34.tar.xz
Process HTML text for directionality.
After we parse HTML text from an ebook, run it through the Unicode bidirectional algorithm to determine the directionality of each fragment. This may require splitting fragments. Currently we don't do anything with this information.
Diffstat (limited to 'include')
-rw-r--r--include/mupdf/fitz/bidi.h55
-rw-r--r--include/mupdf/html.h21
2 files changed, 76 insertions, 0 deletions
diff --git a/include/mupdf/fitz/bidi.h b/include/mupdf/fitz/bidi.h
index 0b5b9553..978696ef 100644
--- a/include/mupdf/fitz/bidi.h
+++ b/include/mupdf/fitz/bidi.h
@@ -199,6 +199,61 @@ Bidi_Direction Bidi_processLine(fz_context *ctx,
int *more);
+/**
+ * NOTE(review): this function is not documented in this header. Judging
+ * by its name it presumably reports whether the given text is a European
+ * number in the bidi sense -- confirm against the implementation,
+ * including what the int return value encodes.
+ *
+ * @param str start of the 16-bit character sequence to examine (read-only)
+ * @param len number of elements of str to examine -- TODO confirm
+ */
+int Bidi_isEuropeanNumber(const uint16_t *str, unsigned int len);
+
+/**
+ * Returns a character's mirrored equivalent.
+ *
+ * @param u Unicode character to process
+ * @return the mirrored equivalent of u
+ *         (NOTE(review): the result for a character that has no mirrored
+ *         form is not stated here -- confirm against the implementation)
+ */
+uint16_t Bidi_mirrorChar(const uint16_t u);
+
+
+
+/**
+ * Prototype for callback function supplied to Bidi_fragmentText.
+ *
+ * Note that this typedef names a function type, not a pointer type; a
+ * function parameter declared as Bidi_Fragment_Callback is adjusted by
+ * the compiler to a pointer to such a function. The callback returns
+ * nothing.
+ *
+ * @param fragment first character in fragment
+ * @param fragmentLen number of characters in fragment
+ * @param rightToLeft true if fragment is right-to-left
+ * @param mirror The mirror code of the fragment if it exists
+ *        (NOTE(review): the value passed when no mirror code exists is
+ *        not stated here -- confirm against the caller)
+ * @param arg data from caller of Bidi_fragmentText
+ */
+typedef void (Bidi_Fragment_Callback)(const uint16_t *fragment,
+ size_t fragmentLen,
+ int rightToLeft,
+ uint16_t mirror,
+ void *arg);
+
+
+
+/**
+ * Partitions the given Unicode sequence into one or more unidirectional
+ * fragments and invokes the given callback function for each fragment.
+ * Nothing is returned to the caller directly; results are delivered
+ * through the callback.
+ *
+ * For example, if directionality of text is:
+ * 0123456789
+ * rrlllrrrrr,
+ * we'll invoke callback with:
+ * &text[0], length == 2, rightToLeft == true
+ * &text[2], length == 3, rightToLeft == false
+ * &text[5], length == 5, rightToLeft == true.
+ *
+ * @param[in] ctx fz_context supplied by the caller (its use is not
+ * described in this header)
+ * @param[in] text start of Unicode sequence
+ * @param[in] textlen number of Unicodes to analyse
+ * @param[in] baseDir pointer to the direction of paragraph (point it at
+ * Bidi_Neutral to force auto-detection).
+ * NOTE(review): this is a non-const pointer although
+ * it is tagged [in]; presumably the detected direction
+ * is written back through it -- confirm.
+ * @param[in] callback function to be called for each fragment
+ * @param[in] arg data to be passed to the callback function
+ * @param[in] bidiFlag flag to be passed to the callback function.
+ * NOTE(review): Bidi_Fragment_Callback has no flag
+ * parameter, so this description looks stale --
+ * confirm what the flag actually controls.
+ */
+void Bidi_fragmentText(fz_context *ctx,
+ const uint16_t *text,
+ size_t textlen,
+ Bidi_Direction *baseDir,
+ Bidi_Fragment_Callback callback,
+ void *arg,
+ int bidiFlag);
#endif /* BIDI_BIDI_H */
diff --git a/include/mupdf/html.h b/include/mupdf/html.h
index 830c95f8..3955aebc 100644
--- a/include/mupdf/html.h
+++ b/include/mupdf/html.h
@@ -188,10 +188,31 @@ enum
FLOW_IMAGE = 3
};
+/* We have to distinguish char direction (how the characters within a
+ * fragment are rendered) from block direction (how fragments are laid
+ * out relative to each other). For most strings the two are the same.
+ *
+ * Char direction determines whether a string 'ABC' appears as
+ * ABC or CBA.
+ *
+ * Block direction determines how fragments are attached together.
+ * 'ABC' and 'DEF' with r2l char and block directions will
+ * appear as 'FEDCBA'. With l2r char and block it will appear
+ * as 'ABCDEF'.
+ *
+ * The reason for the distinction is that we can have logical
+ * strings like 'ABC0123DEF', where 'ABC' and 'DEF' are in r2l
+ * scripts. The bidirectional code breaks this down into 3 fragments
+ * 'ABC' '0123' 'DEF', where all three have r2l block direction, but
+ * the digits need l2r char direction, i.e. the desired result is
+ * FED0123CBA, rather than FED3210CBA.
+ */
struct fz_html_flow_s
{
unsigned int type : 2;
unsigned int expand : 1;
+ unsigned int char_r2l : 1;
+ unsigned int block_r2l : 1;
float x, y, w, h, em;
fz_css_style *style;
char *text;