summaryrefslogtreecommitdiff
path: root/source/fitz/bidi-std.c
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2016-01-15 16:47:52 +0000
committerRobin Watts <robin.watts@artifex.com>2016-01-18 19:25:12 +0000
commit8876142a36c76d242f5a1a73dd66fa0430847ddd (patch)
tree24e9f41b48228d2503dc6d01fc187cde7a76c0b8 /source/fitz/bidi-std.c
parent81ce5000573a1e2b2fbb0b82789452931566af98 (diff)
downloadmupdf-8876142a36c76d242f5a1a73dd66fa0430847ddd.tar.xz
First import of bidi code.
Diffstat (limited to 'source/fitz/bidi-std.c')
-rw-r--r--source/fitz/bidi-std.c1185
1 files changed, 1185 insertions, 0 deletions
diff --git a/source/fitz/bidi-std.c b/source/fitz/bidi-std.c
new file mode 100644
index 00000000..e8282b96
--- /dev/null
+++ b/source/fitz/bidi-std.c
@@ -0,0 +1,1185 @@
+// Extracted from Bidi.cpp - version 26
+
+// Reference implementation for Unicode Bidirectional Algorithm
+
+// Bidi include file
+#include "mupdf/fitz.h"
+#include "bidi-impl.h"
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+/*------------------------------------------------------------------------
+ File: Bidi.Cpp
+
+ Description
+ -----------
+
+ Sample Implementation of the Unicode Bidirectional Algorithm as it
+ was revised by Revision 5 of the Uniode Technical Report # 9
+ (1999-8-17)
+
+ Verified for changes to the algorithm up through Unicode 5.2.0 (2009).
+
+ This implementation is organized into several passes, each implemen-
+ ting one or more of the rules of the Unicode Bidi Algorithm. The
+ resolution of Weak Types and of Neutrals each use a state table
+ approach.
+
+ Both a printf based interface and a Windows DlgProc are provided for
+ interactive testing.
+
+ A stress harness comparing this implementation (v24) to a Java based
+ implementation was used by Doug Felt to verify that the two
+ implementations produce identical results for all strings up to six
+ bidi classes and stochastic strings up to length 20.
+
+ Version 26 was verified by the author against the Unicode 5.2.0
+ file BidiTest.txt, which provides an exhaustive text of strings of
+ length 4 or less, but covers some important cases where the language
+ in UAX#9 had been clarified.
+
+ To see this code running in an actual Windows program,
+ download the free Unibook uitlity from http://unicode.org/unibook
+ The bidi demo is executed from the tools menu. It is build from
+ this source file.
+
+ Build Notes
+ -----------
+
+ To compile the sample implementation please set the #define
+ directives above so the correct headers get included. Not all the
+ files are needed for all purposes. For the commandline version
+ only bidi.h and bidi.cpp are needed.
+
+ The Win32 version is provided as a dialog procedure. To use as a
+ standalone program compile with the the lbmain.cpp file. If all you
+ need is the ability to run the code "as is", you can instead download
+ the unibook utility from http://uincode.org/unibook/ which contains
+ the bidi demo compiled from this source file.
+
+ This code uses an extension to C++ that gives variables declared in
+ a for() statement function the same scope as the for() statement.
+ If your compiler does not support this extension, you may need to
+ move the declaration, e.g. int ich = 0; in front of the for statement.
+
+ Implementation Note
+ -------------------
+
+ NOTE: The Unicode Bidirectional Algorithm removes all explicit
+ formatting codes in rule X9, but states that this can be
+ simulated by conformant implementations. This implementation
+ attempts to demonstrate such a simulation
+
+ To demonstrate this, the current implementation does the
+ following:
+
+ in resolveExplicit()
+ - change LRE, LRO, RLE, RLO, PDF to BN
+ - assign nested levels to BN
+
+ in resolveWeak and resolveNeutrals
+ - assign L and R to BN's where they exist in place of
+ sor and eor by changing the last BN in front of a
+ level change to a strong type
+ - skip over BN's for the purpose of determining actions
+ - include BN in the count of deferred runs
+ which will resolve some of them to EN, AN and N
+
+ in resolveWhiteSpace
+ - set the level of any surviving BN to the base level,
+ or the level of the preceding character
+ - include LRE,LRO, RLE, RLO, PDF and BN in the count
+ whitespace to be reset
+
+ This will result in the same order for non-BN characters as
+ if the BN characters had been removed.
+
+ The clean() function can be used to remove boundary marks for
+ verification purposes.
+
+ Notation
+ --------
+ Pointer variables generally start with the letter p
+ Counter variables generally start with the letter c
+ Index variables generally start with the letter i
+ Boolean variables generally start with the letter f
+
+ The enumerated bidirectional types have the same name as in the
+ description for the Unicode Bidirectional Algorithm
+
+
+ Using this code outside a demo context
+ --------------------------------------
+
+ The way the functions are broken down in this demo code is based
+ on the needs of the demo to show the evolution in internal state
+ as the algorithm proceeds. This obscures how the algorithm would
+ be used in practice. These are the steps:
+
+ 1. Allocate a pair of arrays large enough to hold bidi class
+ and calculated levels (one for each input character)
+
+ 2. Provide your own function to assign directional types (bidi
+ classes) corresponding to each character in the input,
+ conflating ON, WS, S to N. Unlike the classify function in this
+ demo, the input would be actual Unicode characters.
+
+ 3. Process the first paragraph by calling BidiParagraph. That
+ function changes B into BN and returns a length including the
+ paragraph separator. (The iteration over multiple paragraphs
+ is left as excercise for the reader).
+
+ 4. Assign directional types again, but now assign specific types
+ to whitespace characters.
+
+ 5. Instead of reordering the input in place it is often desirable
+ to calculate an array of offsets indicating the reordering.
+ To that end, allocate such an array here and use it instead
+ of the input array in the next step.
+
+ 6. Resolve and reorder the lines by calling BidiLines. That
+ function 'breaks' lines on LS characters. Provide an optional
+ array of flags indicating the location of other line breaks as
+ needed.
+
+
+ Update History
+ --------------
+ Version 24 is the initial published and verified version of this
+ reference implementation. Version 25 and its updates fix various
+ minor issues with the scaffolding used for demonstrating the
+ algorithm using pseudo-alphabets from the command line or dialog
+ box. No changes to the implementation of the actual bidi algrithm
+ are made in any of the minor updates to version 25. Version 26
+ also makes no change to the actual algorithm but was verified
+ against the official BidiTest.txt file for Unicode 5.2.0.
+
+ - updated pseudo-alphabet
+
+ - Last Revised 12-10-99 (25)
+
+ - enable demo mode for release builds - no other changes
+
+ - Last Revised 12-10-00 (25a)
+
+ - fix regression in pseudo alphabet use for Windows UI
+
+ - Last Revised 02-01-01 (25b)
+
+ - fixed a few comments, renamed a variable
+
+ - Last Revised 03-04-01 (25c)
+
+ - make base level settable, enable mirror by default,
+ fix dialog size
+
+ - Last Revised 06-02-01 (25e)
+
+ - fixed some comments
+
+ - Last Revised 09-29-01 (25f)
+
+ - fixed classification for LS,RLM,LRM in pseudo alphabet,
+ focus issues in UI, regression fix to commandline from 25(e)
+ fix DEMO switch
+
+ - Last Revised 11-07-01 (25g)
+
+ - fixed classification for plus/minus in pseudo alphabet
+ to track changes made in Unicode 4.0.1
+
+ - Last Revised 12-03-04 (25h)
+
+ - now compiles as dialog-only program for WINDOWS_UI==1
+ using new bidimain.cpp
+
+ - Last Revised 12-02-07 (25i)
+
+ - cleaned up whitespace and indenting in the source,
+ fixed two comments (table headers)
+
+ - Last Revised 15-03-07 (25j)
+
+ - named enumerations
+
+ - Last Revised 30-05-07 (25k)
+
+ - added usage notes, minor edits to comments, indentation, etc
+ throughout. Added the bidiParagraph function. Checked against
+ changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No
+ changes needed to this implementation to match the values in
+ the BidiTest.txt file in the Unicode Character Database.
+ Minor fixes to dialog/windows proc, updated preprocessor directives.
+
+ - Last Revised 03-08-09 (26)
+
+ Credits:
+ -------
+ Written by: Asmus Freytag
+ Command line interface by: Rick McGowan
+ Verification (v24): Doug Felt
+
+ Disclaimer and legal rights:
+ ---------------------------
+ Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved.
+ Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+
+ THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+ BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+ OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.
+
+ The file bid.rc is included in the software covered by the above.
+------------------------------------------------------------------------*/
+
+
+/* === HELPER FUNCTIONS AND DECLARATIONS ================================= */
+
+#define odd(x) ((x) & 1)
+
+/*----------------------------------------------------------------------
+ The following array maps character codes to types for the purpose of
+ this sample implementation. The legend string gives a human readable
+ explanation of the pseudo alphabet.
+
+ For simplicity, characters entered by buttons are given a 1:1 mapping
+ between their type and pseudo character value. Pseudo characters that
+ can be typed from the keyboard are explained in the legend string.
+
+ Use the Unicode Character Database for the real values in real use.
+ ---------------------------------------------------------------------*/
+
+const int chLS = 0x15;
+
+int TypesFromChar[] =
+{
+
+// 0 1 2 3 4 5 6 7 8 9 a b c d e f
+BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_S, BDI_B, BDI_S, BDI_WS, BDI_B, BDI_BN, BDI_BN, /*00-0f*/
+BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B, BDI_B, BDI_B, BDI_S, /*10-1f*/
+BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/
+BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_EN, BDI_AN, BDI_AN, BDI_AN, BDI_AN, BDI_CS, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, /*30-3f*/
+BDI_ON, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_AL, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, /*40-4f*/
+BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_LRE,BDI_ON, BDI_RLE,BDI_PDF,BDI_S, /*50-5f*/
+BDI_NSM,BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, /*60-6f*/
+BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_LRO,BDI_B, BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/
+};
+
+
+/***************************************
+ Reverse, human readable reference:
+
+ LRM: 0x4
+ RLM: 0x5
+ L: 0x16,a-z
+ LRE: 0x11,[
+ LRO: 0x10,{
+ R: 0x17,G-Z
+ AL: A-F
+ RLE: 0x14,]
+ RLO: 0x13,}
+ PDF: 0x12,^
+ EN: 0-5
+ ES: /,+,[hyphen]
+ ET: #,$,%
+ AN: 6-9
+ CS: [comma],.,:
+ NSM: `
+ BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~
+ B: 0xa,0xd,0x1c-0x1e,|
+ S: 0x9,0xb,0x1f,_
+ WS: 0xc,0x15,[space]
+ ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f
+****************************************/
+
+// WS, LS and S are not explicitly needed except for L1. Therefore this
+// table conflates ON, S, WS, and LS to N, all others unchanged
+int NTypes[] = {
+ BDI_N, // ON,
+ BDI_L, // L,
+ BDI_R, // R,
+ BDI_AN, // AN,
+ BDI_EN, // EN,
+ BDI_AL, // AL
+ BDI_NSM, // NSM
+ BDI_CS, // CS
+ BDI_ES, // ES
+ BDI_ET, // ET
+ BDI_BN, // BN
+ BDI_N, // S
+ BDI_N, // WS
+ BDI_B, // B
+ BDI_RLO, // RLO
+ BDI_RLE, // RLE
+ BDI_LRO, // LRO
+ BDI_LRE, // LRE
+ BDI_PDF, // PDF
+ BDI_ON, // LS
+};
+
+// === HELPER FUNCTIONS ================================================
+
+// reverse cch characters
+static
+void reverse(uint16_t * psz, int cch)
+{
+ uint16_t chTemp;
+ int ich;
+
+ for (ich = 0; ich < --cch; ich++)
+ {
+ chTemp = psz[ich];
+ psz[ich] = psz[cch];
+ psz[cch] = chTemp;
+ }
+}
+
+// Set a run of cval values at locations all prior to, but not including
+// iStart, to the new value nval.
+static
+void SetDeferredRun(int *pval, int cval, int iStart, int nval)
+{
+ int i;
+
+ for (i = iStart - 1; i >= iStart - cval; i--)
+ {
+ pval[i] = nval;
+ }
+}
+
+// === ASSIGNING BIDI CLASSES ============================================
+
+// === THE PARAGRAPH LEVEL ===============================================
+
+/*------------------------------------------------------------------------
+ Function: resolveParagraphs
+
+ Resolves the input strings into blocks over which the algorithm
+ is then applied.
+
+ Implements Rule P1 of the Unicode Bidi Algorithm
+
+ Input: Text string
+ Character count
+
+ Output: revised character count
+
+ Note: This is a very simplistic function. In effect it restricts
+ the action of the algorithm to the first paragraph in the input
+ where a paragraph ends at the end of the first block separator
+ or at the end of the input text.
+
+------------------------------------------------------------------------*/
+
+static
+int resolveParagraphs(int * types, int cch)
+{
+ int ich;
+
+ // skip characters not of type B
+ for(ich = 0; ich < cch && types[ich] != BDI_B; ich++)
+ ;
+ // stop after first B, make it a BN for use in the next steps
+ if (ich < cch && types[ich] == BDI_B)
+ types[ich++] = BDI_BN;
+ return ich;
+}
+
+/*------------------------------------------------------------------------
+ Function: baseLevel
+
+ Determines the base level
+
+ Implements rule P2 of the Unicode Bidi Algorithm.
+
+ Input: Array of directional classes
+ Character count
+
+ Note: Ignores explicit embeddings
+------------------------------------------------------------------------*/
+static
+int baseLevel(const int * pcls, int cch)
+{
+ int ich;
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ switch (pcls[ich])
+ {
+ // strong left
+ case BDI_L:
+ return 0;
+
+ // strong right
+ case BDI_R:
+ case BDI_AL:
+ return 1;
+ }
+ }
+ return 0;
+}
+
+//====== RESOLVE EXPLICIT ================================================
+
+static
+int GreaterEven(int i)
+{
+ return odd(i) ? i + 1 : i + 2;
+}
+
+static
+int GreaterOdd(int i)
+{
+ return odd(i) ? i + 2 : i + 1;
+}
+
+static
+int EmbeddingDirection(int level)
+{
+ return odd(level) ? BDI_R : BDI_L;
+}
+
+
+/*------------------------------------------------------------------------
+ Function: resolveExplicit
+
+ Recursively resolves explicit embedding levels and overrides.
+ Implements rules X1-X9, of the Unicode Bidirectional Algorithm.
+
+ Input: Base embedding level and direction
+ Character count
+
+ Output: Array of embedding levels
+ Caller must allocate (one level per input character)
+
+ In/Out: Array of direction classes
+
+
+ Note: The function uses two simple counters to keep track of
+ matching explicit codes and PDF. Use the default argument for
+ the outermost call. The nesting counter counts the recursion
+ depth and not the embedding level.
+------------------------------------------------------------------------*/
+const int MAX_LEVEL = 61; // the real value
+
+int Bidi_resolveExplicit(int level, int dir, int * pcls, int * plevel, int cch,
+ int nNest)
+{
+ int ich;
+
+ // always called with a valid nesting level
+ // nesting levels are != embedding levels
+ int nLastValid = nNest;
+
+ // check input values
+ assert(nNest >= 0 && level >= 0 && level <= MAX_LEVEL);
+
+ // process the text
+ for (ich = 0; ich < cch; ich++)
+ {
+ int cls = pcls[ich];
+ switch (cls)
+ {
+ case BDI_LRO:
+ case BDI_LRE:
+ nNest++;
+ if (GreaterEven(level) <= MAX_LEVEL)
+ {
+ plevel[ich] = GreaterEven(level);
+ pcls[ich] = BDI_BN;
+ ich += Bidi_resolveExplicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L),
+ &pcls[ich+1], &plevel[ich+1],
+ cch - (ich+1), nNest);
+ nNest--;
+ continue;
+ }
+ cls = pcls[ich] = BDI_BN;
+ break;
+
+ case BDI_RLO:
+ case BDI_RLE:
+ nNest++;
+ if (GreaterOdd(level) <= MAX_LEVEL)
+ {
+ plevel[ich] = GreaterOdd(level);
+ pcls[ich] = BDI_BN;
+ ich += Bidi_resolveExplicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R),
+ &pcls[ich+1], &plevel[ich+1],
+ cch - (ich+1), nNest);
+ nNest--;
+ continue;
+ }
+ cls = pcls[ich] = BDI_BN;
+ break;
+
+ case BDI_PDF:
+ cls = pcls[ich] = BDI_BN;
+ if (nNest)
+ {
+ if (nLastValid < nNest)
+ {
+ nNest--;
+ }
+ else
+ {
+ cch = ich; // break the loop, but complete body
+ }
+ }
+ break;
+ }
+
+ // Apply the override
+ if (dir != BDI_N)
+ {
+ cls = dir;
+ }
+ plevel[ich] = level;
+ if (pcls[ich] != BDI_BN)
+ pcls[ich] = cls;
+ }
+
+ return ich;
+}
+
+// === RESOLVE WEAK TYPES ================================================
+
+enum bidi_state // possible states
+{
+ xa, // arabic letter
+ xr, // right leter
+ xl, // left letter
+
+ ao, // arabic lett. foll by ON
+ ro, // right lett. foll by ON
+ lo, // left lett. foll by ON
+
+ rt, // ET following R
+ lt, // ET following L
+
+ cn, // EN, AN following AL
+ ra, // arabic number foll R
+ re, // european number foll R
+ la, // arabic number foll L
+ le, // european number foll L
+
+ ac, // CS following cn
+ rc, // CS following ra
+ rs, // CS,ES following re
+ lc, // CS following la
+ ls, // CS,ES following le
+
+ ret, // ET following re
+ let // ET following le
+} ;
+
+int stateWeak[][10] =
+{
+ // N, L, R, AN, EN, AL,NSM, CS, ES, ET,
+/*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */
+/*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */
+/*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */
+
+/*ao*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/
+/*ro*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */
+/*lo*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON */
+
+/*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */
+/*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */
+
+/*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */
+/*ra*/ { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R */
+/*re*/ { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */
+/*la*/ { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L */
+/*le*/ { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */
+
+/*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */
+/*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */
+/*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */
+/*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */
+/*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */
+
+/*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */
+/*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */
+
+
+};
+
+enum bidi_action // possible actions
+{
+ // primitives
+ IX = 0x100, // increment
+ XX = 0xF, // no-op
+
+ // actions
+ xxx = (XX << 4) + XX, // no-op
+ xIx = IX + xxx, // increment run
+ xxN = (XX << 4) + BDI_ON, // set current to N
+ xxE = (XX << 4) + BDI_EN, // set current to EN
+ xxA = (XX << 4) + BDI_AN, // set current to AN
+ xxR = (XX << 4) + BDI_R, // set current to R
+ xxL = (XX << 4) + BDI_L, // set current to L
+ Nxx = (BDI_ON << 4) + 0xF, // set run to neutral
+ Axx = (BDI_AN << 4) + 0xF, // set run to AN
+ ExE = (BDI_EN << 4) + BDI_EN,// set run to EN, set current to EN
+ NIx = (BDI_ON << 4) + 0xF + IX,// set run to N, increment
+ NxN = (BDI_ON << 4) + BDI_ON,// set run to N, set current to N
+ NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R
+ NxE = (BDI_ON << 4) + BDI_EN,// set run to N, set current to EN
+
+ AxA = (BDI_AN << 4) + BDI_AN,// set run to AN, set current to AN
+ NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L
+ LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L
+};
+
+
+int actionWeak[][10] =
+{
+ // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET,
+/*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */
+/*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right leter */
+/*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */
+
+/*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */
+/*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */
+/*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */
+
+/*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */
+/*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */
+
+/*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */
+/*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */
+/*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */
+/*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */
+/*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */
+
+/*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */
+/*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */
+/*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */
+/*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */
+/*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */
+
+/*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */
+/*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */
+};
+
+static
+int GetDeferredType(int action)
+{
+ return (action >> 4) & 0xF;
+}
+
+static
+int GetResolvedType(int action)
+{
+ return action & 0xF;
+}
+
+/* Note on action table:
+
+ States can be of two kinds:
+ - Immediate Resolution State, where each input token
+ is resolved as soon as it is seen. These states havve
+ only single action codes (xxN) or the no-op (xxx)
+ for static input tokens.
+ - Deferred Resolution State, where input tokens either
+ either extend the run (xIx) or resolve its Type (e.g. Nxx).
+
+ Input classes are of three kinds
+ - Static Input Token, where the class of the token remains
+ unchanged on output (AN, L, N, R)
+ - Replaced Input Token, where the class of the token is
+ always replaced on output (AL, BDI_BN, NSM, CS, ES, ET)
+ - Conditional Input Token, where the class of the token is
+ changed on output in some but not all cases (EN)
+
+ Where tokens are subject to change, a double action
+ (e.g. NxA, or NxN) is _required_ after deferred states,
+ resolving both the deferred state and changing the current token.
+
+ These properties of the table are verified by assertions below.
+ This code is needed only during debugging and maintenance
+*/
+
+/*------------------------------------------------------------------------
+ Function: resolveWeak
+
+ Resolves the directionality of numeric and other weak character types
+
+ Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm.
+
+ Input: Array of embedding levels
+ Character count
+
+ In/Out: Array of directional classes
+
+ Note: On input only these directional classes are expected
+ AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS,
+------------------------------------------------------------------------*/
+void Bidi_resolveWeak(fz_context *ctx, int baselevel, int *pcls, int *plevel, int cch)
+{
+ int state = odd(baselevel) ? xr : xl;
+ int cls;
+ int ich;
+ int action;
+ int clsRun;
+ int clsNew;
+
+ int level = baselevel;
+
+ int cchRun = 0;
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ if (pcls[ich] > BDI_BN) {
+ fz_warn(ctx, "error: pcls[%d] > BN (%d)\n", ich, pcls[ich]);
+ }
+
+ // ignore boundary neutrals
+ if (pcls[ich] == BDI_BN)
+ {
+ // must flatten levels unless at a level change;
+ plevel[ich] = level;
+
+ // lookahead for level changes
+ if (ich + 1 == cch && level != baselevel)
+ {
+ // have to fixup last BN before end of the loop, since
+ // its fix-upped value will be needed below the assert
+ pcls[ich] = EmbeddingDirection(level);
+ }
+ else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN)
+ {
+ // fixup LAST BN in front / after a level run to make
+ // it act like the SOR/EOR in rule X10
+ int newlevel = plevel[ich+1];
+ if (level > newlevel) {
+ newlevel = level;
+ }
+ plevel[ich] = newlevel;
+
+ // must match assigned level
+ pcls[ich] = EmbeddingDirection(newlevel);
+ level = plevel[ich+1];
+ }
+ else
+ {
+ // don't interrupt runs
+ if (cchRun)
+ {
+ cchRun++;
+ }
+ continue;
+ }
+ }
+
+ assert(pcls[ich] <= BDI_BN);
+ cls = pcls[ich];
+
+ action = actionWeak[state][cls];
+
+ // resolve the directionality for deferred runs
+ clsRun = GetDeferredType(action);
+ if (clsRun != XX)
+ {
+ SetDeferredRun(pcls, cchRun, ich, clsRun);
+ cchRun = 0;
+ }
+
+ // resolve the directionality class at the current location
+ clsNew = GetResolvedType(action);
+ if (clsNew != XX)
+ pcls[ich] = clsNew;
+
+ // increment a deferred run
+ if (IX & action)
+ cchRun++;
+
+ state = stateWeak[state][cls];
+ }
+
+ // resolve any deferred runs
+ // use the direction of the current level to emulate PDF
+ cls = EmbeddingDirection(level);
+
+ // resolve the directionality for deferred runs
+ clsRun = GetDeferredType(actionWeak[state][cls]);
+ if (clsRun != XX)
+ SetDeferredRun(pcls, cchRun, ich, clsRun);
+}
+
+// === RESOLVE NEUTAL TYPES ==============================================
+
+// action values
+enum neutral_action
+{
+ // action to resolve previous input
+ nL = BDI_L, // resolve EN to L
+ En = 3 << 4, // resolve neutrals run to embedding level direction
+ Rn = BDI_R << 4, // resolve neutrals run to strong right
+ Ln = BDI_L << 4, // resolved neutrals run to strong left
+ In = (1<<8), // increment count of deferred neutrals
+ LnL = (1<<4)+BDI_L // set run and EN to L
+};
+
+static
+int GetDeferredNeutrals(int action, int level)
+{
+ action = (action >> 4) & 0xF;
+ if (action == (En >> 4))
+ return EmbeddingDirection(level);
+ else
+ return action;
+}
+
+static
+int GetResolvedNeutrals(int action)
+{
+ action = action & 0xF;
+ if (action == In)
+ return 0;
+ else
+ return action;
+}
+
+// state values
+enum neutral_state
+{
+ // new temporary class
+ r, // R and characters resolved to R
+ l, // L and characters resolved to L
+ rn, // N preceded by right
+ ln, // N preceded by left
+ a, // AN preceded by left (the abbrev 'la' is used up above)
+ na // N preceeded by a
+} ;
+
+
+/*------------------------------------------------------------------------
+ Notes:
+
+ By rule W7, whenever a EN is 'dominated' by an L (including start of
+ run with embedding direction = L) it is resolved to, and further treated
+ as L.
+
+ This leads to the need for 'a' and 'na' states.
+------------------------------------------------------------------------*/
+
+int actionNeutrals[][5] =
+{
+// N, L, R, AN, EN, = cls
+ // state =
+ {In, 0, 0, 0, 0}, // r right
+ {In, 0, 0, 0, BDI_L},// l left
+
+ {In, En, Rn, Rn, Rn}, // rn N preceded by right
+ {In, Ln, En, En, LnL}, // ln N preceded by left
+
+ {In, 0, 0, 0, BDI_L},// a AN preceded by left
+ {In, En, Rn, Rn, En} // na N preceded by a
+} ;
+
+int stateNeutrals[][5] =
+{
+// N, L, R, AN, EN = cls
+ // state =
+ {rn, l, r, r, r}, // r right
+ {ln, l, r, a, l}, // l left
+
+ {rn, l, r, r, r}, // rn N preceded by right
+ {ln, l, r, a, l}, // ln N preceded by left
+
+ {na, l, r, a, l}, // a AN preceded by left
+ {na, l, r, a, l} // na N preceded by la
+} ;
+
+/*------------------------------------------------------------------------
+ Function: resolveNeutrals
+
+ Resolves the directionality of neutral character types.
+
+ Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm.
+
+ Input: Array of embedding levels
+ Character count
+ Baselevel
+
+ In/Out: Array of directional classes
+
+ Note: On input only these directional classes are expected
+ R, L, N, AN, EN and BN
+
+ W8 resolves a number of ENs to L
+------------------------------------------------------------------------*/
+void Bidi_resolveNeutrals(int baselevel, int *pcls, const int *plevel, int cch)
+{
+ // the state at the start of text depends on the base level
+ int state = odd(baselevel) ? r : l;
+ int cls;
+ int ich;
+ int clsRun;
+
+ int cchRun = 0;
+ int level = baselevel;
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ int action;
+ int clsNew;
+
+ // ignore boundary neutrals
+ if (pcls[ich] == BDI_BN)
+ {
+ // include in the count for a deferred run
+ if (cchRun)
+ cchRun++;
+
+ // skip any further processing
+ continue;
+ }
+
+ assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed"
+ cls = pcls[ich];
+
+ action = actionNeutrals[state][cls];
+
+ // resolve the directionality for deferred runs
+ clsRun = GetDeferredNeutrals(action, level);
+ if (clsRun != BDI_N)
+ {
+ SetDeferredRun(pcls, cchRun, ich, clsRun);
+ cchRun = 0;
+ }
+
+ // resolve the directionality class at the current location
+ clsNew = GetResolvedNeutrals(action);
+ if (clsNew != BDI_N)
+ pcls[ich] = clsNew;
+
+ if (In & action)
+ cchRun++;
+
+ state = stateNeutrals[state][cls];
+ level = plevel[ich];
+ }
+
+ // resolve any deferred runs
+ cls = EmbeddingDirection(level); // eor has type of current level
+
+ // resolve the directionality for deferred runs
+ clsRun = GetDeferredNeutrals(actionNeutrals[state][cls], level);
+ if (clsRun != BDI_N)
+ SetDeferredRun(pcls, cchRun, ich, clsRun);
+}
+
+// === RESOLVE IMPLLICIT =================================================
+
+/*------------------------------------------------------------------------
+ Function: resolveImplicit
+
+ Recursively resolves implicit embedding levels.
+ Implements rules I1 and I2 of the Unicode Bidirectional Algorithm.
+
+ Input: Array of direction classes
+ Character count
+ Base level
+
+ In/Out: Array of embedding levels
+
+ Note: levels may exceed 15 on output.
+ Accepted subset of direction classes
+ R, L, AN, EN
+------------------------------------------------------------------------*/
+int addLevel[][4] =
+{
+ // L, R, AN, EN = cls
+ // level =
+/* even */ { 0, 1, 2, 2 }, // EVEN
+/* odd */ { 1, 0, 1, 1 } // ODD
+
+};
+
+void Bidi_resolveImplicit(const int * pcls, int * plevel, int cch)
+{
+ int ich;
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ // cannot resolve bn here, since some bn were resolved to strong
+ // types in resolveWeak. To remove these we need the original
+ // types, which are available again in resolveWhiteSpace
+ if (pcls[ich] == BDI_BN)
+ {
+ continue;
+ }
+ assert(pcls[ich] > 0); // "No Neutrals allowed to survive here."
+ assert(pcls[ich] < 5); // "Out of range."
+ plevel[ich] += addLevel[odd(plevel[ich])][pcls[ich] - 1];
+ }
+}
+
+// === REORDER ===========================================================
+/*------------------------------------------------------------------------
+ Function: resolveLines
+
+ Breaks a paragraph into lines
+
+ Input: Character count
+ Array of line break flags
+ In/Out: Array of characters
+
+ Returns the count of characters on the first line
+
+ Note: This function only breaks lines at hard line breaks. Other
+ line breaks can be passed in. If pbrk[n] is true, then a break
+ occurs after the character in pszInput[n]. Breaks before the first
+ character are not allowed.
+------------------------------------------------------------------------*/
+static
+int resolveLines(uint16_t * pszInput, int * pbrk, int cch)
+{
+ int ich;
+
+ // skip characters not of type LS
+ for(ich = 0; ich < cch; ich++)
+ {
+ if (pszInput[ich] == chLS || (pbrk && pbrk[ich]))
+ {
+ ich++;
+ break;
+ }
+ }
+
+ return ich;
+}
+
+/*------------------------------------------------------------------------
+ Function: resolveWhiteSpace
+
+ Resolves levels for WS and S
+ Implements rule L1 of the Unicode bidi Algorithm.
+
+ Input: Base embedding level
+ Character count
+ Array of direction classes (for one line of text)
+
+ In/Out: Array of embedding levels (for one line of text)
+
+ Note: this should be applied a line at a time. The default driver
+ code supplied in this file assumes a single line of text; for
+ a real implementation, cch and the initial pointer values
+ would have to be adjusted.
+------------------------------------------------------------------------*/
+void Bidi_resolveWhitespace(int baselevel, const int *pcls, int *plevel,
+ int cch)
+{
+ int cchrun = 0;
+ int oldlevel = baselevel;
+ int ich;
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ switch(pcls[ich])
+ {
+ default:
+ cchrun = 0; // any other character breaks the run
+ break;
+ case BDI_WS:
+ cchrun++;
+ break;
+
+ case BDI_RLE:
+ case BDI_LRE:
+ case BDI_LRO:
+ case BDI_RLO:
+ case BDI_PDF:
+ case BDI_BN:
+ plevel[ich] = oldlevel;
+ cchrun++;
+ break;
+
+ case BDI_S:
+ case BDI_B:
+ // reset levels for WS before eot
+ SetDeferredRun(plevel, cchrun, ich, baselevel);
+ cchrun = 0;
+ plevel[ich] = baselevel;
+ break;
+ }
+ oldlevel = plevel[ich];
+ }
+ // reset level before eot
+ SetDeferredRun(plevel, cchrun, ich, baselevel);
+}
+
+#ifdef BIDI_LINE_AT_A_TIME
+/*------------------------------------------------------------------------
+ Functions: reorder/reorderLevel
+
+ Recursively reorders the display string
+ "From the highest level down, reverse all characters at that level and
+ higher, down to the lowest odd level"
+
+ Implements rule L2 of the Unicode bidi Algorithm.
+
+ Input: Array of embedding levels
+ Character count
+ Flag enabling reversal (set to false by initial caller)
+
+ In/Out: Text to reorder
+
+ Note: levels may exceed 15 resp. 61 on input.
+
+ Rule L3 - reorder combining marks is not implemented here
+ Rule L4 - glyph mirroring is implemented as a display option below
+
+ Note: this should be applied a line at a time
+-------------------------------------------------------------------------*/
+static int reorderLevel(int level, uint16_t * pszText, const int * plevel, int cch,
+ int fReverse)
+{
+ int ich;
+
+ // true as soon as first odd level encountered
+ fReverse = fReverse || odd(level);
+
+ for (ich = 0; ich < cch; ich++)
+ {
+ if (plevel[ich] < level)
+ {
+ break;
+ }
+ else if (plevel[ich] > level)
+ {
+ ich += reorderLevel(level + 1, pszText + ich, plevel + ich,
+ cch - ich, fReverse) - 1;
+ }
+ }
+ if (fReverse)
+ {
+ reverse(pszText, ich);
+ }
+ return ich;
+}
+
+
+int Bidi_reorder(int baselevel, uint16_t * pszText, const int * plevel, int cch)
+{
+ int ich = 0;
+
+ while (ich < cch)
+ {
+ ich += reorderLevel(baselevel, pszText + ich, plevel + ich,
+ cch - ich, FALSE);
+ }
+ return ich;
+}
+#endif