From 201388486324758d08ab3b1be4cc0a84b8b1ae9a Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Tue, 19 Jan 2016 18:00:49 +0000 Subject: Tidy bidirectional source. Make the import follow mupdf style (better, if not perfect). Use ucdn where possible to avoid duplicating tables. Shrink the types, make them explicit (e.g. use fz_bidi_level rather than int) and make tables const. Use 32-bit integers for text. --- source/fitz/bidi-std.c | 1764 ++++++++++++++++++++++++------------------------ 1 file changed, 871 insertions(+), 893 deletions(-) (limited to 'source/fitz/bidi-std.c') diff --git a/source/fitz/bidi-std.c b/source/fitz/bidi-std.c index e8282b96..5d74d361 100644 --- a/source/fitz/bidi-std.c +++ b/source/fitz/bidi-std.c @@ -14,256 +14,256 @@ #endif /*------------------------------------------------------------------------ - File: Bidi.Cpp + File: Bidi.Cpp + + Description + ----------- + + Sample Implementation of the Unicode Bidirectional Algorithm as it + was revised by Revision 5 of the Uniode Technical Report # 9 + (1999-8-17) + + Verified for changes to the algorithm up through Unicode 5.2.0 (2009). + + This implementation is organized into several passes, each implemen- + ting one or more of the rules of the Unicode Bidi Algorithm. The + resolution of Weak Types and of Neutrals each use a state table + approach. + + Both a printf based interface and a Windows DlgProc are provided for + interactive testing. + + A stress harness comparing this implementation (v24) to a Java based + implementation was used by Doug Felt to verify that the two + implementations produce identical results for all strings up to six + bidi classes and stochastic strings up to length 20. + + Version 26 was verified by the author against the Unicode 5.2.0 + file BidiTest.txt, which provides an exhaustive text of strings of + length 4 or less, but covers some important cases where the language + in UAX#9 had been clarified. + + To see this code running in an actual Windows program, + download the free Unibook uitlity from http://unicode.org/unibook + The bidi demo is executed from the tools menu. It is build from + this source file. + + Build Notes + ----------- + + To compile the sample implementation please set the #define + directives above so the correct headers get included. Not all the + files are needed for all purposes. For the commandline version + only bidi.h and bidi.cpp are needed. + + The Win32 version is provided as a dialog procedure. To use as a + standalone program compile with the the lbmain.cpp file. If all you + need is the ability to run the code "as is", you can instead download + the unibook utility from http://uincode.org/unibook/ which contains + the bidi demo compiled from this source file. + + This code uses an extension to C++ that gives variables declared in + a for() statement function the same scope as the for() statement. + If your compiler does not support this extension, you may need to + move the declaration, e.g. int ich = 0; in front of the for statement. + + Implementation Note + ------------------- + + NOTE: The Unicode Bidirectional Algorithm removes all explicit + formatting codes in rule X9, but states that this can be + simulated by conformant implementations. This implementation + attempts to demonstrate such a simulation - Description - ----------- - - Sample Implementation of the Unicode Bidirectional Algorithm as it - was revised by Revision 5 of the Uniode Technical Report # 9 - (1999-8-17) - - Verified for changes to the algorithm up through Unicode 5.2.0 (2009). + To demonstrate this, the current implementation does the + following: - This implementation is organized into several passes, each implemen- - ting one or more of the rules of the Unicode Bidi Algorithm. The - resolution of Weak Types and of Neutrals each use a state table - approach. - - Both a printf based interface and a Windows DlgProc are provided for - interactive testing. - - A stress harness comparing this implementation (v24) to a Java based - implementation was used by Doug Felt to verify that the two - implementations produce identical results for all strings up to six - bidi classes and stochastic strings up to length 20. - - Version 26 was verified by the author against the Unicode 5.2.0 - file BidiTest.txt, which provides an exhaustive text of strings of - length 4 or less, but covers some important cases where the language - in UAX#9 had been clarified. - - To see this code running in an actual Windows program, - download the free Unibook uitlity from http://unicode.org/unibook - The bidi demo is executed from the tools menu. It is build from - this source file. - - Build Notes - ----------- - - To compile the sample implementation please set the #define - directives above so the correct headers get included. Not all the - files are needed for all purposes. For the commandline version - only bidi.h and bidi.cpp are needed. - - The Win32 version is provided as a dialog procedure. To use as a - standalone program compile with the the lbmain.cpp file. If all you - need is the ability to run the code "as is", you can instead download - the unibook utility from http://uincode.org/unibook/ which contains - the bidi demo compiled from this source file. - - This code uses an extension to C++ that gives variables declared in - a for() statement function the same scope as the for() statement. - If your compiler does not support this extension, you may need to - move the declaration, e.g. int ich = 0; in front of the for statement. - - Implementation Note - ------------------- - - NOTE: The Unicode Bidirectional Algorithm removes all explicit - formatting codes in rule X9, but states that this can be - simulated by conformant implementations. This implementation - attempts to demonstrate such a simulation + in resolveExplicit() + - change LRE, LRO, RLE, RLO, PDF to BN + - assign nested levels to BN - To demonstrate this, the current implementation does the - following: + in resolveWeak and resolveNeutrals + - assign L and R to BN's where they exist in place of + sor and eor by changing the last BN in front of a + level change to a strong type + - skip over BN's for the purpose of determining actions + - include BN in the count of deferred runs + which will resolve some of them to EN, AN and N - in resolveExplicit() - - change LRE, LRO, RLE, RLO, PDF to BN - - assign nested levels to BN + in resolveWhiteSpace + - set the level of any surviving BN to the base level, + or the level of the preceding character + - include LRE,LRO, RLE, RLO, PDF and BN in the count + whitespace to be reset - in resolveWeak and resolveNeutrals - - assign L and R to BN's where they exist in place of - sor and eor by changing the last BN in front of a - level change to a strong type - - skip over BN's for the purpose of determining actions - - include BN in the count of deferred runs - which will resolve some of them to EN, AN and N + This will result in the same order for non-BN characters as + if the BN characters had been removed. - in resolveWhiteSpace - - set the level of any surviving BN to the base level, - or the level of the preceding character - - include LRE,LRO, RLE, RLO, PDF and BN in the count - whitespace to be reset + The clean() function can be used to remove boundary marks for + verification purposes. - This will result in the same order for non-BN characters as - if the BN characters had been removed. + Notation + -------- + Pointer variables generally start with the letter p + Counter variables generally start with the letter c + Index variables generally start with the letter i + Boolean variables generally start with the letter f - The clean() function can be used to remove boundary marks for - verification purposes. + The enumerated bidirectional types have the same name as in the + description for the Unicode Bidirectional Algorithm - Notation - -------- - Pointer variables generally start with the letter p - Counter variables generally start with the letter c - Index variables generally start with the letter i - Boolean variables generally start with the letter f + Using this code outside a demo context + -------------------------------------- - The enumerated bidirectional types have the same name as in the - description for the Unicode Bidirectional Algorithm + The way the functions are broken down in this demo code is based + on the needs of the demo to show the evolution in internal state + as the algorithm proceeds. This obscures how the algorithm would + be used in practice. These are the steps: + 1. Allocate a pair of arrays large enough to hold bidi class + and calculated levels (one for each input character) - Using this code outside a demo context - -------------------------------------- + 2. Provide your own function to assign directional types (bidi + classes) corresponding to each character in the input, + conflating ON, WS, S to N. Unlike the classify function in this + demo, the input would be actual Unicode characters. - The way the functions are broken down in this demo code is based - on the needs of the demo to show the evolution in internal state - as the algorithm proceeds. This obscures how the algorithm would - be used in practice. These are the steps: + 3. Process the first paragraph by calling BidiParagraph. That + function changes B into BN and returns a length including the + paragraph separator. (The iteration over multiple paragraphs + is left as excercise for the reader). - 1. Allocate a pair of arrays large enough to hold bidi class - and calculated levels (one for each input character) + 4. Assign directional types again, but now assign specific types + to whitespace characters. - 2. Provide your own function to assign directional types (bidi - classes) corresponding to each character in the input, - conflating ON, WS, S to N. Unlike the classify function in this - demo, the input would be actual Unicode characters. + 5. Instead of reordering the input in place it is often desirable + to calculate an array of offsets indicating the reordering. + To that end, allocate such an array here and use it instead + of the input array in the next step. - 3. Process the first paragraph by calling BidiParagraph. That - function changes B into BN and returns a length including the - paragraph separator. (The iteration over multiple paragraphs - is left as excercise for the reader). - - 4. Assign directional types again, but now assign specific types - to whitespace characters. + 6. Resolve and reorder the lines by calling BidiLines. That + function 'breaks' lines on LS characters. Provide an optional + array of flags indicating the location of other line breaks as + needed. - 5. Instead of reordering the input in place it is often desirable - to calculate an array of offsets indicating the reordering. - To that end, allocate such an array here and use it instead - of the input array in the next step. - - 6. Resolve and reorder the lines by calling BidiLines. That - function 'breaks' lines on LS characters. Provide an optional - array of flags indicating the location of other line breaks as - needed. + Update History + -------------- + Version 24 is the initial published and verified version of this + reference implementation. Version 25 and its updates fix various + minor issues with the scaffolding used for demonstrating the + algorithm using pseudo-alphabets from the command line or dialog + box. No changes to the implementation of the actual bidi algrithm + are made in any of the minor updates to version 25. Version 26 + also makes no change to the actual algorithm but was verified + against the official BidiTest.txt file for Unicode 5.2.0. + - updated pseudo-alphabet - Update History - -------------- - Version 24 is the initial published and verified version of this - reference implementation. Version 25 and its updates fix various - minor issues with the scaffolding used for demonstrating the - algorithm using pseudo-alphabets from the command line or dialog - box. No changes to the implementation of the actual bidi algrithm - are made in any of the minor updates to version 25. Version 26 - also makes no change to the actual algorithm but was verified - against the official BidiTest.txt file for Unicode 5.2.0. + - Last Revised 12-10-99 (25) - - updated pseudo-alphabet + - enable demo mode for release builds - no other changes - - Last Revised 12-10-99 (25) + - Last Revised 12-10-00 (25a) - - enable demo mode for release builds - no other changes + - fix regression in pseudo alphabet use for Windows UI - - Last Revised 12-10-00 (25a) + - Last Revised 02-01-01 (25b) - - fix regression in pseudo alphabet use for Windows UI + - fixed a few comments, renamed a variable - - Last Revised 02-01-01 (25b) + - Last Revised 03-04-01 (25c) - - fixed a few comments, renamed a variable + - make base level settable, enable mirror by default, + fix dialog size - - Last Revised 03-04-01 (25c) + - Last Revised 06-02-01 (25e) - - make base level settable, enable mirror by default, - fix dialog size + - fixed some comments - - Last Revised 06-02-01 (25e) + - Last Revised 09-29-01 (25f) - - fixed some comments + - fixed classification for LS,RLM,LRM in pseudo alphabet, + focus issues in UI, regression fix to commandline from 25(e) + fix DEMO switch - - Last Revised 09-29-01 (25f) + - Last Revised 11-07-01 (25g) - - fixed classification for LS,RLM,LRM in pseudo alphabet, - focus issues in UI, regression fix to commandline from 25(e) - fix DEMO switch + - fixed classification for plus/minus in pseudo alphabet + to track changes made in Unicode 4.0.1 - - Last Revised 11-07-01 (25g) + - Last Revised 12-03-04 (25h) - - fixed classification for plus/minus in pseudo alphabet - to track changes made in Unicode 4.0.1 + - now compiles as dialog-only program for WINDOWS_UI==1 + using new bidimain.cpp - - Last Revised 12-03-04 (25h) + - Last Revised 12-02-07 (25i) - - now compiles as dialog-only program for WINDOWS_UI==1 - using new bidimain.cpp + - cleaned up whitespace and indenting in the source, + fixed two comments (table headers) - - Last Revised 12-02-07 (25i) + - Last Revised 15-03-07 (25j) - - cleaned up whitespace and indenting in the source, - fixed two comments (table headers) + - named enumerations - - Last Revised 15-03-07 (25j) + - Last Revised 30-05-07 (25k) - - named enumerations + - added usage notes, minor edits to comments, indentation, etc + throughout. Added the bidiParagraph function. Checked against + changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No + changes needed to this implementation to match the values in + the BidiTest.txt file in the Unicode Character Database. + Minor fixes to dialog/windows proc, updated preprocessor directives. - - Last Revised 30-05-07 (25k) + - Last Revised 03-08-09 (26) - - added usage notes, minor edits to comments, indentation, etc - throughout. Added the bidiParagraph function. Checked against - changes in the Unicode Bidi Algorithm for Unicode 5.2.0. No - changes needed to this implementation to match the values in - the BidiTest.txt file in the Unicode Character Database. - Minor fixes to dialog/windows proc, updated preprocessor directives. + Credits: + ------- + Written by: Asmus Freytag + Command line interface by: Rick McGowan + Verification (v24): Doug Felt - - Last Revised 03-08-09 (26) + Disclaimer and legal rights: + --------------------------- + Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved. + Distributed under the Terms of Use in http://www.unicode.org/copyright.html. - Credits: - ------- - Written by: Asmus Freytag - Command line interface by: Rick McGowan - Verification (v24): Doug Felt + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE + BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, + OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE. - Disclaimer and legal rights: - --------------------------- - Copyright (C) 1999-2009, ASMUS, Inc. All Rights Reserved. - Distributed under the Terms of Use in http://www.unicode.org/copyright.html. - - THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE - BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, - ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE. - - The file bid.rc is included in the software covered by the above. + The file bid.rc is included in the software covered by the above. ------------------------------------------------------------------------*/ - /* === HELPER FUNCTIONS AND DECLARATIONS ================================= */ #define odd(x) ((x) & 1) /*---------------------------------------------------------------------- - The following array maps character codes to types for the purpose of - this sample implementation. The legend string gives a human readable - explanation of the pseudo alphabet. + The following array maps character codes to types for the purpose of + this sample implementation. The legend string gives a human readable + explanation of the pseudo alphabet. - For simplicity, characters entered by buttons are given a 1:1 mapping - between their type and pseudo character value. Pseudo characters that - can be typed from the keyboard are explained in the legend string. + For simplicity, characters entered by buttons are given a 1:1 mapping + between their type and pseudo character value. Pseudo characters that + can be typed from the keyboard are explained in the legend string. - Use the Unicode Character Database for the real values in real use. + Use the Unicode Character Database for the real values in real use. ---------------------------------------------------------------------*/ -const int chLS = 0x15; - -int TypesFromChar[] = +enum { + chLS = 0x15 +}; -// 0 1 2 3 4 5 6 7 8 9 a b c d e f +#if 0 +static const fz_bidi_chartype types_from_char[] = +{ +// 0 1 2 3 4 5 6 7 8 9 a b c d e f BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_S, BDI_B, BDI_S, BDI_WS, BDI_B, BDI_BN, BDI_BN, /*00-0f*/ BDI_LRO,BDI_LRE,BDI_PDF,BDI_RLO,BDI_RLE,BDI_WS, BDI_L, BDI_R, BDI_BN, BDI_BN, BDI_BN, BDI_BN, BDI_B, BDI_B, BDI_B, BDI_S, /*10-1f*/ BDI_WS, BDI_ON, BDI_ON, BDI_ET, BDI_ET, BDI_ET, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ON, BDI_ES, BDI_CS, BDI_ES, BDI_CS, BDI_ES, /*20-2f*/ @@ -273,87 +273,75 @@ BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_R, BDI_NSM,BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, /*60-6f*/ BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_L, BDI_LRO,BDI_B, BDI_RLO,BDI_BN, BDI_ON, /*70-7f*/ }; - +#endif /*************************************** - Reverse, human readable reference: - - LRM: 0x4 - RLM: 0x5 - L: 0x16,a-z - LRE: 0x11,[ - LRO: 0x10,{ - R: 0x17,G-Z - AL: A-F - RLE: 0x14,] - RLO: 0x13,} - PDF: 0x12,^ - EN: 0-5 - ES: /,+,[hyphen] - ET: #,$,% - AN: 6-9 - CS: [comma],.,: - NSM: ` - BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~ - B: 0xa,0xd,0x1c-0x1e,| - S: 0x9,0xb,0x1f,_ - WS: 0xc,0x15,[space] - ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f + Reverse, human readable reference: + + LRM: 0x4 + RLM: 0x5 + L: 0x16,a-z + LRE: 0x11,[ + LRO: 0x10,{ + R: 0x17,G-Z + AL: A-F + RLE: 0x14,] + RLO: 0x13,} + PDF: 0x12,^ + EN: 0-5 + ES: /,+,[hyphen] + ET: #,$,% + AN: 6-9 + CS: [comma],.,: + NSM: ` + BN: 0x0-0x8,0xe,0xf,0x18-0x1b,~ + B: 0xa,0xd,0x1c-0x1e,| + S: 0x9,0xb,0x1f,_ + WS: 0xc,0x15,[space] + ON: !,",&,',(,),*,;,<,=,>,?,@,\,0x7f ****************************************/ -// WS, LS and S are not explicitly needed except for L1. Therefore this -// table conflates ON, S, WS, and LS to N, all others unchanged -int NTypes[] = { - BDI_N, // ON, - BDI_L, // L, - BDI_R, // R, - BDI_AN, // AN, - BDI_EN, // EN, - BDI_AL, // AL - BDI_NSM, // NSM - BDI_CS, // CS - BDI_ES, // ES - BDI_ET, // ET - BDI_BN, // BN - BDI_N, // S - BDI_N, // WS - BDI_B, // B - BDI_RLO, // RLO - BDI_RLE, // RLE - BDI_LRO, // LRO - BDI_LRE, // LRE - BDI_PDF, // PDF - BDI_ON, // LS -}; - // === HELPER FUNCTIONS ================================================ +#ifdef BIDI_LINE_AT_A_TIME // reverse cch characters static -void reverse(uint16_t * psz, int cch) +void reverse(uint32_t *psz, int cch) { - uint16_t chTemp; - int ich; - - for (ich = 0; ich < --cch; ich++) - { - chTemp = psz[ich]; - psz[ich] = psz[cch]; - psz[cch] = chTemp; - } + uint32_t ch_temp; + int ich; + + for (ich = 0; ich < --cch; ich++) + { + ch_temp = psz[ich]; + psz[ich] = psz[cch]; + psz[cch] = ch_temp; + } } +#endif // Set a run of cval values at locations all prior to, but not including // iStart, to the new value nval. static -void SetDeferredRun(int *pval, int cval, int iStart, int nval) +void set_deferred_run(fz_bidi_chartype *pval, int cval, int iStart, fz_bidi_chartype nval) +{ + int i; + + for (i = iStart - 1; i >= iStart - cval; i--) + { + pval[i] = nval; + } +} + +static +void set_deferred_level_run(fz_bidi_level *pval, int cval, int iStart, fz_bidi_level nval) { - int i; + int i; - for (i = iStart - 1; i >= iStart - cval; i--) - { - pval[i] = nval; - } + for (i = iStart - 1; i >= iStart - cval; i--) + { + pval[i] = nval; + } } // === ASSIGNING BIDI CLASSES ============================================ @@ -361,459 +349,451 @@ void SetDeferredRun(int *pval, int cval, int iStart, int nval) // === THE PARAGRAPH LEVEL =============================================== /*------------------------------------------------------------------------ - Function: resolveParagraphs + Function: resolve_paragraphs - Resolves the input strings into blocks over which the algorithm - is then applied. + Resolves the input strings into blocks over which the algorithm + is then applied. - Implements Rule P1 of the Unicode Bidi Algorithm + Implements Rule P1 of the Unicode Bidi Algorithm - Input: Text string - Character count + Input: Text string + Character count - Output: revised character count + Output: revised character count - Note: This is a very simplistic function. In effect it restricts - the action of the algorithm to the first paragraph in the input - where a paragraph ends at the end of the first block separator - or at the end of the input text. + Note: This is a very simplistic function. In effect it restricts + the action of the algorithm to the first paragraph in the input + where a paragraph ends at the end of the first block separator + or at the end of the input text. ------------------------------------------------------------------------*/ - -static -int resolveParagraphs(int * types, int cch) +#if 0 +static int resolve_paragraphs(fz_bidi_chartype *types, int cch) { - int ich; - - // skip characters not of type B - for(ich = 0; ich < cch && types[ich] != BDI_B; ich++) - ; - // stop after first B, make it a BN for use in the next steps - if (ich < cch && types[ich] == BDI_B) - types[ich++] = BDI_BN; - return ich; + int ich; + + // skip characters not of type B + for(ich = 0; ich < cch && types[ich] != BDI_B; ich++) + ; + // stop after first B, make it a BN for use in the next steps + if (ich < cch && types[ich] == BDI_B) + types[ich++] = BDI_BN; + return ich; } /*------------------------------------------------------------------------ - Function: baseLevel + Function: base_level - Determines the base level + Determines the base level - Implements rule P2 of the Unicode Bidi Algorithm. + Implements rule P2 of the Unicode Bidi Algorithm. - Input: Array of directional classes - Character count + Input: Array of directional classes + Character count - Note: Ignores explicit embeddings + Note: Ignores explicit embeddings ------------------------------------------------------------------------*/ -static -int baseLevel(const int * pcls, int cch) +static int base_level(const fz_bidi_chartype *pcls, int cch) { - int ich; - - for (ich = 0; ich < cch; ich++) - { - switch (pcls[ich]) - { - // strong left - case BDI_L: - return 0; - - // strong right - case BDI_R: - case BDI_AL: - return 1; - } - } - return 0; + int ich; + + for (ich = 0; ich < cch; ich++) + { + switch (pcls[ich]) + { + // strong left + case BDI_L: + return 0; + + // strong right + case BDI_R: + case BDI_AL: + return 1; + } + } + return 0; } +#endif //====== RESOLVE EXPLICIT ================================================ -static -int GreaterEven(int i) +static fz_bidi_level greater_even(fz_bidi_level i) { - return odd(i) ? i + 1 : i + 2; + return odd(i) ? i + 1 : i + 2; } -static -int GreaterOdd(int i) +static fz_bidi_level greater_odd(fz_bidi_level i) { - return odd(i) ? i + 2 : i + 1; + return odd(i) ? i + 2 : i + 1; } -static -int EmbeddingDirection(int level) +static fz_bidi_chartype embedding_direction(fz_bidi_chartype level) { - return odd(level) ? BDI_R : BDI_L; + return odd(level) ? BDI_R : BDI_L; } - /*------------------------------------------------------------------------ - Function: resolveExplicit - - Recursively resolves explicit embedding levels and overrides. - Implements rules X1-X9, of the Unicode Bidirectional Algorithm. + Function: resolveExplicit - Input: Base embedding level and direction - Character count + Recursively resolves explicit embedding levels and overrides. + Implements rules X1-X9, of the Unicode Bidirectional Algorithm. - Output: Array of embedding levels - Caller must allocate (one level per input character) + Input: Base embedding level and direction + Character count - In/Out: Array of direction classes + Output: Array of embedding levels + Caller must allocate (one level per input character) + In/Out: Array of direction classes - Note: The function uses two simple counters to keep track of - matching explicit codes and PDF. Use the default argument for - the outermost call. The nesting counter counts the recursion - depth and not the embedding level. + Note: The function uses two simple counters to keep track of + matching explicit codes and PDF. Use the default argument for + the outermost call. The nesting counter counts the recursion + depth and not the embedding level. ------------------------------------------------------------------------*/ -const int MAX_LEVEL = 61; // the real value - -int Bidi_resolveExplicit(int level, int dir, int * pcls, int * plevel, int cch, - int nNest) +int fz_bidi_resolve_explicit(fz_bidi_level level, fz_bidi_chartype dir, fz_bidi_chartype *pcls, fz_bidi_level *plevel, int cch, + fz_bidi_level n_nest) { - int ich; - - // always called with a valid nesting level - // nesting levels are != embedding levels - int nLastValid = nNest; - - // check input values - assert(nNest >= 0 && level >= 0 && level <= MAX_LEVEL); - - // process the text - for (ich = 0; ich < cch; ich++) - { - int cls = pcls[ich]; - switch (cls) - { - case BDI_LRO: - case BDI_LRE: - nNest++; - if (GreaterEven(level) <= MAX_LEVEL) - { - plevel[ich] = GreaterEven(level); - pcls[ich] = BDI_BN; - ich += Bidi_resolveExplicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L), - &pcls[ich+1], &plevel[ich+1], - cch - (ich+1), nNest); - nNest--; - continue; - } - cls = pcls[ich] = BDI_BN; - break; - - case BDI_RLO: - case BDI_RLE: - nNest++; - if (GreaterOdd(level) <= MAX_LEVEL) - { - plevel[ich] = GreaterOdd(level); - pcls[ich] = BDI_BN; - ich += Bidi_resolveExplicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R), - &pcls[ich+1], &plevel[ich+1], - cch - (ich+1), nNest); - nNest--; - continue; - } - cls = pcls[ich] = BDI_BN; - break; - - case BDI_PDF: - cls = pcls[ich] = BDI_BN; - if (nNest) - { - if (nLastValid < nNest) - { - nNest--; - } - else - { - cch = ich; // break the loop, but complete body - } - } - break; - } - - // Apply the override - if (dir != BDI_N) - { - cls = dir; - } - plevel[ich] = level; - if (pcls[ich] != BDI_BN) - pcls[ich] = cls; - } - - return ich; + int ich; + + // always called with a valid nesting level + // nesting levels are != embedding levels + int nLastValid = n_nest; + + // check input values + assert(n_nest >= 0 && level >= 0 && level <= BIDI_LEVEL_MAX); + + // process the text + for (ich = 0; ich < cch; ich++) + { + fz_bidi_chartype cls = pcls[ich]; + switch (cls) + { + case BDI_LRO: + case BDI_LRE: + n_nest++; + if (greater_even(level) <= BIDI_LEVEL_MAX) + { + plevel[ich] = greater_even(level); + pcls[ich] = BDI_BN; + ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_LRE ? BDI_N : BDI_L), + &pcls[ich+1], &plevel[ich+1], + cch - (ich+1), n_nest); + n_nest--; + continue; + } + cls = pcls[ich] = BDI_BN; + break; + + case BDI_RLO: + case BDI_RLE: + n_nest++; + if (greater_odd(level) <= BIDI_LEVEL_MAX) + { + plevel[ich] = greater_odd(level); + pcls[ich] = BDI_BN; + ich += fz_bidi_resolve_explicit(plevel[ich], (cls == BDI_RLE ? BDI_N : BDI_R), + &pcls[ich+1], &plevel[ich+1], + cch - (ich+1), n_nest); + n_nest--; + continue; + } + cls = pcls[ich] = BDI_BN; + break; + + case BDI_PDF: + cls = pcls[ich] = BDI_BN; + if (n_nest) + { + if (nLastValid < n_nest) + { + n_nest--; + } + else + { + cch = ich; // break the loop, but complete body + } + } + break; + } + + // Apply the override + if (dir != BDI_N) + { + cls = dir; + } + plevel[ich] = level; + if (pcls[ich] != BDI_BN) + pcls[ich] = cls; + } + + return ich; } // === RESOLVE WEAK TYPES ================================================ enum bidi_state // possible states { - xa, // arabic letter - xr, // right leter - xl, // left letter - - ao, // arabic lett. foll by ON - ro, // right lett. foll by ON - lo, // left lett. foll by ON - - rt, // ET following R - lt, // ET following L - - cn, // EN, AN following AL - ra, // arabic number foll R - re, // european number foll R - la, // arabic number foll L - le, // european number foll L - - ac, // CS following cn - rc, // CS following ra - rs, // CS,ES following re - lc, // CS following la - ls, // CS,ES following le - - ret, // ET following re - let // ET following le + xa, // arabic letter + xr, // right leter + xl, // left letter + + ao, // arabic lett. foll by ON + ro, // right lett. foll by ON + lo, // left lett. foll by ON + + rt, // ET following R + lt, // ET following L + + cn, // EN, AN following AL + ra, // arabic number foll R + re, // european number foll R + la, // arabic number foll L + le, // european number foll L + + ac, // CS following cn + rc, // CS following ra + rs, // CS,ES following re + lc, // CS following la + ls, // CS,ES following le + + ret, // ET following re + let // ET following le } ; -int stateWeak[][10] = +const unsigned char state_weak[][10] = { - // N, L, R, AN, EN, AL,NSM, CS, ES, ET, -/*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */ -/*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */ -/*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */ + // N, L, R, AN, EN, AL,NSM, CS, ES, ET, +/*xa*/ { ao, xl, xr, cn, cn, xa, xa, ao, ao, ao }, /* arabic letter */ +/*xr*/ { ro, xl, xr, ra, re, xa, xr, ro, ro, rt }, /* right letter */ +/*xl*/ { lo, xl, xr, la, le, xa, xl, lo, lo, lt }, /* left letter */ /*ao*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* arabic lett. foll by ON*/ /*ro*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* right lett. foll by ON */ /*lo*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* left lett. foll by ON */ -/*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */ -/*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */ +/*rt*/ { ro, xl, xr, ra, re, xa, rt, ro, ro, rt }, /* ET following R */ +/*lt*/ { lo, xl, xr, la, le, xa, lt, lo, lo, lt }, /* ET following L */ -/*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */ +/*cn*/ { ao, xl, xr, cn, cn, xa, cn, ac, ao, ao }, /* EN, AN following AL */ /*ra*/ { ro, xl, xr, ra, re, xa, ra, rc, ro, rt }, /* arabic number foll R */ /*re*/ { ro, xl, xr, ra, re, xa, re, rs, rs,ret }, /* european number foll R */ /*la*/ { lo, xl, xr, la, le, xa, la, lc, lo, lt }, /* arabic number foll L */ /*le*/ { lo, xl, xr, la, le, xa, le, ls, ls,let }, /* european number foll L */ -/*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */ -/*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */ -/*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */ -/*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */ -/*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */ - -/*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */ -/*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */ +/*ac*/ { ao, xl, xr, cn, cn, xa, ao, ao, ao, ao }, /* CS following cn */ +/*rc*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS following ra */ +/*rs*/ { ro, xl, xr, ra, re, xa, ro, ro, ro, rt }, /* CS,ES following re */ +/*lc*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS following la */ +/*ls*/ { lo, xl, xr, la, le, xa, lo, lo, lo, lt }, /* CS,ES following le */ +/*ret*/ { ro, xl, xr, ra, re, xa,ret, ro, ro,ret }, /* ET following re */ +/*let*/ { lo, xl, xr, la, le, xa,let, lo, lo,let } /* ET following le */ }; enum bidi_action // possible actions { - // primitives - IX = 0x100, // increment - XX = 0xF, // no-op - - // actions - xxx = (XX << 4) + XX, // no-op - xIx = IX + xxx, // increment run - xxN = (XX << 4) + BDI_ON, // set current to N - xxE = (XX << 4) + BDI_EN, // set current to EN - xxA = (XX << 4) + BDI_AN, // set current to AN - xxR = (XX << 4) + BDI_R, // set current to R - xxL = (XX << 4) + BDI_L, // set current to L - Nxx = (BDI_ON << 4) + 0xF, // set run to neutral - Axx = (BDI_AN << 4) + 0xF, // set run to AN - ExE = (BDI_EN << 4) + BDI_EN,// set run to EN, set current to EN - NIx = (BDI_ON << 4) + 0xF + IX,// set run to N, increment - NxN = (BDI_ON << 4) + BDI_ON,// set run to N, set current to N - NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R - NxE = (BDI_ON << 4) + BDI_EN,// set run to N, set current to EN - - AxA = (BDI_AN << 4) + BDI_AN,// set run to AN, set current to AN - NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L - LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L + // primitives + IX = 0x100, // increment + XX = 0xF, // no-op + + // actions + xxx = (XX << 4) + XX, // no-op + xIx = IX + xxx, // increment run + xxN = (XX << 4) + BDI_ON, // set current to N + xxE = (XX << 4) + BDI_EN, // set current to EN + xxA = (XX << 4) + BDI_AN, // set current to AN + xxR = (XX << 4) + BDI_R, // set current to R + xxL = (XX << 4) + BDI_L, // set current to L + Nxx = (BDI_ON << 4) + 0xF, // set run to neutral + Axx = (BDI_AN << 4) + 0xF, // set run to AN + ExE = (BDI_EN << 4) + BDI_EN, // set run to EN, set current to EN + NIx = (BDI_ON << 4) + 0xF + IX, // set run to N, increment + NxN = (BDI_ON << 4) + BDI_ON, // set run to N, set current to N + NxR = (BDI_ON << 4) + BDI_R, // set run to N, set current to R + NxE = (BDI_ON << 4) + BDI_EN, // set run to N, set current to EN + + AxA = (BDI_AN << 4) + BDI_AN, // set run to AN, set current to AN + NxL = (BDI_ON << 4) + BDI_L, // set run to N, set current to L + LxL = (BDI_L << 4) + BDI_L // set run to L, set current to L }; +typedef uint16_t fz_bidi_action; -int actionWeak[][10] = +const fz_bidi_action action_weak[][10] = { - // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET, -/*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */ -/*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right leter */ -/*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */ - -/*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */ -/*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */ -/*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */ - -/*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */ -/*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */ - -/*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */ -/*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */ -/*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */ -/*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */ -/*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */ - -/*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */ -/*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */ -/*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */ -/*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */ -/*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */ - -/*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */ -/*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */ + // N,.. L, R, AN, EN, AL, NSM, CS,..ES, ET, +/*xa*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxR, xxN, xxN, xxN }, /* arabic letter */ +/*xr*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxR, xxN, xxN, xIx }, /* right leter */ +/*xl*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xIx }, /* left letter */ + +/*ao*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxN, xxN, xxN, xxN }, /* arabic lett. foll by ON */ +/*ro*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxN, xxN, xxN, xIx }, /* right lett. foll by ON */ +/*lo*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxN, xxN, xxN, xIx }, /* left lett. foll by ON */ + +/*rt*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, xIx, NxN, NxN, xIx }, /* ET following R */ +/*lt*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, xIx, NxN, NxN, xIx }, /* ET following L */ + +/*cn*/ { xxx, xxx, xxx, xxx, xxA, xxR, xxA, xIx, xxN, xxN }, /* EN, AN following AL */ +/*ra*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll R */ +/*re*/ { xxx, xxx, xxx, xxx, xxE, xxR, xxE, xIx, xIx, xxE }, /* european number foll R */ +/*la*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxA, xIx, xxN, xIx }, /* arabic number foll L */ +/*le*/ { xxx, xxx, xxx, xxx, xxL, xxR, xxL, xIx, xIx, xxL }, /* european number foll L */ + +/*ac*/ { Nxx, Nxx, Nxx, Axx, AxA, NxR, NxN, NxN, NxN, NxN }, /* CS following cn */ +/*rc*/ { Nxx, Nxx, Nxx, Axx, NxE, NxR, NxN, NxN, NxN, NIx }, /* CS following ra */ +/*rs*/ { Nxx, Nxx, Nxx, Nxx, ExE, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following re */ +/*lc*/ { Nxx, Nxx, Nxx, Axx, NxL, NxR, NxN, NxN, NxN, NIx }, /* CS following la */ +/*ls*/ { Nxx, Nxx, Nxx, Nxx, LxL, NxR, NxN, NxN, NxN, NIx }, /* CS,ES following le */ + +/*ret*/{ xxx, xxx, xxx, xxx, xxE, xxR, xxE, xxN, xxN, xxE }, /* ET following re */ +/*let*/{ xxx, xxx, xxx, xxx, xxL, xxR, xxL, xxN, xxN, xxL } /* ET following le */ }; static -int GetDeferredType(int action) +fz_bidi_chartype get_deferred_type(fz_bidi_action action) { - return (action >> 4) & 0xF; + return (action >> 4) & 0xF; } static -int GetResolvedType(int action) +fz_bidi_chartype get_resolved_type(fz_bidi_action action) { - return action & 0xF; + return action & 0xF; } /* Note on action table: - States can be of two kinds: - - Immediate Resolution State, where each input token - is resolved as soon as it is seen. These states havve - only single action codes (xxN) or the no-op (xxx) - for static input tokens. - - Deferred Resolution State, where input tokens either - either extend the run (xIx) or resolve its Type (e.g. Nxx). - - Input classes are of three kinds - - Static Input Token, where the class of the token remains - unchanged on output (AN, L, N, R) - - Replaced Input Token, where the class of the token is - always replaced on output (AL, BDI_BN, NSM, CS, ES, ET) - - Conditional Input Token, where the class of the token is - changed on output in some but not all cases (EN) - - Where tokens are subject to change, a double action - (e.g. NxA, or NxN) is _required_ after deferred states, - resolving both the deferred state and changing the current token. - - These properties of the table are verified by assertions below. - This code is needed only during debugging and maintenance + States can be of two kinds: + - Immediate Resolution State, where each input token + is resolved as soon as it is seen. These states havve + only single action codes (xxN) or the no-op (xxx) + for static input tokens. + - Deferred Resolution State, where input tokens either + either extend the run (xIx) or resolve its Type (e.g. Nxx). + + Input classes are of three kinds + - Static Input Token, where the class of the token remains + unchanged on output (AN, L, N, R) + - Replaced Input Token, where the class of the token is + always replaced on output (AL, BDI_BN, NSM, CS, ES, ET) + - Conditional Input Token, where the class of the token is + changed on output in some but not all cases (EN) + + Where tokens are subject to change, a double action + (e.g. NxA, or NxN) is _required_ after deferred states, + resolving both the deferred state and changing the current token. + + These properties of the table are verified by assertions below. + This code is needed only during debugging and maintenance */ /*------------------------------------------------------------------------ - Function: resolveWeak + Function: resolveWeak - Resolves the directionality of numeric and other weak character types + Resolves the directionality of numeric and other weak character types - Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm. + Implements rules X10 and W1-W6 of the Unicode Bidirectional Algorithm. - Input: Array of embedding levels - Character count + Input: Array of embedding levels + Character count - In/Out: Array of directional classes + In/Out: Array of directional classes - Note: On input only these directional classes are expected - AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS, + Note: On input only these directional classes are expected + AL, HL, R, L, ON, BDI_BN, NSM, AN, EN, ES, ET, CS, ------------------------------------------------------------------------*/ -void Bidi_resolveWeak(fz_context *ctx, int baselevel, int *pcls, int *plevel, int cch) +void fz_bidi_resolve_weak(fz_context *ctx, fz_bidi_level baselevel, fz_bidi_chartype *pcls, fz_bidi_level *plevel, int cch) { - int state = odd(baselevel) ? xr : xl; - int cls; - int ich; - int action; - int clsRun; - int clsNew; - - int level = baselevel; - - int cchRun = 0; - - for (ich = 0; ich < cch; ich++) - { - if (pcls[ich] > BDI_BN) { - fz_warn(ctx, "error: pcls[%d] > BN (%d)\n", ich, pcls[ich]); - } - - // ignore boundary neutrals - if (pcls[ich] == BDI_BN) - { - // must flatten levels unless at a level change; - plevel[ich] = level; - - // lookahead for level changes - if (ich + 1 == cch && level != baselevel) - { - // have to fixup last BN before end of the loop, since - // its fix-upped value will be needed below the assert - pcls[ich] = EmbeddingDirection(level); - } - else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN) - { - // fixup LAST BN in front / after a level run to make - // it act like the SOR/EOR in rule X10 - int newlevel = plevel[ich+1]; - if (level > newlevel) { - newlevel = level; - } - plevel[ich] = newlevel; - - // must match assigned level - pcls[ich] = EmbeddingDirection(newlevel); - level = plevel[ich+1]; - } - else - { - // don't interrupt runs - if (cchRun) - { - cchRun++; - } - continue; - } - } - - assert(pcls[ich] <= BDI_BN); - cls = pcls[ich]; - - action = actionWeak[state][cls]; - - // resolve the directionality for deferred runs - clsRun = GetDeferredType(action); - if (clsRun != XX) - { - SetDeferredRun(pcls, cchRun, ich, clsRun); - cchRun = 0; - } - - // resolve the directionality class at the current location - clsNew = GetResolvedType(action); - if (clsNew != XX) - pcls[ich] = clsNew; - - // increment a deferred run - if (IX & action) - cchRun++; - - state = stateWeak[state][cls]; - } - - // resolve any deferred runs - // use the direction of the current level to emulate PDF - cls = EmbeddingDirection(level); - - // resolve the directionality for deferred runs - clsRun = GetDeferredType(actionWeak[state][cls]); - if (clsRun != XX) - SetDeferredRun(pcls, cchRun, ich, clsRun); + int state = odd(baselevel) ? xr : xl; + fz_bidi_chartype cls; + int ich; + fz_bidi_action action; + fz_bidi_chartype cls_run; + fz_bidi_chartype cls_new; + + fz_bidi_level level = baselevel; + + int cch_run = 0; + + for (ich = 0; ich < cch; ich++) + { + if (pcls[ich] > BDI_BN) { + fz_warn(ctx, "error: pcls[%d] > BN (%d)\n", ich, pcls[ich]); + } + + // ignore boundary neutrals + if (pcls[ich] == BDI_BN) + { + // must flatten levels unless at a level change; + plevel[ich] = level; + + // lookahead for level changes + if (ich + 1 == cch && level != baselevel) + { + // have to fixup last BN before end of the loop, since + // its fix-upped value will be needed below the assert + pcls[ich] = embedding_direction(level); + } + else if (ich + 1 < cch && level != plevel[ich+1] && pcls[ich+1] != BDI_BN) + { + // fixup LAST BN in front / after a level run to make + // it act like the SOR/EOR in rule X10 + int newlevel = plevel[ich+1]; + if (level > newlevel) { + newlevel = level; + } + plevel[ich] = newlevel; + + // must match assigned level + pcls[ich] = embedding_direction(newlevel); + level = plevel[ich+1]; + } + else + { + // don't interrupt runs + if (cch_run) + { + cch_run++; + } + continue; + } + } + + assert(pcls[ich] <= BDI_BN); + cls = pcls[ich]; + + action = action_weak[state][cls]; + + // resolve the directionality for deferred runs + cls_run = get_deferred_type(action); + if (cls_run != XX) + { + set_deferred_run(pcls, cch_run, ich, cls_run); + cch_run = 0; + } + + // resolve the directionality class at the current location + cls_new = get_resolved_type(action); + if (cls_new != XX) + pcls[ich] = cls_new; + + // increment a deferred run + if (IX & action) + cch_run++; + + state = state_weak[state][cls]; + } + + // resolve any deferred runs + // use the direction of the current level to emulate PDF + cls = embedding_direction(level); + + // resolve the directionality for deferred runs + cls_run = get_deferred_type(action_weak[state][cls]); + if (cls_run != XX) + set_deferred_run(pcls, cch_run, ich, cls_run); } // === RESOLVE NEUTAL TYPES ============================================== @@ -821,365 +801,363 @@ void Bidi_resolveWeak(fz_context *ctx, int baselevel, int *pcls, int *plevel, in // action values enum neutral_action { - // action to resolve previous input - nL = BDI_L, // resolve EN to L - En = 3 << 4, // resolve neutrals run to embedding level direction - Rn = BDI_R << 4, // resolve neutrals run to strong right - Ln = BDI_L << 4, // resolved neutrals run to strong left - In = (1<<8), // increment count of deferred neutrals - LnL = (1<<4)+BDI_L // set run and EN to L + // action to resolve previous input + nL = BDI_L, // resolve EN to L + En = 3 << 4, // resolve neutrals run to embedding level direction + Rn = BDI_R << 4, // resolve neutrals run to strong right + Ln = BDI_L << 4, // resolved neutrals run to strong left + In = (1<<8), // increment count of deferred neutrals + LnL = (1<<4)+BDI_L // set run and EN to L }; -static -int GetDeferredNeutrals(int action, int level) +static fz_bidi_chartype +get_deferred_neutrals(fz_bidi_action action, fz_bidi_level level) { - action = (action >> 4) & 0xF; - if (action == (En >> 4)) - return EmbeddingDirection(level); - else - return action; + action = (action >> 4) & 0xF; + if (action == (En >> 4)) + return embedding_direction(level); + else + return action; } -static -int GetResolvedNeutrals(int action) +static fz_bidi_chartype get_resolved_neutrals(fz_bidi_action action) { - action = action & 0xF; - if (action == In) - return 0; - else - return action; + action = action & 0xF; + if (action == In) + return 0; + else + return action; } // state values enum neutral_state { - // new temporary class - r, // R and characters resolved to R - l, // L and characters resolved to L - rn, // N preceded by right - ln, // N preceded by left - a, // AN preceded by left (the abbrev 'la' is used up above) - na // N preceeded by a + // new temporary class + r, // R and characters resolved to R + l, // L and characters resolved to L + rn, // N preceded by right + ln, // N preceded by left + a, // AN preceded by left (the abbrev 'la' is used up above) + na // N preceeded by a } ; - /*------------------------------------------------------------------------ - Notes: + Notes: - By rule W7, whenever a EN is 'dominated' by an L (including start of - run with embedding direction = L) it is resolved to, and further treated - as L. + By rule W7, whenever a EN is 'dominated' by an L (including start of + run with embedding direction = L) it is resolved to, and further treated + as L. - This leads to the need for 'a' and 'na' states. + This leads to the need for 'a' and 'na' states. ------------------------------------------------------------------------*/ -int actionNeutrals[][5] = +const int action_neutrals[][5] = { -// N, L, R, AN, EN, = cls - // state = - {In, 0, 0, 0, 0}, // r right - {In, 0, 0, 0, BDI_L},// l left +// N, L, R, AN, EN, = cls + // state = + {In, 0, 0, 0, 0}, // r right + {In, 0, 0, 0, BDI_L}, // l left - {In, En, Rn, Rn, Rn}, // rn N preceded by right - {In, Ln, En, En, LnL}, // ln N preceded by left + {In, En, Rn, Rn, Rn}, // rn N preceded by right + {In, Ln, En, En, LnL}, // ln N preceded by left - {In, 0, 0, 0, BDI_L},// a AN preceded by left - {In, En, Rn, Rn, En} // na N preceded by a + {In, 0, 0, 0, BDI_L}, // a AN preceded by left + {In, En, Rn, Rn, En} // na N preceded by a } ; -int stateNeutrals[][5] = +const int state_neutrals[][5] = { -// N, L, R, AN, EN = cls - // state = - {rn, l, r, r, r}, // r right - {ln, l, r, a, l}, // l left +// N, L, R, AN, EN = cls + // state = + {rn, l, r, r, r}, // r right + {ln, l, r, a, l}, // l left - {rn, l, r, r, r}, // rn N preceded by right - {ln, l, r, a, l}, // ln N preceded by left + {rn, l, r, r, r}, // rn N preceded by right + {ln, l, r, a, l}, // ln N preceded by left - {na, l, r, a, l}, // a AN preceded by left - {na, l, r, a, l} // na N preceded by la + {na, l, r, a, l}, // a AN preceded by left + {na, l, r, a, l} // na N preceded by la } ; /*------------------------------------------------------------------------ - Function: resolveNeutrals + Function: resolveNeutrals - Resolves the directionality of neutral character types. + Resolves the directionality of neutral character types. - Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm. + Implements rules W7, N1 and N2 of the Unicode Bidi Algorithm. - Input: Array of embedding levels - Character count - Baselevel + Input: Array of embedding levels + Character count + Baselevel - In/Out: Array of directional classes + In/Out: Array of directional classes - Note: On input only these directional classes are expected - R, L, N, AN, EN and BN + Note: On input only these directional classes are expected + R, L, N, AN, EN and BN - W8 resolves a number of ENs to L + W8 resolves a number of ENs to L ------------------------------------------------------------------------*/ -void Bidi_resolveNeutrals(int baselevel, int *pcls, const int *plevel, int cch) +void fz_bidi_resolve_neutrals(fz_bidi_level baselevel, fz_bidi_chartype *pcls, const fz_bidi_level *plevel, int cch) { - // the state at the start of text depends on the base level - int state = odd(baselevel) ? r : l; - int cls; - int ich; - int clsRun; - - int cchRun = 0; - int level = baselevel; - - for (ich = 0; ich < cch; ich++) - { - int action; - int clsNew; - - // ignore boundary neutrals - if (pcls[ich] == BDI_BN) - { - // include in the count for a deferred run - if (cchRun) - cchRun++; - - // skip any further processing - continue; - } - - assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed" - cls = pcls[ich]; - - action = actionNeutrals[state][cls]; - - // resolve the directionality for deferred runs - clsRun = GetDeferredNeutrals(action, level); - if (clsRun != BDI_N) - { - SetDeferredRun(pcls, cchRun, ich, clsRun); - cchRun = 0; - } - - // resolve the directionality class at the current location - clsNew = GetResolvedNeutrals(action); - if (clsNew != BDI_N) - pcls[ich] = clsNew; - - if (In & action) - cchRun++; - - state = stateNeutrals[state][cls]; - level = plevel[ich]; - } - - // resolve any deferred runs - cls = EmbeddingDirection(level); // eor has type of current level - - // resolve the directionality for deferred runs - clsRun = GetDeferredNeutrals(actionNeutrals[state][cls], level); - if (clsRun != BDI_N) - SetDeferredRun(pcls, cchRun, ich, clsRun); + // the state at the start of text depends on the base level + int state = odd(baselevel) ? r : l; + fz_bidi_chartype cls; + int ich; + fz_bidi_chartype cls_run; + + int cch_run = 0; + fz_bidi_level level = baselevel; + + for (ich = 0; ich < cch; ich++) + { + int action; + fz_bidi_chartype cls_new; + + // ignore boundary neutrals + if (pcls[ich] == BDI_BN) + { + // include in the count for a deferred run + if (cch_run) + cch_run++; + + // skip any further processing + continue; + } + + assert(pcls[ich] < 5); // "Only N, L, R, AN, EN are allowed" + cls = pcls[ich]; + + action = action_neutrals[state][cls]; + + // resolve the directionality for deferred runs + cls_run = get_deferred_neutrals(action, level); + if (cls_run != BDI_N) + { + set_deferred_run(pcls, cch_run, ich, cls_run); + cch_run = 0; + } + + // resolve the directionality class at the current location + cls_new = get_resolved_neutrals(action); + if (cls_new != BDI_N) + pcls[ich] = cls_new; + + if (In & action) + cch_run++; + + state = state_neutrals[state][cls]; + level = plevel[ich]; + } + + // resolve any deferred runs + cls = embedding_direction(level); // eor has type of current level + + // resolve the directionality for deferred runs + cls_run = get_deferred_neutrals(action_neutrals[state][cls], level); + if (cls_run != BDI_N) + set_deferred_run(pcls, cch_run, ich, cls_run); } // === RESOLVE IMPLLICIT ================================================= /*------------------------------------------------------------------------ - Function: resolveImplicit + Function: resolveImplicit - Recursively resolves implicit embedding levels. - Implements rules I1 and I2 of the Unicode Bidirectional Algorithm. + Recursively resolves implicit embedding levels. + Implements rules I1 and I2 of the Unicode Bidirectional Algorithm. - Input: Array of direction classes - Character count - Base level + Input: Array of direction classes + Character count + Base level - In/Out: Array of embedding levels + In/Out: Array of embedding levels - Note: levels may exceed 15 on output. - Accepted subset of direction classes - R, L, AN, EN + Note: levels may exceed 15 on output. + Accepted subset of direction classes + R, L, AN, EN ------------------------------------------------------------------------*/ -int addLevel[][4] = +const fz_bidi_level add_level[][4] = { - // L, R, AN, EN = cls - // level = -/* even */ { 0, 1, 2, 2 }, // EVEN -/* odd */ { 1, 0, 1, 1 } // ODD + // L, R, AN, EN = cls + // level = +/* even */ { 0, 1, 2, 2 }, // EVEN +/* odd */ { 1, 0, 1, 1 } // ODD }; -void Bidi_resolveImplicit(const int * pcls, int * plevel, int cch) +void fz_bidi_resolve_implicit(const fz_bidi_chartype *pcls, fz_bidi_level *plevel, int cch) { - int ich; - - for (ich = 0; ich < cch; ich++) - { - // cannot resolve bn here, since some bn were resolved to strong - // types in resolveWeak. To remove these we need the original - // types, which are available again in resolveWhiteSpace - if (pcls[ich] == BDI_BN) - { - continue; - } - assert(pcls[ich] > 0); // "No Neutrals allowed to survive here." - assert(pcls[ich] < 5); // "Out of range." - plevel[ich] += addLevel[odd(plevel[ich])][pcls[ich] - 1]; - } + int ich; + + for (ich = 0; ich < cch; ich++) + { + // cannot resolve bn here, since some bn were resolved to strong + // types in resolveWeak. To remove these we need the original + // types, which are available again in resolveWhiteSpace + if (pcls[ich] == BDI_BN) + { + continue; + } + assert(pcls[ich] > 0); // "No Neutrals allowed to survive here." + assert(pcls[ich] < 5); // "Out of range." + plevel[ich] += add_level[odd(plevel[ich])][pcls[ich] - 1]; + } } +#if 0 // === REORDER =========================================================== /*------------------------------------------------------------------------ - Function: resolveLines + Function: resolve_lines - Breaks a paragraph into lines + Breaks a paragraph into lines - Input: Character count - Array of line break flags - In/Out: Array of characters + Input: Character count + Array of line break flags + In/Out: Array of characters - Returns the count of characters on the first line + Returns the count of characters on the first line - Note: This function only breaks lines at hard line breaks. Other - line breaks can be passed in. If pbrk[n] is true, then a break - occurs after the character in pszInput[n]. Breaks before the first - character are not allowed. + Note: This function only breaks lines at hard line breaks. Other + line breaks can be passed in. If pbrk[n] is true, then a break + occurs after the character in psz_input[n]. Breaks before the first + character are not allowed. ------------------------------------------------------------------------*/ -static -int resolveLines(uint16_t * pszInput, int * pbrk, int cch) +static int resolve_lines(uint32_t *psz_input, int *pbrk, int cch) { - int ich; - - // skip characters not of type LS - for(ich = 0; ich < cch; ich++) - { - if (pszInput[ich] == chLS || (pbrk && pbrk[ich])) - { - ich++; - break; - } - } - - return ich; + int ich; + + // skip characters not of type LS + for(ich = 0; ich < cch; ich++) + { + if (psz_input[ich] == chLS || (pbrk && pbrk[ich])) + { + ich++; + break; + } + } + + return ich; } +#endif /*------------------------------------------------------------------------ - Function: resolveWhiteSpace + Function: fz_bidi_resolve_whitespace - Resolves levels for WS and S - Implements rule L1 of the Unicode bidi Algorithm. + Resolves levels for WS and S + Implements rule L1 of the Unicode bidi Algorithm. - Input: Base embedding level - Character count - Array of direction classes (for one line of text) + Input: Base embedding level + Character count + Array of direction classes (for one line of text) - In/Out: Array of embedding levels (for one line of text) + In/Out: Array of embedding levels (for one line of text) - Note: this should be applied a line at a time. The default driver - code supplied in this file assumes a single line of text; for - a real implementation, cch and the initial pointer values - would have to be adjusted. + Note: this should be applied a line at a time. The default driver + code supplied in this file assumes a single line of text; for + a real implementation, cch and the initial pointer values + would have to be adjusted. ------------------------------------------------------------------------*/ -void Bidi_resolveWhitespace(int baselevel, const int *pcls, int *plevel, - int cch) +void fz_bidi_resolve_whitespace(fz_bidi_level baselevel, const fz_bidi_chartype *pcls, fz_bidi_level *plevel, + int cch) { - int cchrun = 0; - int oldlevel = baselevel; - int ich; - - for (ich = 0; ich < cch; ich++) - { - switch(pcls[ich]) - { - default: - cchrun = 0; // any other character breaks the run - break; - case BDI_WS: - cchrun++; - break; - - case BDI_RLE: - case BDI_LRE: - case BDI_LRO: - case BDI_RLO: - case BDI_PDF: - case BDI_BN: - plevel[ich] = oldlevel; - cchrun++; - break; - - case BDI_S: - case BDI_B: - // reset levels for WS before eot - SetDeferredRun(plevel, cchrun, ich, baselevel); - cchrun = 0; - plevel[ich] = baselevel; - break; - } - oldlevel = plevel[ich]; - } - // reset level before eot - SetDeferredRun(plevel, cchrun, ich, baselevel); + int cchrun = 0; + fz_bidi_level oldlevel = baselevel; + int ich; + + for (ich = 0; ich < cch; ich++) + { + switch(pcls[ich]) + { + default: + cchrun = 0; // any other character breaks the run + break; + case BDI_WS: + cchrun++; + break; + + case BDI_RLE: + case BDI_LRE: + case BDI_LRO: + case BDI_RLO: + case BDI_PDF: + case BDI_BN: + plevel[ich] = oldlevel; + cchrun++; + break; + + case BDI_S: + case BDI_B: + // reset levels for WS before eot + set_deferred_level_run(plevel, cchrun, ich, baselevel); + cchrun = 0; + plevel[ich] = baselevel; + break; + } + oldlevel = plevel[ich]; + } + // reset level before eot + set_deferred_level_run(plevel, cchrun, ich, baselevel); } #ifdef BIDI_LINE_AT_A_TIME /*------------------------------------------------------------------------ - Functions: reorder/reorderLevel + Functions: reorder/reorderLevel - Recursively reorders the display string - "From the highest level down, reverse all characters at that level and - higher, down to the lowest odd level" + Recursively reorders the display string + "From the highest level down, reverse all characters at that level and + higher, down to the lowest odd level" - Implements rule L2 of the Unicode bidi Algorithm. + Implements rule L2 of the Unicode bidi Algorithm. - Input: Array of embedding levels - Character count - Flag enabling reversal (set to false by initial caller) + Input: Array of embedding levels + Character count + Flag enabling reversal (set to false by initial caller) - In/Out: Text to reorder + In/Out: Text to reorder - Note: levels may exceed 15 resp. 61 on input. + Note: levels may exceed 15 resp. 61 on input. - Rule L3 - reorder combining marks is not implemented here - Rule L4 - glyph mirroring is implemented as a display option below + Rule L3 - reorder combining marks is not implemented here + Rule L4 - glyph mirroring is implemented as a display option below - Note: this should be applied a line at a time + Note: this should be applied a line at a time -------------------------------------------------------------------------*/ -static int reorderLevel(int level, uint16_t * pszText, const int * plevel, int cch, - int fReverse) +static int reorderLevel(fz_bidi_level level, uint32_t *psz_text, const fz_bidi_level *plevel, int cch, + int f_reverse) { - int ich; - - // true as soon as first odd level encountered - fReverse = fReverse || odd(level); - - for (ich = 0; ich < cch; ich++) - { - if (plevel[ich] < level) - { - break; - } - else if (plevel[ich] > level) - { - ich += reorderLevel(level + 1, pszText + ich, plevel + ich, - cch - ich, fReverse) - 1; - } - } - if (fReverse) - { - reverse(pszText, ich); - } - return ich; + int ich; + + // true as soon as first odd level encountered + f_reverse = f_reverse || odd(level); + + for (ich = 0; ich < cch; ich++) + { + if (plevel[ich] < level) + { + break; + } + else if (plevel[ich] > level) + { + ich += reorderLevel(level + 1, psz_text + ich, plevel + ich, + cch - ich, f_reverse) - 1; + } + } + if (f_reverse) + { + reverse(psz_text, ich); + } + return ich; } - -int Bidi_reorder(int baselevel, uint16_t * pszText, const int * plevel, int cch) +int Bidi_reorder(fz_bidi_level baselevel, uint32_t *psz_text, const fz_bidi_level *plevel, int cch) { - int ich = 0; - - while (ich < cch) - { - ich += reorderLevel(baselevel, pszText + ich, plevel + ich, - cch - ich, FALSE); - } - return ich; + int ich = 0; + + while (ich < cch) + { + ich += reorderLevel(baselevel, psz_text + ich, plevel + ich, + cch - ich, FALSE); + } + return ich; } #endif -- cgit v1.2.3