From 672b9afb9bb05a67bbcd8664ba268521ea728ca6 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Tue, 13 Nov 2018 21:49:56 +0100 Subject: Add more encoding tables. Add tables for Windows-1250, Windows-1251, and ISO-8859-1. Also add unicode_from_encoding tables. Move encodings from PDF namespace to Fitz. --- include/mupdf/fitz/font.h | 23 + include/mupdf/pdf/font.h | 7 - platform/win32/libmupdf.vcproj | 4 + scripts/8859-1.TXT | 292 ++++ scripts/CP1250.TXT | 274 +++ scripts/CP1251.TXT | 274 +++ scripts/makeencoding.py | 47 +- source/fitz/encodings.c | 33 + source/fitz/encodings.h | 3784 ++++++++++++++++++++++++++++++++++++++++ source/pdf/pdf-appearance.c | 6 +- source/pdf/pdf-encoding.c | 59 +- source/pdf/pdf-encodings.h | 203 --- source/pdf/pdf-font.c | 12 +- 13 files changed, 4735 insertions(+), 283 deletions(-) create mode 100644 scripts/8859-1.TXT create mode 100644 scripts/CP1250.TXT create mode 100644 scripts/CP1251.TXT create mode 100644 source/fitz/encodings.c create mode 100644 source/fitz/encodings.h diff --git a/include/mupdf/fitz/font.h b/include/mupdf/fitz/font.h index 4565983c..421f477a 100644 --- a/include/mupdf/fitz/font.h +++ b/include/mupdf/fitz/font.h @@ -9,6 +9,29 @@ /* forward declaration for circular dependency */ struct fz_device_s; +/* Various font encoding tables and lookup functions */ + +extern const char *fz_glyph_name_from_iso8859_1[256]; +extern const char *fz_glyph_name_from_iso8859_7[256]; +extern const char *fz_glyph_name_from_koi8u[256]; +extern const char *fz_glyph_name_from_windows_1250[256]; +extern const char *fz_glyph_name_from_windows_1251[256]; +extern const char *fz_glyph_name_from_windows_1252[256]; + +extern unsigned short fz_unicode_from_iso8859_1[256]; +extern unsigned short fz_unicode_from_iso8859_7[256]; +extern unsigned short fz_unicode_from_koi8u[256]; +extern unsigned short fz_unicode_from_windows_1250[256]; +extern unsigned short fz_unicode_from_windows_1251[256]; +extern unsigned short 
fz_unicode_from_windows_1252[256]; + +int fz_iso8859_1_from_unicode(int u); +int fz_iso8859_7_from_unicode(int u); +int fz_koi8u_from_unicode(int u); +int fz_windows_1250_from_unicode(int u); +int fz_windows_1251_from_unicode(int u); +int fz_windows_1252_from_unicode(int u); + /* An abstract font handle. */ diff --git a/include/mupdf/pdf/font.h b/include/mupdf/pdf/font.h index 812377e5..d386e87c 100644 --- a/include/mupdf/pdf/font.h +++ b/include/mupdf/pdf/font.h @@ -30,13 +30,6 @@ extern const char *pdf_mac_expert[256]; extern const char *pdf_win_ansi[256]; extern const char *pdf_standard[256]; -extern const char *pdf_glyph_name_from_koi8u[256]; -extern const char *pdf_glyph_name_from_iso8859_7[256]; - -int pdf_cyrillic_from_unicode(int u); -int pdf_greek_from_unicode(int u); -int pdf_winansi_from_unicode(int u); - typedef struct pdf_font_desc_s pdf_font_desc; typedef struct pdf_hmtx_s pdf_hmtx; typedef struct pdf_vmtx_s pdf_vmtx; diff --git a/platform/win32/libmupdf.vcproj b/platform/win32/libmupdf.vcproj index 18823802..ebef2db0 100644 --- a/platform/win32/libmupdf.vcproj +++ b/platform/win32/libmupdf.vcproj @@ -1899,6 +1899,10 @@ RelativePath="..\..\source\fitz\encode-basic.c" > + + diff --git a/scripts/8859-1.TXT b/scripts/8859-1.TXT new file mode 100644 index 00000000..3a55afef --- /dev/null +++ b/scripts/8859-1.TXT @@ -0,0 +1,292 @@ +# 8859-1.TXT +# Date: 2015-12-02 20:19:00 GMT [KW] +# © 2015 Unicode®, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Name: ISO/IEC 8859-1:1998 to Unicode +# Unicode version: 3.0 +# Table version: 2.0 +# Table format: Format A +# Date: 1999 July 27 (header updated: 2015 December 02) +# Authors: Ken Whistler +# +# General notes: +# +# This table contains the data the Unicode Consortium has on how +# ISO/IEC 8859-1:1998 characters map into Unicode. 
+# +# Format: Three tab-separated columns +# Column #1 is the ISO/IEC 8859-1 code (in hex as 0xXX) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 the Unicode name (follows a comment sign, '#') +# +# The entries are in ISO/IEC 8859-1 order. +# +# Version history +# 1.0 version: updates 0.1 version by adding mappings for all +# control characters. +# 2.0 version: updates to copyright notice and terms of use; no +# changes to character mappings +# +# Updated versions of this file may be found in: +# http://www.unicode.org/Public/MAPPINGS/ +# +# Any comments or problems, contact us at: +# http://www.unicode.org/reporting.html +# +0x00 0x0000 # NULL +0x01 0x0001 # START OF HEADING +0x02 0x0002 # START OF TEXT +0x03 0x0003 # END OF TEXT +0x04 0x0004 # END OF TRANSMISSION +0x05 0x0005 # ENQUIRY +0x06 0x0006 # ACKNOWLEDGE +0x07 0x0007 # BELL +0x08 0x0008 # BACKSPACE +0x09 0x0009 # HORIZONTAL TABULATION +0x0A 0x000A # LINE FEED +0x0B 0x000B # VERTICAL TABULATION +0x0C 0x000C # FORM FEED +0x0D 0x000D # CARRIAGE RETURN +0x0E 0x000E # SHIFT OUT +0x0F 0x000F # SHIFT IN +0x10 0x0010 # DATA LINK ESCAPE +0x11 0x0011 # DEVICE CONTROL ONE +0x12 0x0012 # DEVICE CONTROL TWO +0x13 0x0013 # DEVICE CONTROL THREE +0x14 0x0014 # DEVICE CONTROL FOUR +0x15 0x0015 # NEGATIVE ACKNOWLEDGE +0x16 0x0016 # SYNCHRONOUS IDLE +0x17 0x0017 # END OF TRANSMISSION BLOCK +0x18 0x0018 # CANCEL +0x19 0x0019 # END OF MEDIUM +0x1A 0x001A # SUBSTITUTE +0x1B 0x001B # ESCAPE +0x1C 0x001C # FILE SEPARATOR +0x1D 0x001D # GROUP SEPARATOR +0x1E 0x001E # RECORD SEPARATOR +0x1F 0x001F # UNIT SEPARATOR +0x20 0x0020 # SPACE +0x21 0x0021 # EXCLAMATION MARK +0x22 0x0022 # QUOTATION MARK +0x23 0x0023 # NUMBER SIGN +0x24 0x0024 # DOLLAR SIGN +0x25 0x0025 # PERCENT SIGN +0x26 0x0026 # AMPERSAND +0x27 0x0027 # APOSTROPHE +0x28 0x0028 # LEFT PARENTHESIS +0x29 0x0029 # RIGHT PARENTHESIS +0x2A 0x002A # ASTERISK +0x2B 0x002B # PLUS SIGN +0x2C 0x002C # COMMA +0x2D 0x002D # HYPHEN-MINUS +0x2E 0x002E # FULL STOP +0x2F 
0x002F # SOLIDUS +0x30 0x0030 # DIGIT ZERO +0x31 0x0031 # DIGIT ONE +0x32 0x0032 # DIGIT TWO +0x33 0x0033 # DIGIT THREE +0x34 0x0034 # DIGIT FOUR +0x35 0x0035 # DIGIT FIVE +0x36 0x0036 # DIGIT SIX +0x37 0x0037 # DIGIT SEVEN +0x38 0x0038 # DIGIT EIGHT +0x39 0x0039 # DIGIT NINE +0x3A 0x003A # COLON +0x3B 0x003B # SEMICOLON +0x3C 0x003C # LESS-THAN SIGN +0x3D 0x003D # EQUALS SIGN +0x3E 0x003E # GREATER-THAN SIGN +0x3F 0x003F # QUESTION MARK +0x40 0x0040 # COMMERCIAL AT +0x41 0x0041 # LATIN CAPITAL LETTER A +0x42 0x0042 # LATIN CAPITAL LETTER B +0x43 0x0043 # LATIN CAPITAL LETTER C +0x44 0x0044 # LATIN CAPITAL LETTER D +0x45 0x0045 # LATIN CAPITAL LETTER E +0x46 0x0046 # LATIN CAPITAL LETTER F +0x47 0x0047 # LATIN CAPITAL LETTER G +0x48 0x0048 # LATIN CAPITAL LETTER H +0x49 0x0049 # LATIN CAPITAL LETTER I +0x4A 0x004A # LATIN CAPITAL LETTER J +0x4B 0x004B # LATIN CAPITAL LETTER K +0x4C 0x004C # LATIN CAPITAL LETTER L +0x4D 0x004D # LATIN CAPITAL LETTER M +0x4E 0x004E # LATIN CAPITAL LETTER N +0x4F 0x004F # LATIN CAPITAL LETTER O +0x50 0x0050 # LATIN CAPITAL LETTER P +0x51 0x0051 # LATIN CAPITAL LETTER Q +0x52 0x0052 # LATIN CAPITAL LETTER R +0x53 0x0053 # LATIN CAPITAL LETTER S +0x54 0x0054 # LATIN CAPITAL LETTER T +0x55 0x0055 # LATIN CAPITAL LETTER U +0x56 0x0056 # LATIN CAPITAL LETTER V +0x57 0x0057 # LATIN CAPITAL LETTER W +0x58 0x0058 # LATIN CAPITAL LETTER X +0x59 0x0059 # LATIN CAPITAL LETTER Y +0x5A 0x005A # LATIN CAPITAL LETTER Z +0x5B 0x005B # LEFT SQUARE BRACKET +0x5C 0x005C # REVERSE SOLIDUS +0x5D 0x005D # RIGHT SQUARE BRACKET +0x5E 0x005E # CIRCUMFLEX ACCENT +0x5F 0x005F # LOW LINE +0x60 0x0060 # GRAVE ACCENT +0x61 0x0061 # LATIN SMALL LETTER A +0x62 0x0062 # LATIN SMALL LETTER B +0x63 0x0063 # LATIN SMALL LETTER C +0x64 0x0064 # LATIN SMALL LETTER D +0x65 0x0065 # LATIN SMALL LETTER E +0x66 0x0066 # LATIN SMALL LETTER F +0x67 0x0067 # LATIN SMALL LETTER G +0x68 0x0068 # LATIN SMALL LETTER H +0x69 0x0069 # LATIN SMALL LETTER I +0x6A 0x006A # LATIN SMALL 
LETTER J +0x6B 0x006B # LATIN SMALL LETTER K +0x6C 0x006C # LATIN SMALL LETTER L +0x6D 0x006D # LATIN SMALL LETTER M +0x6E 0x006E # LATIN SMALL LETTER N +0x6F 0x006F # LATIN SMALL LETTER O +0x70 0x0070 # LATIN SMALL LETTER P +0x71 0x0071 # LATIN SMALL LETTER Q +0x72 0x0072 # LATIN SMALL LETTER R +0x73 0x0073 # LATIN SMALL LETTER S +0x74 0x0074 # LATIN SMALL LETTER T +0x75 0x0075 # LATIN SMALL LETTER U +0x76 0x0076 # LATIN SMALL LETTER V +0x77 0x0077 # LATIN SMALL LETTER W +0x78 0x0078 # LATIN SMALL LETTER X +0x79 0x0079 # LATIN SMALL LETTER Y +0x7A 0x007A # LATIN SMALL LETTER Z +0x7B 0x007B # LEFT CURLY BRACKET +0x7C 0x007C # VERTICAL LINE +0x7D 0x007D # RIGHT CURLY BRACKET +0x7E 0x007E # TILDE +0x7F 0x007F # DELETE +0x80 0x0080 # +0x81 0x0081 # +0x82 0x0082 # +0x83 0x0083 # +0x84 0x0084 # +0x85 0x0085 # +0x86 0x0086 # +0x87 0x0087 # +0x88 0x0088 # +0x89 0x0089 # +0x8A 0x008A # +0x8B 0x008B # +0x8C 0x008C # +0x8D 0x008D # +0x8E 0x008E # +0x8F 0x008F # +0x90 0x0090 # +0x91 0x0091 # +0x92 0x0092 # +0x93 0x0093 # +0x94 0x0094 # +0x95 0x0095 # +0x96 0x0096 # +0x97 0x0097 # +0x98 0x0098 # +0x99 0x0099 # +0x9A 0x009A # +0x9B 0x009B # +0x9C 0x009C # +0x9D 0x009D # +0x9E 0x009E # +0x9F 0x009F # +0xA0 0x00A0 # NO-BREAK SPACE +0xA1 0x00A1 # INVERTED EXCLAMATION MARK +0xA2 0x00A2 # CENT SIGN +0xA3 0x00A3 # POUND SIGN +0xA4 0x00A4 # CURRENCY SIGN +0xA5 0x00A5 # YEN SIGN +0xA6 0x00A6 # BROKEN BAR +0xA7 0x00A7 # SECTION SIGN +0xA8 0x00A8 # DIAERESIS +0xA9 0x00A9 # COPYRIGHT SIGN +0xAA 0x00AA # FEMININE ORDINAL INDICATOR +0xAB 0x00AB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC # NOT SIGN +0xAD 0x00AD # SOFT HYPHEN +0xAE 0x00AE # REGISTERED SIGN +0xAF 0x00AF # MACRON +0xB0 0x00B0 # DEGREE SIGN +0xB1 0x00B1 # PLUS-MINUS SIGN +0xB2 0x00B2 # SUPERSCRIPT TWO +0xB3 0x00B3 # SUPERSCRIPT THREE +0xB4 0x00B4 # ACUTE ACCENT +0xB5 0x00B5 # MICRO SIGN +0xB6 0x00B6 # PILCROW SIGN +0xB7 0x00B7 # MIDDLE DOT +0xB8 0x00B8 # CEDILLA +0xB9 0x00B9 # SUPERSCRIPT ONE +0xBA 0x00BA # 
MASCULINE ORDINAL INDICATOR +0xBB 0x00BB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x00BC # VULGAR FRACTION ONE QUARTER +0xBD 0x00BD # VULGAR FRACTION ONE HALF +0xBE 0x00BE # VULGAR FRACTION THREE QUARTERS +0xBF 0x00BF # INVERTED QUESTION MARK +0xC0 0x00C0 # LATIN CAPITAL LETTER A WITH GRAVE +0xC1 0x00C1 # LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x00C3 # LATIN CAPITAL LETTER A WITH TILDE +0xC4 0x00C4 # LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x00C5 # LATIN CAPITAL LETTER A WITH RING ABOVE +0xC6 0x00C6 # LATIN CAPITAL LETTER AE +0xC7 0x00C7 # LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x00C8 # LATIN CAPITAL LETTER E WITH GRAVE +0xC9 0x00C9 # LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x00CA # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +0xCB 0x00CB # LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x00CC # LATIN CAPITAL LETTER I WITH GRAVE +0xCD 0x00CD # LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x00CF # LATIN CAPITAL LETTER I WITH DIAERESIS +0xD0 0x00D0 # LATIN CAPITAL LETTER ETH (Icelandic) +0xD1 0x00D1 # LATIN CAPITAL LETTER N WITH TILDE +0xD2 0x00D2 # LATIN CAPITAL LETTER O WITH GRAVE +0xD3 0x00D3 # LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x00D5 # LATIN CAPITAL LETTER O WITH TILDE +0xD6 0x00D6 # LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 # MULTIPLICATION SIGN +0xD8 0x00D8 # LATIN CAPITAL LETTER O WITH STROKE +0xD9 0x00D9 # LATIN CAPITAL LETTER U WITH GRAVE +0xDA 0x00DA # LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x00DB # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +0xDC 0x00DC # LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD # LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x00DE # LATIN CAPITAL LETTER THORN (Icelandic) +0xDF 0x00DF # LATIN SMALL LETTER SHARP S (German) +0xE0 0x00E0 # LATIN SMALL LETTER A WITH GRAVE +0xE1 0x00E1 # LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 # LATIN SMALL LETTER A WITH 
CIRCUMFLEX +0xE3 0x00E3 # LATIN SMALL LETTER A WITH TILDE +0xE4 0x00E4 # LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x00E5 # LATIN SMALL LETTER A WITH RING ABOVE +0xE6 0x00E6 # LATIN SMALL LETTER AE +0xE7 0x00E7 # LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x00E8 # LATIN SMALL LETTER E WITH GRAVE +0xE9 0x00E9 # LATIN SMALL LETTER E WITH ACUTE +0xEA 0x00EA # LATIN SMALL LETTER E WITH CIRCUMFLEX +0xEB 0x00EB # LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x00EC # LATIN SMALL LETTER I WITH GRAVE +0xED 0x00ED # LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE # LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x00EF # LATIN SMALL LETTER I WITH DIAERESIS +0xF0 0x00F0 # LATIN SMALL LETTER ETH (Icelandic) +0xF1 0x00F1 # LATIN SMALL LETTER N WITH TILDE +0xF2 0x00F2 # LATIN SMALL LETTER O WITH GRAVE +0xF3 0x00F3 # LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 # LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x00F5 # LATIN SMALL LETTER O WITH TILDE +0xF6 0x00F6 # LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 # DIVISION SIGN +0xF8 0x00F8 # LATIN SMALL LETTER O WITH STROKE +0xF9 0x00F9 # LATIN SMALL LETTER U WITH GRAVE +0xFA 0x00FA # LATIN SMALL LETTER U WITH ACUTE +0xFB 0x00FB # LATIN SMALL LETTER U WITH CIRCUMFLEX +0xFC 0x00FC # LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD # LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x00FE # LATIN SMALL LETTER THORN (Icelandic) +0xFF 0x00FF # LATIN SMALL LETTER Y WITH DIAERESIS diff --git a/scripts/CP1250.TXT b/scripts/CP1250.TXT new file mode 100644 index 00000000..6bfab938 --- /dev/null +++ b/scripts/CP1250.TXT @@ -0,0 +1,274 @@ +# +# Name: cp1250 to Unicode table +# Unicode version: 2.0 +# Table version: 2.01 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: Shawn.Steele@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1250 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1250 order +# +0x00 
0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN 
CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x20AC #EURO SIGN +0x81 #UNDEFINED +0x82 
0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 #UNDEFINED +0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 #UNDEFINED +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x015A #LATIN CAPITAL LETTER S WITH ACUTE +0x8D 0x0164 #LATIN CAPITAL LETTER T WITH CARON +0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON +0x8F 0x0179 #LATIN CAPITAL LETTER Z WITH ACUTE +0x90 #UNDEFINED +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x015B #LATIN SMALL LETTER S WITH ACUTE +0x9D 0x0165 #LATIN SMALL LETTER T WITH CARON +0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON +0x9F 0x017A #LATIN SMALL LETTER Z WITH ACUTE +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x02C7 #CARON +0xA2 0x02D8 #BREVE +0xA3 0x0141 #LATIN CAPITAL LETTER L WITH STROKE +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x0104 #LATIN CAPITAL LETTER A WITH OGONEK +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x00A8 #DIAERESIS +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x015E #LATIN CAPITAL LETTER S WITH CEDILLA +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x017B #LATIN CAPITAL LETTER Z WITH DOT ABOVE +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x02DB #OGONEK +0xB3 0x0142 #LATIN SMALL LETTER L WITH STROKE +0xB4 0x00B4 #ACUTE ACCENT +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x00B8 #CEDILLA +0xB9 0x0105 #LATIN SMALL LETTER A WITH OGONEK +0xBA 0x015F #LATIN SMALL LETTER S WITH CEDILLA +0xBB 0x00BB 
#RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x013D #LATIN CAPITAL LETTER L WITH CARON +0xBD 0x02DD #DOUBLE ACUTE ACCENT +0xBE 0x013E #LATIN SMALL LETTER L WITH CARON +0xBF 0x017C #LATIN SMALL LETTER Z WITH DOT ABOVE +0xC0 0x0154 #LATIN CAPITAL LETTER R WITH ACUTE +0xC1 0x00C1 #LATIN CAPITAL LETTER A WITH ACUTE +0xC2 0x00C2 #LATIN CAPITAL LETTER A WITH CIRCUMFLEX +0xC3 0x0102 #LATIN CAPITAL LETTER A WITH BREVE +0xC4 0x00C4 #LATIN CAPITAL LETTER A WITH DIAERESIS +0xC5 0x0139 #LATIN CAPITAL LETTER L WITH ACUTE +0xC6 0x0106 #LATIN CAPITAL LETTER C WITH ACUTE +0xC7 0x00C7 #LATIN CAPITAL LETTER C WITH CEDILLA +0xC8 0x010C #LATIN CAPITAL LETTER C WITH CARON +0xC9 0x00C9 #LATIN CAPITAL LETTER E WITH ACUTE +0xCA 0x0118 #LATIN CAPITAL LETTER E WITH OGONEK +0xCB 0x00CB #LATIN CAPITAL LETTER E WITH DIAERESIS +0xCC 0x011A #LATIN CAPITAL LETTER E WITH CARON +0xCD 0x00CD #LATIN CAPITAL LETTER I WITH ACUTE +0xCE 0x00CE #LATIN CAPITAL LETTER I WITH CIRCUMFLEX +0xCF 0x010E #LATIN CAPITAL LETTER D WITH CARON +0xD0 0x0110 #LATIN CAPITAL LETTER D WITH STROKE +0xD1 0x0143 #LATIN CAPITAL LETTER N WITH ACUTE +0xD2 0x0147 #LATIN CAPITAL LETTER N WITH CARON +0xD3 0x00D3 #LATIN CAPITAL LETTER O WITH ACUTE +0xD4 0x00D4 #LATIN CAPITAL LETTER O WITH CIRCUMFLEX +0xD5 0x0150 #LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0xD6 0x00D6 #LATIN CAPITAL LETTER O WITH DIAERESIS +0xD7 0x00D7 #MULTIPLICATION SIGN +0xD8 0x0158 #LATIN CAPITAL LETTER R WITH CARON +0xD9 0x016E #LATIN CAPITAL LETTER U WITH RING ABOVE +0xDA 0x00DA #LATIN CAPITAL LETTER U WITH ACUTE +0xDB 0x0170 #LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0xDC 0x00DC #LATIN CAPITAL LETTER U WITH DIAERESIS +0xDD 0x00DD #LATIN CAPITAL LETTER Y WITH ACUTE +0xDE 0x0162 #LATIN CAPITAL LETTER T WITH CEDILLA +0xDF 0x00DF #LATIN SMALL LETTER SHARP S +0xE0 0x0155 #LATIN SMALL LETTER R WITH ACUTE +0xE1 0x00E1 #LATIN SMALL LETTER A WITH ACUTE +0xE2 0x00E2 #LATIN SMALL LETTER A WITH CIRCUMFLEX +0xE3 0x0103 #LATIN SMALL LETTER A WITH BREVE +0xE4 0x00E4 
#LATIN SMALL LETTER A WITH DIAERESIS +0xE5 0x013A #LATIN SMALL LETTER L WITH ACUTE +0xE6 0x0107 #LATIN SMALL LETTER C WITH ACUTE +0xE7 0x00E7 #LATIN SMALL LETTER C WITH CEDILLA +0xE8 0x010D #LATIN SMALL LETTER C WITH CARON +0xE9 0x00E9 #LATIN SMALL LETTER E WITH ACUTE +0xEA 0x0119 #LATIN SMALL LETTER E WITH OGONEK +0xEB 0x00EB #LATIN SMALL LETTER E WITH DIAERESIS +0xEC 0x011B #LATIN SMALL LETTER E WITH CARON +0xED 0x00ED #LATIN SMALL LETTER I WITH ACUTE +0xEE 0x00EE #LATIN SMALL LETTER I WITH CIRCUMFLEX +0xEF 0x010F #LATIN SMALL LETTER D WITH CARON +0xF0 0x0111 #LATIN SMALL LETTER D WITH STROKE +0xF1 0x0144 #LATIN SMALL LETTER N WITH ACUTE +0xF2 0x0148 #LATIN SMALL LETTER N WITH CARON +0xF3 0x00F3 #LATIN SMALL LETTER O WITH ACUTE +0xF4 0x00F4 #LATIN SMALL LETTER O WITH CIRCUMFLEX +0xF5 0x0151 #LATIN SMALL LETTER O WITH DOUBLE ACUTE +0xF6 0x00F6 #LATIN SMALL LETTER O WITH DIAERESIS +0xF7 0x00F7 #DIVISION SIGN +0xF8 0x0159 #LATIN SMALL LETTER R WITH CARON +0xF9 0x016F #LATIN SMALL LETTER U WITH RING ABOVE +0xFA 0x00FA #LATIN SMALL LETTER U WITH ACUTE +0xFB 0x0171 #LATIN SMALL LETTER U WITH DOUBLE ACUTE +0xFC 0x00FC #LATIN SMALL LETTER U WITH DIAERESIS +0xFD 0x00FD #LATIN SMALL LETTER Y WITH ACUTE +0xFE 0x0163 #LATIN SMALL LETTER T WITH CEDILLA +0xFF 0x02D9 #DOT ABOVE diff --git a/scripts/CP1251.TXT b/scripts/CP1251.TXT new file mode 100644 index 00000000..4d9b3558 --- /dev/null +++ b/scripts/CP1251.TXT @@ -0,0 +1,274 @@ +# +# Name: cp1251 to Unicode table +# Unicode version: 2.0 +# Table version: 2.01 +# Table format: Format A +# Date: 04/15/98 +# +# Contact: Shawn.Steele@microsoft.com +# +# General notes: none +# +# Format: Three tab-separated columns +# Column #1 is the cp1251 code (in hex) +# Column #2 is the Unicode (in hex as 0xXXXX) +# Column #3 is the Unicode name (follows a comment sign, '#') +# +# The entries are in cp1251 order +# +0x00 0x0000 #NULL +0x01 0x0001 #START OF HEADING +0x02 0x0002 #START OF TEXT +0x03 0x0003 #END OF TEXT +0x04 0x0004 #END OF 
TRANSMISSION +0x05 0x0005 #ENQUIRY +0x06 0x0006 #ACKNOWLEDGE +0x07 0x0007 #BELL +0x08 0x0008 #BACKSPACE +0x09 0x0009 #HORIZONTAL TABULATION +0x0A 0x000A #LINE FEED +0x0B 0x000B #VERTICAL TABULATION +0x0C 0x000C #FORM FEED +0x0D 0x000D #CARRIAGE RETURN +0x0E 0x000E #SHIFT OUT +0x0F 0x000F #SHIFT IN +0x10 0x0010 #DATA LINK ESCAPE +0x11 0x0011 #DEVICE CONTROL ONE +0x12 0x0012 #DEVICE CONTROL TWO +0x13 0x0013 #DEVICE CONTROL THREE +0x14 0x0014 #DEVICE CONTROL FOUR +0x15 0x0015 #NEGATIVE ACKNOWLEDGE +0x16 0x0016 #SYNCHRONOUS IDLE +0x17 0x0017 #END OF TRANSMISSION BLOCK +0x18 0x0018 #CANCEL +0x19 0x0019 #END OF MEDIUM +0x1A 0x001A #SUBSTITUTE +0x1B 0x001B #ESCAPE +0x1C 0x001C #FILE SEPARATOR +0x1D 0x001D #GROUP SEPARATOR +0x1E 0x001E #RECORD SEPARATOR +0x1F 0x001F #UNIT SEPARATOR +0x20 0x0020 #SPACE +0x21 0x0021 #EXCLAMATION MARK +0x22 0x0022 #QUOTATION MARK +0x23 0x0023 #NUMBER SIGN +0x24 0x0024 #DOLLAR SIGN +0x25 0x0025 #PERCENT SIGN +0x26 0x0026 #AMPERSAND +0x27 0x0027 #APOSTROPHE +0x28 0x0028 #LEFT PARENTHESIS +0x29 0x0029 #RIGHT PARENTHESIS +0x2A 0x002A #ASTERISK +0x2B 0x002B #PLUS SIGN +0x2C 0x002C #COMMA +0x2D 0x002D #HYPHEN-MINUS +0x2E 0x002E #FULL STOP +0x2F 0x002F #SOLIDUS +0x30 0x0030 #DIGIT ZERO +0x31 0x0031 #DIGIT ONE +0x32 0x0032 #DIGIT TWO +0x33 0x0033 #DIGIT THREE +0x34 0x0034 #DIGIT FOUR +0x35 0x0035 #DIGIT FIVE +0x36 0x0036 #DIGIT SIX +0x37 0x0037 #DIGIT SEVEN +0x38 0x0038 #DIGIT EIGHT +0x39 0x0039 #DIGIT NINE +0x3A 0x003A #COLON +0x3B 0x003B #SEMICOLON +0x3C 0x003C #LESS-THAN SIGN +0x3D 0x003D #EQUALS SIGN +0x3E 0x003E #GREATER-THAN SIGN +0x3F 0x003F #QUESTION MARK +0x40 0x0040 #COMMERCIAL AT +0x41 0x0041 #LATIN CAPITAL LETTER A +0x42 0x0042 #LATIN CAPITAL LETTER B +0x43 0x0043 #LATIN CAPITAL LETTER C +0x44 0x0044 #LATIN CAPITAL LETTER D +0x45 0x0045 #LATIN CAPITAL LETTER E +0x46 0x0046 #LATIN CAPITAL LETTER F +0x47 0x0047 #LATIN CAPITAL LETTER G +0x48 0x0048 #LATIN CAPITAL LETTER H +0x49 0x0049 #LATIN CAPITAL LETTER I +0x4A 0x004A #LATIN CAPITAL 
LETTER J +0x4B 0x004B #LATIN CAPITAL LETTER K +0x4C 0x004C #LATIN CAPITAL LETTER L +0x4D 0x004D #LATIN CAPITAL LETTER M +0x4E 0x004E #LATIN CAPITAL LETTER N +0x4F 0x004F #LATIN CAPITAL LETTER O +0x50 0x0050 #LATIN CAPITAL LETTER P +0x51 0x0051 #LATIN CAPITAL LETTER Q +0x52 0x0052 #LATIN CAPITAL LETTER R +0x53 0x0053 #LATIN CAPITAL LETTER S +0x54 0x0054 #LATIN CAPITAL LETTER T +0x55 0x0055 #LATIN CAPITAL LETTER U +0x56 0x0056 #LATIN CAPITAL LETTER V +0x57 0x0057 #LATIN CAPITAL LETTER W +0x58 0x0058 #LATIN CAPITAL LETTER X +0x59 0x0059 #LATIN CAPITAL LETTER Y +0x5A 0x005A #LATIN CAPITAL LETTER Z +0x5B 0x005B #LEFT SQUARE BRACKET +0x5C 0x005C #REVERSE SOLIDUS +0x5D 0x005D #RIGHT SQUARE BRACKET +0x5E 0x005E #CIRCUMFLEX ACCENT +0x5F 0x005F #LOW LINE +0x60 0x0060 #GRAVE ACCENT +0x61 0x0061 #LATIN SMALL LETTER A +0x62 0x0062 #LATIN SMALL LETTER B +0x63 0x0063 #LATIN SMALL LETTER C +0x64 0x0064 #LATIN SMALL LETTER D +0x65 0x0065 #LATIN SMALL LETTER E +0x66 0x0066 #LATIN SMALL LETTER F +0x67 0x0067 #LATIN SMALL LETTER G +0x68 0x0068 #LATIN SMALL LETTER H +0x69 0x0069 #LATIN SMALL LETTER I +0x6A 0x006A #LATIN SMALL LETTER J +0x6B 0x006B #LATIN SMALL LETTER K +0x6C 0x006C #LATIN SMALL LETTER L +0x6D 0x006D #LATIN SMALL LETTER M +0x6E 0x006E #LATIN SMALL LETTER N +0x6F 0x006F #LATIN SMALL LETTER O +0x70 0x0070 #LATIN SMALL LETTER P +0x71 0x0071 #LATIN SMALL LETTER Q +0x72 0x0072 #LATIN SMALL LETTER R +0x73 0x0073 #LATIN SMALL LETTER S +0x74 0x0074 #LATIN SMALL LETTER T +0x75 0x0075 #LATIN SMALL LETTER U +0x76 0x0076 #LATIN SMALL LETTER V +0x77 0x0077 #LATIN SMALL LETTER W +0x78 0x0078 #LATIN SMALL LETTER X +0x79 0x0079 #LATIN SMALL LETTER Y +0x7A 0x007A #LATIN SMALL LETTER Z +0x7B 0x007B #LEFT CURLY BRACKET +0x7C 0x007C #VERTICAL LINE +0x7D 0x007D #RIGHT CURLY BRACKET +0x7E 0x007E #TILDE +0x7F 0x007F #DELETE +0x80 0x0402 #CYRILLIC CAPITAL LETTER DJE +0x81 0x0403 #CYRILLIC CAPITAL LETTER GJE +0x82 0x201A #SINGLE LOW-9 QUOTATION MARK +0x83 0x0453 #CYRILLIC SMALL LETTER GJE +0x84 
0x201E #DOUBLE LOW-9 QUOTATION MARK +0x85 0x2026 #HORIZONTAL ELLIPSIS +0x86 0x2020 #DAGGER +0x87 0x2021 #DOUBLE DAGGER +0x88 0x20AC #EURO SIGN +0x89 0x2030 #PER MILLE SIGN +0x8A 0x0409 #CYRILLIC CAPITAL LETTER LJE +0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK +0x8C 0x040A #CYRILLIC CAPITAL LETTER NJE +0x8D 0x040C #CYRILLIC CAPITAL LETTER KJE +0x8E 0x040B #CYRILLIC CAPITAL LETTER TSHE +0x8F 0x040F #CYRILLIC CAPITAL LETTER DZHE +0x90 0x0452 #CYRILLIC SMALL LETTER DJE +0x91 0x2018 #LEFT SINGLE QUOTATION MARK +0x92 0x2019 #RIGHT SINGLE QUOTATION MARK +0x93 0x201C #LEFT DOUBLE QUOTATION MARK +0x94 0x201D #RIGHT DOUBLE QUOTATION MARK +0x95 0x2022 #BULLET +0x96 0x2013 #EN DASH +0x97 0x2014 #EM DASH +0x98 #UNDEFINED +0x99 0x2122 #TRADE MARK SIGN +0x9A 0x0459 #CYRILLIC SMALL LETTER LJE +0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +0x9C 0x045A #CYRILLIC SMALL LETTER NJE +0x9D 0x045C #CYRILLIC SMALL LETTER KJE +0x9E 0x045B #CYRILLIC SMALL LETTER TSHE +0x9F 0x045F #CYRILLIC SMALL LETTER DZHE +0xA0 0x00A0 #NO-BREAK SPACE +0xA1 0x040E #CYRILLIC CAPITAL LETTER SHORT U +0xA2 0x045E #CYRILLIC SMALL LETTER SHORT U +0xA3 0x0408 #CYRILLIC CAPITAL LETTER JE +0xA4 0x00A4 #CURRENCY SIGN +0xA5 0x0490 #CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0xA6 0x00A6 #BROKEN BAR +0xA7 0x00A7 #SECTION SIGN +0xA8 0x0401 #CYRILLIC CAPITAL LETTER IO +0xA9 0x00A9 #COPYRIGHT SIGN +0xAA 0x0404 #CYRILLIC CAPITAL LETTER UKRAINIAN IE +0xAB 0x00AB #LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +0xAC 0x00AC #NOT SIGN +0xAD 0x00AD #SOFT HYPHEN +0xAE 0x00AE #REGISTERED SIGN +0xAF 0x0407 #CYRILLIC CAPITAL LETTER YI +0xB0 0x00B0 #DEGREE SIGN +0xB1 0x00B1 #PLUS-MINUS SIGN +0xB2 0x0406 #CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0xB3 0x0456 #CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0xB4 0x0491 #CYRILLIC SMALL LETTER GHE WITH UPTURN +0xB5 0x00B5 #MICRO SIGN +0xB6 0x00B6 #PILCROW SIGN +0xB7 0x00B7 #MIDDLE DOT +0xB8 0x0451 #CYRILLIC SMALL LETTER IO +0xB9 0x2116 #NUMERO SIGN +0xBA 0x0454 
#CYRILLIC SMALL LETTER UKRAINIAN IE +0xBB 0x00BB #RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0xBC 0x0458 #CYRILLIC SMALL LETTER JE +0xBD 0x0405 #CYRILLIC CAPITAL LETTER DZE +0xBE 0x0455 #CYRILLIC SMALL LETTER DZE +0xBF 0x0457 #CYRILLIC SMALL LETTER YI +0xC0 0x0410 #CYRILLIC CAPITAL LETTER A +0xC1 0x0411 #CYRILLIC CAPITAL LETTER BE +0xC2 0x0412 #CYRILLIC CAPITAL LETTER VE +0xC3 0x0413 #CYRILLIC CAPITAL LETTER GHE +0xC4 0x0414 #CYRILLIC CAPITAL LETTER DE +0xC5 0x0415 #CYRILLIC CAPITAL LETTER IE +0xC6 0x0416 #CYRILLIC CAPITAL LETTER ZHE +0xC7 0x0417 #CYRILLIC CAPITAL LETTER ZE +0xC8 0x0418 #CYRILLIC CAPITAL LETTER I +0xC9 0x0419 #CYRILLIC CAPITAL LETTER SHORT I +0xCA 0x041A #CYRILLIC CAPITAL LETTER KA +0xCB 0x041B #CYRILLIC CAPITAL LETTER EL +0xCC 0x041C #CYRILLIC CAPITAL LETTER EM +0xCD 0x041D #CYRILLIC CAPITAL LETTER EN +0xCE 0x041E #CYRILLIC CAPITAL LETTER O +0xCF 0x041F #CYRILLIC CAPITAL LETTER PE +0xD0 0x0420 #CYRILLIC CAPITAL LETTER ER +0xD1 0x0421 #CYRILLIC CAPITAL LETTER ES +0xD2 0x0422 #CYRILLIC CAPITAL LETTER TE +0xD3 0x0423 #CYRILLIC CAPITAL LETTER U +0xD4 0x0424 #CYRILLIC CAPITAL LETTER EF +0xD5 0x0425 #CYRILLIC CAPITAL LETTER HA +0xD6 0x0426 #CYRILLIC CAPITAL LETTER TSE +0xD7 0x0427 #CYRILLIC CAPITAL LETTER CHE +0xD8 0x0428 #CYRILLIC CAPITAL LETTER SHA +0xD9 0x0429 #CYRILLIC CAPITAL LETTER SHCHA +0xDA 0x042A #CYRILLIC CAPITAL LETTER HARD SIGN +0xDB 0x042B #CYRILLIC CAPITAL LETTER YERU +0xDC 0x042C #CYRILLIC CAPITAL LETTER SOFT SIGN +0xDD 0x042D #CYRILLIC CAPITAL LETTER E +0xDE 0x042E #CYRILLIC CAPITAL LETTER YU +0xDF 0x042F #CYRILLIC CAPITAL LETTER YA +0xE0 0x0430 #CYRILLIC SMALL LETTER A +0xE1 0x0431 #CYRILLIC SMALL LETTER BE +0xE2 0x0432 #CYRILLIC SMALL LETTER VE +0xE3 0x0433 #CYRILLIC SMALL LETTER GHE +0xE4 0x0434 #CYRILLIC SMALL LETTER DE +0xE5 0x0435 #CYRILLIC SMALL LETTER IE +0xE6 0x0436 #CYRILLIC SMALL LETTER ZHE +0xE7 0x0437 #CYRILLIC SMALL LETTER ZE +0xE8 0x0438 #CYRILLIC SMALL LETTER I +0xE9 0x0439 #CYRILLIC SMALL LETTER SHORT I +0xEA 0x043A 
#CYRILLIC SMALL LETTER KA +0xEB 0x043B #CYRILLIC SMALL LETTER EL +0xEC 0x043C #CYRILLIC SMALL LETTER EM +0xED 0x043D #CYRILLIC SMALL LETTER EN +0xEE 0x043E #CYRILLIC SMALL LETTER O +0xEF 0x043F #CYRILLIC SMALL LETTER PE +0xF0 0x0440 #CYRILLIC SMALL LETTER ER +0xF1 0x0441 #CYRILLIC SMALL LETTER ES +0xF2 0x0442 #CYRILLIC SMALL LETTER TE +0xF3 0x0443 #CYRILLIC SMALL LETTER U +0xF4 0x0444 #CYRILLIC SMALL LETTER EF +0xF5 0x0445 #CYRILLIC SMALL LETTER HA +0xF6 0x0446 #CYRILLIC SMALL LETTER TSE +0xF7 0x0447 #CYRILLIC SMALL LETTER CHE +0xF8 0x0448 #CYRILLIC SMALL LETTER SHA +0xF9 0x0449 #CYRILLIC SMALL LETTER SHCHA +0xFA 0x044A #CYRILLIC SMALL LETTER HARD SIGN +0xFB 0x044B #CYRILLIC SMALL LETTER YERU +0xFC 0x044C #CYRILLIC SMALL LETTER SOFT SIGN +0xFD 0x044D #CYRILLIC SMALL LETTER E +0xFE 0x044E #CYRILLIC SMALL LETTER YU +0xFF 0x044F #CYRILLIC SMALL LETTER YA diff --git a/scripts/makeencoding.py b/scripts/makeencoding.py index 3656a64b..cf24b57c 100644 --- a/scripts/makeencoding.py +++ b/scripts/makeencoding.py @@ -1,17 +1,37 @@ # Convert unicode mapping table to C arrays mapping glyph names and unicode values. 
# # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT +# ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT # ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT +# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT +# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT # ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT # +BANNED = [ + "controlSTX", "controlSOT", "controlETX", "controlEOT", "controlENQ", + "controlACK", "controlBEL", "controlBS", "controlHT", "controlLF", + "controlVT", "controlFF", "controlCR", "controlSO", "controlSI", + "controlDLE", "controlDC1", "controlDC2", "controlDC3", "controlDC4", + "controlNAK", "controlSYN", "controlETB", "controlCAN", "controlEM", + "controlSUB", "controlESC", "controlFS", "controlGS", "controlRS", + "controlUS", + "SF100000", "SF110000", "SF010000", "SF030000", "SF020000", "SF040000", + "SF080000", "SF090000", "SF060000", "SF070000", "SF050000", "SF430000", + "SF240000", "SF510000", "SF390000", "SF250000", "SF500000", "SF490000", + "SF380000", "SF280000", "SF260000", "SF360000", "SF370000", "SF420000", + "SF190000", "SF230000", "SF410000", "SF450000", "SF460000", "SF400000", + "SF540000", "SF440000", +] + glyphs = {} for line in open("scripts/glyphlist.txt").readlines(): if line[0] != '#': n, u = line.rstrip().split(';') if len(u) == 4: u = int(u, base=16) - glyphs[u] = n + if u not in glyphs and n not in BANNED: + glyphs[u] = n def load_table(fn): table = [0] * 256 @@ -25,15 +45,21 @@ def load_table(fn): return table def dump_table(name, table): - print "const char *pdf_glyph_name_from_%s[%d] = {" % (name, len(table)) + print "unsigned short fz_unicode_from_%s[256] = {" % name + for u in table: + print '\t%d,' % u + print "};" + print + + print "const char *fz_glyph_name_from_%s[%d] = {" % (name, len(table)) for u in table: if u in glyphs: - print '"%s",' % glyphs[u] + print '\t"%s",' % glyphs[u] else: - print '_notdef,' + print 
'\t_notdef,' print "};" print - print "static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name + rev = [] i = 0 for u in table: @@ -42,11 +68,16 @@ def dump_table(name, table): rev += ['{0x%04x,%d},' % (u, i)] i = i + 1 rev.sort() + + print "static const struct { unsigned short u, c; } %s_from_unicode[] = {" % name for s in rev: - print s + print "\t" + s print "};" print -dump_table("koi8u", load_table("scripts/KOI8-U.TXT")) +dump_table("iso8859_1", load_table("scripts/8859-1.TXT")) dump_table("iso8859_7", load_table("scripts/8859-7.TXT")) -dump_table("winansi", load_table("scripts/CP1252.TXT")) +dump_table("koi8u", load_table("scripts/KOI8-U.TXT")) +dump_table("windows_1250", load_table("scripts/CP1250.TXT")) +dump_table("windows_1251", load_table("scripts/CP1251.TXT")) +dump_table("windows_1252", load_table("scripts/CP1252.TXT")) diff --git a/source/fitz/encodings.c b/source/fitz/encodings.c new file mode 100644 index 00000000..004ae90f --- /dev/null +++ b/source/fitz/encodings.c @@ -0,0 +1,33 @@ +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +#include "encodings.h" + +#include +#include + +#define FROM_UNICODE(ENC) \ +{ \ + int l = 0; \ + int r = nelem(ENC##_from_unicode) - 1; \ + if (u < 128) \ + return u; \ + while (l <= r) \ + { \ + int m = (l + r) >> 1; \ + if (u < ENC##_from_unicode[m].u) \ + r = m - 1; \ + else if (u > ENC##_from_unicode[m].u) \ + l = m + 1; \ + else \ + return ENC##_from_unicode[m].c; \ + } \ + return -1; \ +} + +int fz_iso8859_1_from_unicode(int u) FROM_UNICODE(iso8859_1) +int fz_iso8859_7_from_unicode(int u) FROM_UNICODE(iso8859_7) +int fz_koi8u_from_unicode(int u) FROM_UNICODE(koi8u) +int fz_windows_1250_from_unicode(int u) FROM_UNICODE(windows_1250) +int fz_windows_1251_from_unicode(int u) FROM_UNICODE(windows_1251) +int fz_windows_1252_from_unicode(int u) FROM_UNICODE(windows_1252) diff --git a/source/fitz/encodings.h b/source/fitz/encodings.h new file mode 100644 index 00000000..89f81e8f --- /dev/null +++ 
b/source/fitz/encodings.h @@ -0,0 +1,3784 @@ +#define _notdef NULL + +unsigned short fz_unicode_from_iso8859_1[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, +}; + +const char *fz_glyph_name_from_iso8859_1[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + 
_notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "nbspace", + "exclamdown", + "cent", + "sterling", + "currency", + "yen", + "brokenbar", + "section", + "dieresis", + "copyright", + "ordfeminine", + "guillemotleft", + "logicalnot", + "sfthyphen", + "registered", + "macron", + "degree", + "plusminus", + "twosuperior", + "threesuperior", + "acute", + "mu", + "paragraph", + "middot", + "cedilla", + "onesuperior", + "ordmasculine", + "guillemotright", + "onequarter", + "onehalf", + "threequarters", + "questiondown", + "Agrave", + "Aacute", + 
"Acircumflex", + "Atilde", + "Adieresis", + "Aring", + "AE", + "Ccedilla", + "Egrave", + "Eacute", + "Ecircumflex", + "Edieresis", + "Igrave", + "Iacute", + "Icircumflex", + "Idieresis", + "Eth", + "Ntilde", + "Ograve", + "Oacute", + "Ocircumflex", + "Otilde", + "Odieresis", + "multiply", + "Oslash", + "Ugrave", + "Uacute", + "Ucircumflex", + "Udieresis", + "Yacute", + "Thorn", + "germandbls", + "agrave", + "aacute", + "acircumflex", + "atilde", + "adieresis", + "aring", + "ae", + "ccedilla", + "egrave", + "eacute", + "ecircumflex", + "edieresis", + "igrave", + "iacute", + "icircumflex", + "idieresis", + "eth", + "ntilde", + "ograve", + "oacute", + "ocircumflex", + "otilde", + "odieresis", + "divide", + "oslash", + "ugrave", + "uacute", + "ucircumflex", + "udieresis", + "yacute", + "thorn", + "ydieresis", +}; + +static const struct { unsigned short u, c; } iso8859_1_from_unicode[] = { + {0x00a0,160}, + {0x00a1,161}, + {0x00a2,162}, + {0x00a3,163}, + {0x00a4,164}, + {0x00a5,165}, + {0x00a6,166}, + {0x00a7,167}, + {0x00a8,168}, + {0x00a9,169}, + {0x00aa,170}, + {0x00ab,171}, + {0x00ac,172}, + {0x00ad,173}, + {0x00ae,174}, + {0x00af,175}, + {0x00b0,176}, + {0x00b1,177}, + {0x00b2,178}, + {0x00b3,179}, + {0x00b4,180}, + {0x00b5,181}, + {0x00b6,182}, + {0x00b7,183}, + {0x00b8,184}, + {0x00b9,185}, + {0x00ba,186}, + {0x00bb,187}, + {0x00bc,188}, + {0x00bd,189}, + {0x00be,190}, + {0x00bf,191}, + {0x00c0,192}, + {0x00c1,193}, + {0x00c2,194}, + {0x00c3,195}, + {0x00c4,196}, + {0x00c5,197}, + {0x00c6,198}, + {0x00c7,199}, + {0x00c8,200}, + {0x00c9,201}, + {0x00ca,202}, + {0x00cb,203}, + {0x00cc,204}, + {0x00cd,205}, + {0x00ce,206}, + {0x00cf,207}, + {0x00d0,208}, + {0x00d1,209}, + {0x00d2,210}, + {0x00d3,211}, + {0x00d4,212}, + {0x00d5,213}, + {0x00d6,214}, + {0x00d7,215}, + {0x00d8,216}, + {0x00d9,217}, + {0x00da,218}, + {0x00db,219}, + {0x00dc,220}, + {0x00dd,221}, + {0x00de,222}, + {0x00df,223}, + {0x00e0,224}, + {0x00e1,225}, + {0x00e2,226}, + {0x00e3,227}, + 
{0x00e4,228}, + {0x00e5,229}, + {0x00e6,230}, + {0x00e7,231}, + {0x00e8,232}, + {0x00e9,233}, + {0x00ea,234}, + {0x00eb,235}, + {0x00ec,236}, + {0x00ed,237}, + {0x00ee,238}, + {0x00ef,239}, + {0x00f0,240}, + {0x00f1,241}, + {0x00f2,242}, + {0x00f3,243}, + {0x00f4,244}, + {0x00f5,245}, + {0x00f6,246}, + {0x00f7,247}, + {0x00f8,248}, + {0x00f9,249}, + {0x00fa,250}, + {0x00fb,251}, + {0x00fc,252}, + {0x00fd,253}, + {0x00fe,254}, + {0x00ff,255}, +}; + +unsigned short fz_unicode_from_iso8859_7[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 128, + 129, + 130, + 131, + 132, + 133, + 134, + 135, + 136, + 137, + 138, + 139, + 140, + 141, + 142, + 143, + 144, + 145, + 146, + 147, + 148, + 149, + 150, + 151, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 8216, + 8217, + 163, + 8364, + 8367, + 166, + 167, + 168, + 169, + 890, + 171, + 172, + 173, + 0, + 8213, + 176, + 177, + 178, + 179, + 900, + 901, + 902, + 183, + 904, + 905, + 906, + 187, + 908, + 189, + 910, + 911, + 912, + 913, + 914, + 915, + 916, + 917, + 918, + 919, + 920, + 921, + 922, + 923, + 924, + 925, + 926, + 927, + 928, + 929, + 0, + 931, + 932, + 933, + 934, + 935, + 936, + 937, + 938, + 939, + 940, + 941, + 942, + 943, + 944, + 945, + 946, + 947, + 948, + 
949, + 950, + 951, + 952, + 953, + 954, + 955, + 956, + 957, + 958, + 959, + 960, + 961, + 962, + 963, + 964, + 965, + 966, + 967, + 968, + 969, + 970, + 971, + 972, + 973, + 974, + 0, +}; + +const char *fz_glyph_name_from_iso8859_7[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "nbspace", + "quoteleft", + "quoteright", + "sterling", + "Euro", + _notdef, + "brokenbar", + "section", + "dieresis", + 
"copyright", + "ypogegrammeni", + "guillemotleft", + "logicalnot", + "sfthyphen", + _notdef, + "afii00208", + "degree", + "plusminus", + "twosuperior", + "threesuperior", + "tonos", + "dialytikatonos", + "Alphatonos", + "middot", + "Epsilontonos", + "Etatonos", + "Iotatonos", + "guillemotright", + "Omicrontonos", + "onehalf", + "Upsilontonos", + "Omegatonos", + "iotadieresistonos", + "Alpha", + "Beta", + "Gamma", + "Deltagreek", + "Epsilon", + "Zeta", + "Eta", + "Theta", + "Iota", + "Kappa", + "Lambda", + "Mu", + "Nu", + "Xi", + "Omicron", + "Pi", + "Rho", + _notdef, + "Sigma", + "Tau", + "Upsilon", + "Phi", + "Chi", + "Psi", + "Omegagreek", + "Iotadieresis", + "Upsilondieresis", + "alphatonos", + "epsilontonos", + "etatonos", + "iotatonos", + "upsilondieresistonos", + "alpha", + "beta", + "gamma", + "delta", + "epsilon", + "zeta", + "eta", + "theta", + "iota", + "kappa", + "lambda", + "mugreek", + "nu", + "xi", + "omicron", + "pi", + "rho", + "sigma1", + "sigma", + "tau", + "upsilon", + "phi", + "chi", + "psi", + "omega", + "iotadieresis", + "upsilondieresis", + "omicrontonos", + "upsilontonos", + "omegatonos", + _notdef, +}; + +static const struct { unsigned short u, c; } iso8859_7_from_unicode[] = { + {0x00a0,160}, + {0x00a3,163}, + {0x00a6,166}, + {0x00a7,167}, + {0x00a8,168}, + {0x00a9,169}, + {0x00ab,171}, + {0x00ac,172}, + {0x00ad,173}, + {0x00b0,176}, + {0x00b1,177}, + {0x00b2,178}, + {0x00b3,179}, + {0x00b7,183}, + {0x00bb,187}, + {0x00bd,189}, + {0x037a,170}, + {0x0384,180}, + {0x0385,181}, + {0x0386,182}, + {0x0388,184}, + {0x0389,185}, + {0x038a,186}, + {0x038c,188}, + {0x038e,190}, + {0x038f,191}, + {0x0390,192}, + {0x0391,193}, + {0x0392,194}, + {0x0393,195}, + {0x0394,196}, + {0x0395,197}, + {0x0396,198}, + {0x0397,199}, + {0x0398,200}, + {0x0399,201}, + {0x039a,202}, + {0x039b,203}, + {0x039c,204}, + {0x039d,205}, + {0x039e,206}, + {0x039f,207}, + {0x03a0,208}, + {0x03a1,209}, + {0x03a3,211}, + {0x03a4,212}, + {0x03a5,213}, + {0x03a6,214}, + 
{0x03a7,215}, + {0x03a8,216}, + {0x03a9,217}, + {0x03aa,218}, + {0x03ab,219}, + {0x03ac,220}, + {0x03ad,221}, + {0x03ae,222}, + {0x03af,223}, + {0x03b0,224}, + {0x03b1,225}, + {0x03b2,226}, + {0x03b3,227}, + {0x03b4,228}, + {0x03b5,229}, + {0x03b6,230}, + {0x03b7,231}, + {0x03b8,232}, + {0x03b9,233}, + {0x03ba,234}, + {0x03bb,235}, + {0x03bc,236}, + {0x03bd,237}, + {0x03be,238}, + {0x03bf,239}, + {0x03c0,240}, + {0x03c1,241}, + {0x03c2,242}, + {0x03c3,243}, + {0x03c4,244}, + {0x03c5,245}, + {0x03c6,246}, + {0x03c7,247}, + {0x03c8,248}, + {0x03c9,249}, + {0x03ca,250}, + {0x03cb,251}, + {0x03cc,252}, + {0x03cd,253}, + {0x03ce,254}, + {0x2015,175}, + {0x2018,161}, + {0x2019,162}, + {0x20ac,164}, +}; + +unsigned short fz_unicode_from_koi8u[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 9472, + 9474, + 9484, + 9488, + 9492, + 9496, + 9500, + 9508, + 9516, + 9524, + 9532, + 9600, + 9604, + 9608, + 9612, + 9616, + 9617, + 9618, + 9619, + 8992, + 9632, + 8729, + 8730, + 8776, + 8804, + 8805, + 160, + 8993, + 176, + 178, + 183, + 247, + 9552, + 9553, + 9554, + 1105, + 1108, + 9556, + 1110, + 1111, + 9559, + 9560, + 9561, + 9562, + 9563, + 1169, + 9565, + 9566, + 9567, + 9568, + 9569, + 1025, + 1028, + 9571, + 1030, + 1031, + 9574, + 9575, 
+ 9576, + 9577, + 9578, + 1168, + 9580, + 169, + 1102, + 1072, + 1073, + 1094, + 1076, + 1077, + 1092, + 1075, + 1093, + 1080, + 1081, + 1082, + 1083, + 1084, + 1085, + 1086, + 1087, + 1103, + 1088, + 1089, + 1090, + 1091, + 1078, + 1074, + 1100, + 1099, + 1079, + 1096, + 1101, + 1097, + 1095, + 1098, + 1070, + 1040, + 1041, + 1062, + 1044, + 1045, + 1060, + 1043, + 1061, + 1048, + 1049, + 1050, + 1051, + 1052, + 1053, + 1054, + 1055, + 1071, + 1056, + 1057, + 1058, + 1059, + 1046, + 1042, + 1068, + 1067, + 1047, + 1064, + 1069, + 1065, + 1063, + 1066, +}; + +const char *fz_glyph_name_from_koi8u[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + 
_notdef, + "upblock", + "dnblock", + "block", + "lfblock", + "rtblock", + "ltshade", + "shade", + "dkshade", + "integraltop", + "blacksquare", + "bulletoperator", + "radical", + "approxequal", + "lessequal", + "greaterequal", + "nbspace", + "integralbottom", + "degree", + "twosuperior", + "middot", + "divide", + _notdef, + _notdef, + _notdef, + "afii10071", + "afii10101", + _notdef, + "afii10103", + "afii10104", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "afii10098", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "Iocyrillic", + "Ecyrillic", + _notdef, + "Icyrillic", + "Yicyrillic", + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "Gheupturncyrillic", + _notdef, + "copyright", + "afii10096", + "acyrillic", + "afii10066", + "afii10088", + "afii10069", + "afii10070", + "afii10086", + "afii10068", + "afii10087", + "afii10074", + "afii10075", + "afii10076", + "afii10077", + "afii10078", + "afii10079", + "afii10080", + "afii10081", + "afii10097", + "afii10082", + "afii10083", + "afii10084", + "afii10085", + "afii10072", + "afii10067", + "afii10094", + "afii10093", + "afii10073", + "afii10090", + "afii10095", + "afii10091", + "afii10089", + "afii10092", + "IUcyrillic", + "Acyrillic", + "Becyrillic", + "Tsecyrillic", + "Decyrillic", + "Iecyrillic", + "Efcyrillic", + "Gecyrillic", + "Khacyrillic", + "Iicyrillic", + "Iishortcyrillic", + "Kacyrillic", + "Elcyrillic", + "Emcyrillic", + "Encyrillic", + "Ocyrillic", + "Pecyrillic", + "IAcyrillic", + "Ercyrillic", + "Escyrillic", + "Tecyrillic", + "Ucyrillic", + "Zhecyrillic", + "Vecyrillic", + "Softsigncyrillic", + "Yericyrillic", + "Zecyrillic", + "Shacyrillic", + "Ereversedcyrillic", + "Shchacyrillic", + "Checyrillic", + "Hardsigncyrillic", +}; + +static const struct { unsigned short u, c; } koi8u_from_unicode[] = { + {0x00a0,154}, + {0x00a9,191}, + {0x00b0,156}, + {0x00b2,157}, + {0x00b7,158}, + {0x00f7,159}, + {0x0401,179}, + {0x0404,180}, + {0x0406,182}, + {0x0407,183}, + {0x0410,225}, + 
{0x0411,226}, + {0x0412,247}, + {0x0413,231}, + {0x0414,228}, + {0x0415,229}, + {0x0416,246}, + {0x0417,250}, + {0x0418,233}, + {0x0419,234}, + {0x041a,235}, + {0x041b,236}, + {0x041c,237}, + {0x041d,238}, + {0x041e,239}, + {0x041f,240}, + {0x0420,242}, + {0x0421,243}, + {0x0422,244}, + {0x0423,245}, + {0x0424,230}, + {0x0425,232}, + {0x0426,227}, + {0x0427,254}, + {0x0428,251}, + {0x0429,253}, + {0x042a,255}, + {0x042b,249}, + {0x042c,248}, + {0x042d,252}, + {0x042e,224}, + {0x042f,241}, + {0x0430,193}, + {0x0431,194}, + {0x0432,215}, + {0x0433,199}, + {0x0434,196}, + {0x0435,197}, + {0x0436,214}, + {0x0437,218}, + {0x0438,201}, + {0x0439,202}, + {0x043a,203}, + {0x043b,204}, + {0x043c,205}, + {0x043d,206}, + {0x043e,207}, + {0x043f,208}, + {0x0440,210}, + {0x0441,211}, + {0x0442,212}, + {0x0443,213}, + {0x0444,198}, + {0x0445,200}, + {0x0446,195}, + {0x0447,222}, + {0x0448,219}, + {0x0449,221}, + {0x044a,223}, + {0x044b,217}, + {0x044c,216}, + {0x044d,220}, + {0x044e,192}, + {0x044f,209}, + {0x0451,163}, + {0x0454,164}, + {0x0456,166}, + {0x0457,167}, + {0x0490,189}, + {0x0491,173}, + {0x2219,149}, + {0x221a,150}, + {0x2248,151}, + {0x2264,152}, + {0x2265,153}, + {0x2320,147}, + {0x2321,155}, + {0x2580,139}, + {0x2584,140}, + {0x2588,141}, + {0x258c,142}, + {0x2590,143}, + {0x2591,144}, + {0x2592,145}, + {0x2593,146}, + {0x25a0,148}, +}; + +unsigned short fz_unicode_from_windows_1250[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 
98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 8364, + 0, + 8218, + 0, + 8222, + 8230, + 8224, + 8225, + 0, + 8240, + 352, + 8249, + 346, + 356, + 381, + 377, + 0, + 8216, + 8217, + 8220, + 8221, + 8226, + 8211, + 8212, + 0, + 8482, + 353, + 8250, + 347, + 357, + 382, + 378, + 160, + 711, + 728, + 321, + 164, + 260, + 166, + 167, + 168, + 169, + 350, + 171, + 172, + 173, + 174, + 379, + 176, + 177, + 731, + 322, + 180, + 181, + 182, + 183, + 184, + 261, + 351, + 187, + 317, + 733, + 318, + 380, + 340, + 193, + 194, + 258, + 196, + 313, + 262, + 199, + 268, + 201, + 280, + 203, + 282, + 205, + 206, + 270, + 272, + 323, + 327, + 211, + 212, + 336, + 214, + 215, + 344, + 366, + 218, + 368, + 220, + 221, + 354, + 223, + 341, + 225, + 226, + 259, + 228, + 314, + 263, + 231, + 269, + 233, + 281, + 235, + 283, + 237, + 238, + 271, + 273, + 324, + 328, + 243, + 244, + 337, + 246, + 247, + 345, + 367, + 250, + 369, + 252, + 253, + 355, + 729, +}; + +const char *fz_glyph_name_from_windows_1250[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + 
"M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + "Euro", + _notdef, + "quotesinglbase", + _notdef, + "quotedblbase", + "ellipsis", + "dagger", + "daggerdbl", + _notdef, + "perthousand", + "Scaron", + "guilsinglleft", + "Sacute", + "Tcaron", + "Zcaron", + "Zacute", + _notdef, + "quoteleft", + "quoteright", + "quotedblleft", + "quotedblright", + "bullet", + "endash", + "emdash", + _notdef, + "trademark", + "scaron", + "guilsinglright", + "sacute", + "tcaron", + "zcaron", + "zacute", + "nbspace", + "caron", + "breve", + "Lslash", + "currency", + "Aogonek", + "brokenbar", + "section", + "dieresis", + "copyright", + "Scedilla", + "guillemotleft", + "logicalnot", + "sfthyphen", + "registered", + "Zdot", + "degree", + "plusminus", + "ogonek", + "lslash", + "acute", + "mu", + "paragraph", + "middot", + "cedilla", + "aogonek", + "scedilla", + "guillemotright", + "Lcaron", + "hungarumlaut", + "lcaron", + "zdot", + "Racute", + "Aacute", + "Acircumflex", + "Abreve", + "Adieresis", + "Lacute", + "Cacute", + "Ccedilla", + "Ccaron", + "Eacute", + "Eogonek", + "Edieresis", + "Ecaron", + "Iacute", + "Icircumflex", + "Dcaron", + "Dcroat", + "Nacute", + "Ncaron", + "Oacute", + "Ocircumflex", + "Odblacute", + "Odieresis", + "multiply", + "Rcaron", + "Uring", + "Uacute", + "Udblacute", + "Udieresis", + "Yacute", + "Tcedilla", + "germandbls", + "racute", + "aacute", + "acircumflex", + "abreve", + "adieresis", + "lacute", + "cacute", + "ccedilla", + "ccaron", + "eacute", + "eogonek", + "edieresis", + "ecaron", + "iacute", + "icircumflex", + "dcaron", + "dcroat", + "nacute", + "ncaron", + "oacute", 
+ "ocircumflex", + "odblacute", + "odieresis", + "divide", + "rcaron", + "uring", + "uacute", + "udblacute", + "udieresis", + "yacute", + "tcedilla", + "dotaccent", +}; + +static const struct { unsigned short u, c; } windows_1250_from_unicode[] = { + {0x00a0,160}, + {0x00a4,164}, + {0x00a6,166}, + {0x00a7,167}, + {0x00a8,168}, + {0x00a9,169}, + {0x00ab,171}, + {0x00ac,172}, + {0x00ad,173}, + {0x00ae,174}, + {0x00b0,176}, + {0x00b1,177}, + {0x00b4,180}, + {0x00b5,181}, + {0x00b6,182}, + {0x00b7,183}, + {0x00b8,184}, + {0x00bb,187}, + {0x00c1,193}, + {0x00c2,194}, + {0x00c4,196}, + {0x00c7,199}, + {0x00c9,201}, + {0x00cb,203}, + {0x00cd,205}, + {0x00ce,206}, + {0x00d3,211}, + {0x00d4,212}, + {0x00d6,214}, + {0x00d7,215}, + {0x00da,218}, + {0x00dc,220}, + {0x00dd,221}, + {0x00df,223}, + {0x00e1,225}, + {0x00e2,226}, + {0x00e4,228}, + {0x00e7,231}, + {0x00e9,233}, + {0x00eb,235}, + {0x00ed,237}, + {0x00ee,238}, + {0x00f3,243}, + {0x00f4,244}, + {0x00f6,246}, + {0x00f7,247}, + {0x00fa,250}, + {0x00fc,252}, + {0x00fd,253}, + {0x0102,195}, + {0x0103,227}, + {0x0104,165}, + {0x0105,185}, + {0x0106,198}, + {0x0107,230}, + {0x010c,200}, + {0x010d,232}, + {0x010e,207}, + {0x010f,239}, + {0x0110,208}, + {0x0111,240}, + {0x0118,202}, + {0x0119,234}, + {0x011a,204}, + {0x011b,236}, + {0x0139,197}, + {0x013a,229}, + {0x013d,188}, + {0x013e,190}, + {0x0141,163}, + {0x0142,179}, + {0x0143,209}, + {0x0144,241}, + {0x0147,210}, + {0x0148,242}, + {0x0150,213}, + {0x0151,245}, + {0x0154,192}, + {0x0155,224}, + {0x0158,216}, + {0x0159,248}, + {0x015a,140}, + {0x015b,156}, + {0x015e,170}, + {0x015f,186}, + {0x0160,138}, + {0x0161,154}, + {0x0162,222}, + {0x0163,254}, + {0x0164,141}, + {0x0165,157}, + {0x016e,217}, + {0x016f,249}, + {0x0170,219}, + {0x0171,251}, + {0x0179,143}, + {0x017a,159}, + {0x017b,175}, + {0x017c,191}, + {0x017d,142}, + {0x017e,158}, + {0x02c7,161}, + {0x02d8,162}, + {0x02d9,255}, + {0x02db,178}, + {0x02dd,189}, + {0x2013,150}, + {0x2014,151}, + {0x2018,145}, + 
{0x2019,146}, + {0x201a,130}, + {0x201c,147}, + {0x201d,148}, + {0x201e,132}, + {0x2020,134}, + {0x2021,135}, + {0x2022,149}, + {0x2026,133}, + {0x2030,137}, + {0x2039,139}, + {0x203a,155}, + {0x20ac,128}, + {0x2122,153}, +}; + +unsigned short fz_unicode_from_windows_1251[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 1026, + 1027, + 8218, + 1107, + 8222, + 8230, + 8224, + 8225, + 8364, + 8240, + 1033, + 8249, + 1034, + 1036, + 1035, + 1039, + 1106, + 8216, + 8217, + 8220, + 8221, + 8226, + 8211, + 8212, + 0, + 8482, + 1113, + 8250, + 1114, + 1116, + 1115, + 1119, + 160, + 1038, + 1118, + 1032, + 164, + 1168, + 166, + 167, + 1025, + 169, + 1028, + 171, + 172, + 173, + 174, + 1031, + 176, + 177, + 1030, + 1110, + 1169, + 181, + 182, + 183, + 1105, + 8470, + 1108, + 187, + 1112, + 1029, + 1109, + 1111, + 1040, + 1041, + 1042, + 1043, + 1044, + 1045, + 1046, + 1047, + 1048, + 1049, + 1050, + 1051, + 1052, + 1053, + 1054, + 1055, + 1056, + 1057, + 1058, + 1059, + 1060, + 1061, + 1062, + 1063, + 1064, + 1065, + 1066, + 1067, + 1068, + 1069, + 1070, + 1071, + 1072, + 1073, + 1074, + 1075, + 1076, + 1077, + 1078, + 1079, + 1080, + 1081, + 1082, + 1083, + 1084, + 1085, + 1086, + 1087, + 1088, + 1089, + 1090, + 1091, + 1092, + 1093, + 
1094, + 1095, + 1096, + 1097, + 1098, + 1099, + 1100, + 1101, + 1102, + 1103, +}; + +const char *fz_glyph_name_from_windows_1251[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + "Djecyrillic", + "Gjecyrillic", + "quotesinglbase", + "afii10100", + "quotedblbase", + "ellipsis", + "dagger", + "daggerdbl", + "Euro", + "perthousand", + "Ljecyrillic", + "guilsinglleft", + "Njecyrillic", + "Kjecyrillic", + "Tshecyrillic", + "Dzhecyrillic", + "afii10099", + "quoteleft", + "quoteright", + "quotedblleft", + "quotedblright", + "bullet", + "endash", + "emdash", + _notdef, + "trademark", + "afii10106", + "guilsinglright", + "afii10107", + "afii10109", + "afii10108", + "afii10193", + "nbspace", + "Ushortcyrillic", + "afii10110", + "Jecyrillic", + "currency", + 
"Gheupturncyrillic", + "brokenbar", + "section", + "Iocyrillic", + "copyright", + "Ecyrillic", + "guillemotleft", + "logicalnot", + "sfthyphen", + "registered", + "Yicyrillic", + "degree", + "plusminus", + "Icyrillic", + "afii10103", + "afii10098", + "mu", + "paragraph", + "middot", + "afii10071", + "afii61352", + "afii10101", + "guillemotright", + "afii10105", + "Dzecyrillic", + "afii10102", + "afii10104", + "Acyrillic", + "Becyrillic", + "Vecyrillic", + "Gecyrillic", + "Decyrillic", + "Iecyrillic", + "Zhecyrillic", + "Zecyrillic", + "Iicyrillic", + "Iishortcyrillic", + "Kacyrillic", + "Elcyrillic", + "Emcyrillic", + "Encyrillic", + "Ocyrillic", + "Pecyrillic", + "Ercyrillic", + "Escyrillic", + "Tecyrillic", + "Ucyrillic", + "Efcyrillic", + "Khacyrillic", + "Tsecyrillic", + "Checyrillic", + "Shacyrillic", + "Shchacyrillic", + "Hardsigncyrillic", + "Yericyrillic", + "Softsigncyrillic", + "Ereversedcyrillic", + "IUcyrillic", + "IAcyrillic", + "acyrillic", + "afii10066", + "afii10067", + "afii10068", + "afii10069", + "afii10070", + "afii10072", + "afii10073", + "afii10074", + "afii10075", + "afii10076", + "afii10077", + "afii10078", + "afii10079", + "afii10080", + "afii10081", + "afii10082", + "afii10083", + "afii10084", + "afii10085", + "afii10086", + "afii10087", + "afii10088", + "afii10089", + "afii10090", + "afii10091", + "afii10092", + "afii10093", + "afii10094", + "afii10095", + "afii10096", + "afii10097", +}; + +static const struct { unsigned short u, c; } windows_1251_from_unicode[] = { + {0x00a0,160}, + {0x00a4,164}, + {0x00a6,166}, + {0x00a7,167}, + {0x00a9,169}, + {0x00ab,171}, + {0x00ac,172}, + {0x00ad,173}, + {0x00ae,174}, + {0x00b0,176}, + {0x00b1,177}, + {0x00b5,181}, + {0x00b6,182}, + {0x00b7,183}, + {0x00bb,187}, + {0x0401,168}, + {0x0402,128}, + {0x0403,129}, + {0x0404,170}, + {0x0405,189}, + {0x0406,178}, + {0x0407,175}, + {0x0408,163}, + {0x0409,138}, + {0x040a,140}, + {0x040b,142}, + {0x040c,141}, + {0x040e,161}, + {0x040f,143}, + {0x0410,192}, + 
{0x0411,193}, + {0x0412,194}, + {0x0413,195}, + {0x0414,196}, + {0x0415,197}, + {0x0416,198}, + {0x0417,199}, + {0x0418,200}, + {0x0419,201}, + {0x041a,202}, + {0x041b,203}, + {0x041c,204}, + {0x041d,205}, + {0x041e,206}, + {0x041f,207}, + {0x0420,208}, + {0x0421,209}, + {0x0422,210}, + {0x0423,211}, + {0x0424,212}, + {0x0425,213}, + {0x0426,214}, + {0x0427,215}, + {0x0428,216}, + {0x0429,217}, + {0x042a,218}, + {0x042b,219}, + {0x042c,220}, + {0x042d,221}, + {0x042e,222}, + {0x042f,223}, + {0x0430,224}, + {0x0431,225}, + {0x0432,226}, + {0x0433,227}, + {0x0434,228}, + {0x0435,229}, + {0x0436,230}, + {0x0437,231}, + {0x0438,232}, + {0x0439,233}, + {0x043a,234}, + {0x043b,235}, + {0x043c,236}, + {0x043d,237}, + {0x043e,238}, + {0x043f,239}, + {0x0440,240}, + {0x0441,241}, + {0x0442,242}, + {0x0443,243}, + {0x0444,244}, + {0x0445,245}, + {0x0446,246}, + {0x0447,247}, + {0x0448,248}, + {0x0449,249}, + {0x044a,250}, + {0x044b,251}, + {0x044c,252}, + {0x044d,253}, + {0x044e,254}, + {0x044f,255}, + {0x0451,184}, + {0x0452,144}, + {0x0453,131}, + {0x0454,186}, + {0x0455,190}, + {0x0456,179}, + {0x0457,191}, + {0x0458,188}, + {0x0459,154}, + {0x045a,156}, + {0x045b,158}, + {0x045c,157}, + {0x045e,162}, + {0x045f,159}, + {0x0490,165}, + {0x0491,180}, + {0x2013,150}, + {0x2014,151}, + {0x2018,145}, + {0x2019,146}, + {0x201a,130}, + {0x201c,147}, + {0x201d,148}, + {0x201e,132}, + {0x2020,134}, + {0x2021,135}, + {0x2022,149}, + {0x2026,133}, + {0x2030,137}, + {0x2039,139}, + {0x203a,155}, + {0x20ac,136}, + {0x2116,185}, + {0x2122,153}, +}; + +unsigned short fz_unicode_from_windows_1252[256] = { + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 
66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 123, + 124, + 125, + 126, + 127, + 8364, + 0, + 8218, + 402, + 8222, + 8230, + 8224, + 8225, + 710, + 8240, + 352, + 8249, + 338, + 0, + 381, + 0, + 0, + 8216, + 8217, + 8220, + 8221, + 8226, + 8211, + 8212, + 732, + 8482, + 353, + 8250, + 339, + 0, + 382, + 376, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 195, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 242, + 243, + 244, + 245, + 246, + 247, + 248, + 249, + 250, + 251, + 252, + 253, + 254, + 255, +}; + +const char *fz_glyph_name_from_windows_1252[256] = { + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + _notdef, + "space", + "exclam", + "quotedbl", + "numbersign", + "dollar", + "percent", + "ampersand", + "quotesingle", + "parenleft", + "parenright", + "asterisk", + "plus", + "comma", + "hyphen", + "period", + "slash", + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + 
"seven", + "eight", + "nine", + "colon", + "semicolon", + "less", + "equal", + "greater", + "question", + "at", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + "bracketleft", + "backslash", + "bracketright", + "asciicircum", + "underscore", + "grave", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "braceleft", + "bar", + "braceright", + "asciitilde", + "controlDEL", + "Euro", + _notdef, + "quotesinglbase", + "florin", + "quotedblbase", + "ellipsis", + "dagger", + "daggerdbl", + "circumflex", + "perthousand", + "Scaron", + "guilsinglleft", + "OE", + _notdef, + "Zcaron", + _notdef, + _notdef, + "quoteleft", + "quoteright", + "quotedblleft", + "quotedblright", + "bullet", + "endash", + "emdash", + "tilde", /* 0x98, U+02DC */ + "trademark", + "scaron", + "guilsinglright", + "oe", + _notdef, + "zcaron", + "Ydieresis", + "nbspace", + "exclamdown", + "cent", + "sterling", + "currency", + "yen", + "brokenbar", + "section", + "dieresis", + "copyright", + "ordfeminine", + "guillemotleft", + "logicalnot", + "sfthyphen", + "registered", + "macron", + "degree", + "plusminus", + "twosuperior", + "threesuperior", + "acute", + "mu", + "paragraph", + "middot", + "cedilla", + "onesuperior", + "ordmasculine", + "guillemotright", + "onequarter", + "onehalf", + "threequarters", + "questiondown", + "Agrave", + "Aacute", + "Acircumflex", + "Atilde", + "Adieresis", + "Aring", + "AE", + "Ccedilla", + "Egrave", + "Eacute", + "Ecircumflex", + "Edieresis", + "Igrave", + "Iacute", + "Icircumflex", + "Idieresis", + "Eth", + "Ntilde", + "Ograve", + "Oacute", + "Ocircumflex", + "Otilde", + "Odieresis", + "multiply", + "Oslash", + "Ugrave", + "Uacute", + "Ucircumflex", + "Udieresis", + "Yacute", + "Thorn", + "germandbls", + "agrave", + "aacute", +
"acircumflex", + "atilde", + "adieresis", + "aring", + "ae", + "ccedilla", + "egrave", + "eacute", + "ecircumflex", + "edieresis", + "igrave", + "iacute", + "icircumflex", + "idieresis", + "eth", + "ntilde", + "ograve", + "oacute", + "ocircumflex", + "otilde", + "odieresis", + "divide", + "oslash", + "ugrave", + "uacute", + "ucircumflex", + "udieresis", + "yacute", + "thorn", + "ydieresis", +}; + +static const struct { unsigned short u, c; } windows_1252_from_unicode[] = { + {0x00a0,160}, + {0x00a1,161}, + {0x00a2,162}, + {0x00a3,163}, + {0x00a4,164}, + {0x00a5,165}, + {0x00a6,166}, + {0x00a7,167}, + {0x00a8,168}, + {0x00a9,169}, + {0x00aa,170}, + {0x00ab,171}, + {0x00ac,172}, + {0x00ad,173}, + {0x00ae,174}, + {0x00af,175}, + {0x00b0,176}, + {0x00b1,177}, + {0x00b2,178}, + {0x00b3,179}, + {0x00b4,180}, + {0x00b5,181}, + {0x00b6,182}, + {0x00b7,183}, + {0x00b8,184}, + {0x00b9,185}, + {0x00ba,186}, + {0x00bb,187}, + {0x00bc,188}, + {0x00bd,189}, + {0x00be,190}, + {0x00bf,191}, + {0x00c0,192}, + {0x00c1,193}, + {0x00c2,194}, + {0x00c3,195}, + {0x00c4,196}, + {0x00c5,197}, + {0x00c6,198}, + {0x00c7,199}, + {0x00c8,200}, + {0x00c9,201}, + {0x00ca,202}, + {0x00cb,203}, + {0x00cc,204}, + {0x00cd,205}, + {0x00ce,206}, + {0x00cf,207}, + {0x00d0,208}, + {0x00d1,209}, + {0x00d2,210}, + {0x00d3,211}, + {0x00d4,212}, + {0x00d5,213}, + {0x00d6,214}, + {0x00d7,215}, + {0x00d8,216}, + {0x00d9,217}, + {0x00da,218}, + {0x00db,219}, + {0x00dc,220}, + {0x00dd,221}, + {0x00de,222}, + {0x00df,223}, + {0x00e0,224}, + {0x00e1,225}, + {0x00e2,226}, + {0x00e3,227}, + {0x00e4,228}, + {0x00e5,229}, + {0x00e6,230}, + {0x00e7,231}, + {0x00e8,232}, + {0x00e9,233}, + {0x00ea,234}, + {0x00eb,235}, + {0x00ec,236}, + {0x00ed,237}, + {0x00ee,238}, + {0x00ef,239}, + {0x00f0,240}, + {0x00f1,241}, + {0x00f2,242}, + {0x00f3,243}, + {0x00f4,244}, + {0x00f5,245}, + {0x00f6,246}, + {0x00f7,247}, + {0x00f8,248}, + {0x00f9,249}, + {0x00fa,250}, + {0x00fb,251}, + {0x00fc,252}, + {0x00fd,253}, + {0x00fe,254}, + 
{0x00ff,255}, + {0x0152,140}, + {0x0153,156}, + {0x0160,138}, + {0x0161,154}, + {0x0178,159}, + {0x017d,142}, + {0x017e,158}, + {0x0192,131}, + {0x02c6,136}, + {0x02dc,152}, + {0x2013,150}, + {0x2014,151}, + {0x2018,145}, + {0x2019,146}, + {0x201a,130}, + {0x201c,147}, + {0x201d,148}, + {0x201e,132}, + {0x2020,134}, + {0x2021,135}, + {0x2022,149}, + {0x2026,133}, + {0x2030,137}, + {0x2039,139}, + {0x203a,155}, + {0x20ac,128}, + {0x2122,153}, +}; diff --git a/source/pdf/pdf-appearance.c b/source/pdf/pdf-appearance.c index c18d5860..a3067e16 100644 --- a/source/pdf/pdf-appearance.c +++ b/source/pdf/pdf-appearance.c @@ -673,7 +673,7 @@ measure_simple_string(fz_context *ctx, fz_font *font, const char *text) { int c, g; text += fz_chartorune(&c, text); - c = pdf_winansi_from_unicode(c); + c = fz_windows_1252_from_unicode(c); if (c < 0) c = REPLACEMENT; g = fz_encode_character(ctx, font, c); w += fz_advance_glyph(ctx, font, g, 0); @@ -689,7 +689,7 @@ write_simple_string(fz_context *ctx, fz_buffer *buf, const char *a, const char * { int c; a += fz_chartorune(&c, a); - c = pdf_winansi_from_unicode(c); + c = fz_windows_1252_from_unicode(c); if (c < 0) c = REPLACEMENT; if (c == '(' || c == ')' || c == '\\') fz_append_byte(ctx, buf, '\\'); @@ -871,7 +871,7 @@ write_comb_string(fz_context *ctx, fz_buffer *buf, const char *a, const char *b, int c, g; a += fz_chartorune(&c, a); - c = pdf_winansi_from_unicode(c); + c = fz_windows_1252_from_unicode(c); if (c < 0) c = REPLACEMENT; g = fz_encode_character(ctx, font, c); diff --git a/source/pdf/pdf-encoding.c b/source/pdf/pdf-encoding.c index f4fe584c..bf0d3df8 100644 --- a/source/pdf/pdf-encoding.c +++ b/source/pdf/pdf-encoding.c @@ -7,6 +7,8 @@ #include #include +#define pdf_win_ansi fz_glyph_name_from_windows_1252 + void pdf_load_encoding(const char **estrings, const char *encoding) { @@ -85,60 +87,3 @@ pdf_lookup_agl_duplicates(int ucs) } return empty_dup_list; } - -int pdf_cyrillic_from_unicode(int u) -{ - int l = 0; - int r = 
nelem(koi8u_from_unicode) - 1; - if (u < 128) - return u; - while (l <= r) - { - int m = (l + r) >> 1; - if (u < koi8u_from_unicode[m].u) - r = m - 1; - else if (u > koi8u_from_unicode[m].u) - l = m + 1; - else - return koi8u_from_unicode[m].c; - } - return -1; -} - -int pdf_greek_from_unicode(int u) -{ - int l = 0; - int r = nelem(iso8859_7_from_unicode) - 1; - if (u < 128) - return u; - while (l <= r) - { - int m = (l + r) >> 1; - if (u < iso8859_7_from_unicode[m].u) - r = m - 1; - else if (u > iso8859_7_from_unicode[m].u) - l = m + 1; - else - return iso8859_7_from_unicode[m].c; - } - return -1; -} - -int pdf_winansi_from_unicode(int u) -{ - int l = 0; - int r = nelem(winansi_from_unicode) - 1; - if (u < 128) - return u; - while (l <= r) - { - int m = (l + r) >> 1; - if (u < winansi_from_unicode[m].u) - r = m - 1; - else if (u > winansi_from_unicode[m].u) - l = m + 1; - else - return winansi_from_unicode[m].c; - } - return -1; -} diff --git a/source/pdf/pdf-encodings.h b/source/pdf/pdf-encodings.h index f9e84c49..efdfe1f7 100644 --- a/source/pdf/pdf-encodings.h +++ b/source/pdf/pdf-encodings.h @@ -166,206 +166,3 @@ const char *pdf_mac_expert[256] = { "periodsuperior", "Dotaccentsmall", "Ringsmall", _notdef, _notdef, _notdef, _notdef }; - -const char *pdf_win_ansi[256] = { - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", - "ampersand", "quotesingle", "parenleft", "parenright", "asterisk", - "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", - "three", "four", "five", "six", "seven", "eight", "nine", "colon", - "semicolon", "less", "equal", "greater", "question", "at", "A", "B", - "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 
"N", "O", "P", - "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", - "backslash", "bracketright", "asciicircum", "underscore", "grave", "a", - "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", - "bar", "braceright", "asciitilde", "bullet", "Euro", "bullet", - "quotesinglbase", "florin", "quotedblbase", "ellipsis", "dagger", - "daggerdbl", "circumflex", "perthousand", "Scaron", "guilsinglleft", - "OE", "bullet", "Zcaron", "bullet", "bullet", "quoteleft", - "quoteright", "quotedblleft", "quotedblright", "bullet", "endash", - "emdash", "tilde", "trademark", "scaron", "guilsinglright", "oe", - "bullet", "zcaron", "Ydieresis", "space", "exclamdown", "cent", - "sterling", "currency", "yen", "brokenbar", "section", "dieresis", - "copyright", "ordfeminine", "guillemotleft", "logicalnot", "hyphen", - "registered", "macron", "degree", "plusminus", "twosuperior", - "threesuperior", "acute", "mu", "paragraph", "periodcentered", - "cedilla", "onesuperior", "ordmasculine", "guillemotright", - "onequarter", "onehalf", "threequarters", "questiondown", "Agrave", - "Aacute", "Acircumflex", "Atilde", "Adieresis", "Aring", "AE", - "Ccedilla", "Egrave", "Eacute", "Ecircumflex", "Edieresis", "Igrave", - "Iacute", "Icircumflex", "Idieresis", "Eth", "Ntilde", "Ograve", - "Oacute", "Ocircumflex", "Otilde", "Odieresis", "multiply", "Oslash", - "Ugrave", "Uacute", "Ucircumflex", "Udieresis", "Yacute", "Thorn", - "germandbls", "agrave", "aacute", "acircumflex", "atilde", "adieresis", - "aring", "ae", "ccedilla", "egrave", "eacute", "ecircumflex", - "edieresis", "igrave", "iacute", "icircumflex", "idieresis", "eth", - "ntilde", "ograve", "oacute", "ocircumflex", "otilde", "odieresis", - "divide", "oslash", "ugrave", "uacute", "ucircumflex", "udieresis", - "yacute", "thorn", "ydieresis" -}; - -const char *pdf_glyph_name_from_koi8u[256] = { - _notdef, _notdef, _notdef, _notdef, _notdef, 
_notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", - "ampersand", "quotesingle", "parenleft", "parenright", "asterisk", - "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", - "three", "four", "five", "six", "seven", "eight", "nine", "colon", - "semicolon", "less", "equal", "greater", "question", "at", "A", "B", - "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", - "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", - "backslash", "bracketright", "asciicircum", "underscore", "grave", "a", - "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", - "bar", "braceright", "asciitilde", _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, "integraltp", _notdef, "bulletoperator", - "radical", "approxequal", "lessequal", "greaterequal", - "nonbreakingspace", "integralbt", "degree", "twosuperior", - "periodcentered", "divide", _notdef, _notdef, _notdef, "iocyrillic", - "ecyrillic", _notdef, "icyrillic", "yicyrillic", _notdef, _notdef, - _notdef, _notdef, _notdef, "gheupturncyrillic", _notdef, _notdef, - _notdef, _notdef, _notdef, "afii10023", "afii10053", _notdef, - "afii10055", "afii10056", _notdef, _notdef, _notdef, _notdef, _notdef, - "afii10050", _notdef, "copyright", "iucyrillic", "afii10065", - "becyrillic", "tsecyrillic", "decyrillic", "iecyrillic", "efcyrillic", - "gecyrillic", "khacyrillic", "iicyrillic", "iishortcyrillic", - "kacyrillic", "elcyrillic", "emcyrillic", "encyrillic", "ocyrillic", - "pecyrillic", "iacyrillic", "ercyrillic", 
"escyrillic", "tecyrillic", - "ucyrillic", "zhecyrillic", "vecyrillic", "softsigncyrillic", - "yericyrillic", "zecyrillic", "shacyrillic", "ereversedcyrillic", - "shchacyrillic", "checyrillic", "hardsigncyrillic", "afii10048", - "afii10017", "afii10018", "afii10040", "afii10021", "afii10022", - "afii10038", "afii10020", "afii10039", "afii10026", "afii10027", - "afii10028", "afii10029", "afii10030", "afii10031", "afii10032", - "afii10033", "afii10049", "afii10034", "afii10035", "afii10036", - "afii10037", "afii10024", "afii10019", "afii10046", "afii10045", - "afii10025", "afii10042", "afii10047", "afii10043", "afii10041", - "afii10044", -}; - -static const struct { unsigned short u, c; } koi8u_from_unicode[] = { - {0x00a0,154}, {0x00a9,191}, {0x00b0,156}, {0x00b2,157}, {0x00b7,158}, - {0x00f7,159}, {0x0401,179}, {0x0404,180}, {0x0406,182}, {0x0407,183}, - {0x0410,225}, {0x0411,226}, {0x0412,247}, {0x0413,231}, {0x0414,228}, - {0x0415,229}, {0x0416,246}, {0x0417,250}, {0x0418,233}, {0x0419,234}, - {0x041a,235}, {0x041b,236}, {0x041c,237}, {0x041d,238}, {0x041e,239}, - {0x041f,240}, {0x0420,242}, {0x0421,243}, {0x0422,244}, {0x0423,245}, - {0x0424,230}, {0x0425,232}, {0x0426,227}, {0x0427,254}, {0x0428,251}, - {0x0429,253}, {0x042a,255}, {0x042b,249}, {0x042c,248}, {0x042d,252}, - {0x042e,224}, {0x042f,241}, {0x0430,193}, {0x0431,194}, {0x0432,215}, - {0x0433,199}, {0x0434,196}, {0x0435,197}, {0x0436,214}, {0x0437,218}, - {0x0438,201}, {0x0439,202}, {0x043a,203}, {0x043b,204}, {0x043c,205}, - {0x043d,206}, {0x043e,207}, {0x043f,208}, {0x0440,210}, {0x0441,211}, - {0x0442,212}, {0x0443,213}, {0x0444,198}, {0x0445,200}, {0x0446,195}, - {0x0447,222}, {0x0448,219}, {0x0449,221}, {0x044a,223}, {0x044b,217}, - {0x044c,216}, {0x044d,220}, {0x044e,192}, {0x044f,209}, {0x0451,163}, - {0x0454,164}, {0x0456,166}, {0x0457,167}, {0x0490,189}, {0x0491,173}, - {0x2219,149}, {0x221a,150}, {0x2248,151}, {0x2264,152}, {0x2265,153}, - {0x2320,147}, {0x2321,155}, {0x2500,128}, 
{0x2502,129}, {0x250c,130}, - {0x2510,131}, {0x2514,132}, {0x2518,133}, {0x251c,134}, {0x2524,135}, - {0x252c,136}, {0x2534,137}, {0x253c,138}, {0x2550,160}, {0x2551,161}, - {0x2552,162}, {0x2554,165}, {0x2557,168}, {0x2558,169}, {0x2559,170}, - {0x255a,171}, {0x255b,172}, {0x255d,174}, {0x255e,175}, {0x255f,176}, - {0x2560,177}, {0x2561,178}, {0x2563,181}, {0x2566,184}, {0x2567,185}, - {0x2568,186}, {0x2569,187}, {0x256a,188}, {0x256c,190}, {0x2580,139}, - {0x2584,140}, {0x2588,141}, {0x258c,142}, {0x2590,143}, {0x2591,144}, - {0x2592,145}, {0x2593,146}, {0x25a0,148} -}; - -const char *pdf_glyph_name_from_iso8859_7[256] = { - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - "space", "exclam", "quotedbl", "numbersign", "dollar", "percent", - "ampersand", "quotesingle", "parenleft", "parenright", "asterisk", - "plus", "comma", "hyphen", "period", "slash", "zero", "one", "two", - "three", "four", "five", "six", "seven", "eight", "nine", "colon", - "semicolon", "less", "equal", "greater", "question", "at", "A", "B", - "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", - "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "bracketleft", - "backslash", "bracketright", "asciicircum", "underscore", "grave", "a", - "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "braceleft", - "bar", "braceright", "asciitilde", _notdef, - /* the block drawing characters have been omitted */ - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, _notdef, - _notdef, _notdef, _notdef, _notdef, 
_notdef, _notdef, _notdef, _notdef, - "nonbreakingspace", "quoteleft", "quoteright", "sterling", "euro", - _notdef, "brokenbar", "section", "dieresis", "copyright", - "ypogegrammeni", "guillemotleft", "logicalnot", "softhyphen", _notdef, - "horizontalbar", "degree", "plusminus", "twosuperior", "threesuperior", - "tonos", "dieresistonos", "Alphatonos", "periodcentered", - "Epsilontonos", "Etatonos", "Iotatonos", "guillemotright", - "Omicrontonos", "onehalf", "Upsilontonos", "Omegatonos", - "iotadieresistonos", "Alpha", "Beta", "Gamma", "Deltagreek", "Epsilon", - "Zeta", "Eta", "Theta", "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", - "Omicron", "Pi", "Rho", _notdef, "Sigma", "Tau", "Upsilon", "Phi", - "Chi", "Psi", "Omegagreek", "Iotadieresis", "Upsilondieresis", - "alphatonos", "epsilontonos", "etatonos", "iotatonos", - "upsilondieresistonos", "alpha", "beta", "gamma", "delta", "epsilon", - "zeta", "eta", "theta", "iota", "kappa", "lambda", "mugreek", "nu", - "xi", "omicron", "pi", "rho", "sigmafinal", "sigma", "tau", "upsilon", - "phi", "chi", "psi", "omega", "iotadieresis", "upsilondieresis", - "omicrontonos", "upsilontonos", "omegatonos", _notdef, -}; - -static const struct { unsigned short u, c; } iso8859_7_from_unicode[] = { - {0x00a0,160}, {0x00a3,163}, {0x00a6,166}, {0x00a7,167}, {0x00a8,168}, - {0x00a9,169}, {0x00ab,171}, {0x00ac,172}, {0x00ad,173}, {0x00b0,176}, - {0x00b1,177}, {0x00b2,178}, {0x00b3,179}, {0x00b7,183}, {0x00bb,187}, - {0x00bd,189}, {0x037a,170}, {0x0384,180}, {0x0385,181}, {0x0386,182}, - {0x0388,184}, {0x0389,185}, {0x038a,186}, {0x038c,188}, {0x038e,190}, - {0x038f,191}, {0x0390,192}, {0x0391,193}, {0x0392,194}, {0x0393,195}, - {0x0394,196}, {0x0395,197}, {0x0396,198}, {0x0397,199}, {0x0398,200}, - {0x0399,201}, {0x039a,202}, {0x039b,203}, {0x039c,204}, {0x039d,205}, - {0x039e,206}, {0x039f,207}, {0x03a0,208}, {0x03a1,209}, {0x03a3,211}, - {0x03a4,212}, {0x03a5,213}, {0x03a6,214}, {0x03a7,215}, {0x03a8,216}, - {0x03a9,217}, {0x03aa,218}, 
{0x03ab,219}, {0x03ac,220}, {0x03ad,221}, - {0x03ae,222}, {0x03af,223}, {0x03b0,224}, {0x03b1,225}, {0x03b2,226}, - {0x03b3,227}, {0x03b4,228}, {0x03b5,229}, {0x03b6,230}, {0x03b7,231}, - {0x03b8,232}, {0x03b9,233}, {0x03ba,234}, {0x03bb,235}, {0x03bc,236}, - {0x03bd,237}, {0x03be,238}, {0x03bf,239}, {0x03c0,240}, {0x03c1,241}, - {0x03c2,242}, {0x03c3,243}, {0x03c4,244}, {0x03c5,245}, {0x03c6,246}, - {0x03c7,247}, {0x03c8,248}, {0x03c9,249}, {0x03ca,250}, {0x03cb,251}, - {0x03cc,252}, {0x03cd,253}, {0x03ce,254}, {0x2015,175}, {0x2018,161}, - {0x2019,162}, {0x20ac,164}, -}; - -static const struct { unsigned short u, c; } winansi_from_unicode[] = { - {0x00a0,160}, {0x00a1,161}, {0x00a2,162}, {0x00a3,163}, {0x00a4,164}, - {0x00a5,165}, {0x00a6,166}, {0x00a7,167}, {0x00a8,168}, {0x00a9,169}, - {0x00aa,170}, {0x00ab,171}, {0x00ac,172}, {0x00ad,173}, {0x00ae,174}, - {0x00af,175}, {0x00b0,176}, {0x00b1,177}, {0x00b2,178}, {0x00b3,179}, - {0x00b4,180}, {0x00b5,181}, {0x00b6,182}, {0x00b7,183}, {0x00b8,184}, - {0x00b9,185}, {0x00ba,186}, {0x00bb,187}, {0x00bc,188}, {0x00bd,189}, - {0x00be,190}, {0x00bf,191}, {0x00c0,192}, {0x00c1,193}, {0x00c2,194}, - {0x00c3,195}, {0x00c4,196}, {0x00c5,197}, {0x00c6,198}, {0x00c7,199}, - {0x00c8,200}, {0x00c9,201}, {0x00ca,202}, {0x00cb,203}, {0x00cc,204}, - {0x00cd,205}, {0x00ce,206}, {0x00cf,207}, {0x00d0,208}, {0x00d1,209}, - {0x00d2,210}, {0x00d3,211}, {0x00d4,212}, {0x00d5,213}, {0x00d6,214}, - {0x00d7,215}, {0x00d8,216}, {0x00d9,217}, {0x00da,218}, {0x00db,219}, - {0x00dc,220}, {0x00dd,221}, {0x00de,222}, {0x00df,223}, {0x00e0,224}, - {0x00e1,225}, {0x00e2,226}, {0x00e3,227}, {0x00e4,228}, {0x00e5,229}, - {0x00e6,230}, {0x00e7,231}, {0x00e8,232}, {0x00e9,233}, {0x00ea,234}, - {0x00eb,235}, {0x00ec,236}, {0x00ed,237}, {0x00ee,238}, {0x00ef,239}, - {0x00f0,240}, {0x00f1,241}, {0x00f2,242}, {0x00f3,243}, {0x00f4,244}, - {0x00f5,245}, {0x00f6,246}, {0x00f7,247}, {0x00f8,248}, {0x00f9,249}, - {0x00fa,250}, {0x00fb,251}, {0x00fc,252}, 
{0x00fd,253}, {0x00fe,254}, - {0x00ff,255}, {0x0152,140}, {0x0153,156}, {0x0160,138}, {0x0161,154}, - {0x0178,159}, {0x017d,142}, {0x017e,158}, {0x0192,131}, {0x02c6,136}, - {0x02dc,152}, {0x2013,150}, {0x2014,151}, {0x2018,145}, {0x2019,146}, - {0x201a,130}, {0x201c,147}, {0x201d,148}, {0x201e,132}, {0x2020,134}, - {0x2021,135}, {0x2022,149}, {0x2026,133}, {0x2030,137}, {0x2039,139}, - {0x203a,155}, {0x20ac,128}, {0x2122,153}, -}; diff --git a/source/pdf/pdf-font.c b/source/pdf/pdf-font.c index a0df70fd..391a1024 100644 --- a/source/pdf/pdf-font.c +++ b/source/pdf/pdf-font.c @@ -20,6 +20,8 @@ #define FT_SFNT_HEAD ft_sfnt_head #endif +#define pdf_win_ansi fz_glyph_name_from_windows_1252 + static void pdf_load_font_descriptor(fz_context *ctx, pdf_document *doc, pdf_font_desc *fontdesc, pdf_obj *dict, const char *collection, const char *basefont, int iscidfont); @@ -2037,10 +2039,10 @@ pdf_add_simple_font_encoding(fz_context *ctx, pdf_document *doc, pdf_obj *fobj, pdf_dict_put(ctx, fobj, PDF_NAME(Encoding), PDF_NAME(WinAnsiEncoding)); break; case PDF_SIMPLE_ENCODING_GREEK: - pdf_add_simple_font_encoding_imp(ctx, doc, fobj, pdf_glyph_name_from_iso8859_7); + pdf_add_simple_font_encoding_imp(ctx, doc, fobj, fz_glyph_name_from_iso8859_7); break; case PDF_SIMPLE_ENCODING_CYRILLIC: - pdf_add_simple_font_encoding_imp(ctx, doc, fobj, pdf_glyph_name_from_koi8u); + pdf_add_simple_font_encoding_imp(ctx, doc, fobj, fz_glyph_name_from_koi8u); break; } } @@ -2061,9 +2063,9 @@ pdf_add_simple_font(fz_context *ctx, pdf_document *doc, fz_font *font, int encod switch (encoding) { default: enc = pdf_win_ansi; break; - case PDF_SIMPLE_ENCODING_LATIN: enc = pdf_win_ansi; break; - case PDF_SIMPLE_ENCODING_GREEK: enc = pdf_glyph_name_from_iso8859_7; break; - case PDF_SIMPLE_ENCODING_CYRILLIC: enc = pdf_glyph_name_from_koi8u; break; + case PDF_SIMPLE_ENCODING_LATIN: enc = fz_glyph_name_from_windows_1252; break; + case PDF_SIMPLE_ENCODING_GREEK: enc = fz_glyph_name_from_iso8859_7; break; + 
case PDF_SIMPLE_ENCODING_CYRILLIC: enc = fz_glyph_name_from_koi8u; break; } fobj = pdf_add_new_dict(ctx, doc, 10); -- cgit v1.2.3