From 54b9166366085b30b7ee3094c2b71cd36e377153 Mon Sep 17 00:00:00 2001 From: Nicolas Pena Date: Fri, 5 May 2017 16:49:30 -0400 Subject: Encode unicodes in UTF-16BE in ToUnicode map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: pdfium:667 Change-Id: I811571c334ff28162905a65781ca14f03caf2966 Reviewed-on: https://pdfium-review.googlesource.com/4910 Commit-Queue: Nicolás Peña Reviewed-by: Tom Sepez Reviewed-by: Lei Zhang --- core/fxcrt/fx_extension.cpp | 25 +++++++++++++++ core/fxcrt/fx_extension.h | 6 ++++ core/fxcrt/fx_extension_unittest.cpp | 50 ++++++++++++++++++++++++++++++ fpdfsdk/fpdfedittext.cpp | 59 +++++++++++++++++++----------------- 4 files changed, 113 insertions(+), 27 deletions(-) diff --git a/core/fxcrt/fx_extension.cpp b/core/fxcrt/fx_extension.cpp index 209584b68d..2b290ed79d 100644 --- a/core/fxcrt/fx_extension.cpp +++ b/core/fxcrt/fx_extension.cpp @@ -137,6 +137,31 @@ uint32_t FX_HashCode_GetW(const CFX_WideStringC& str, bool bIgnoreCase) { return dwHashCode; } +void FXSYS_IntToTwoHexChars(uint8_t n, char* buf) { + static const char kHex[] = "0123456789ABCDEF"; + buf[0] = kHex[n / 16]; + buf[1] = kHex[n % 16]; +} + +void FXSYS_IntToFourHexChars(uint16_t n, char* buf) { + FXSYS_IntToTwoHexChars(n / 256, buf); + FXSYS_IntToTwoHexChars(n % 256, buf + 2); +} + +size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf) { + ASSERT(unicode <= 0xD7FF || (unicode > 0xDFFF && unicode <= 0x10FFFF)); + if (unicode <= 0xFFFF) { + FXSYS_IntToFourHexChars(unicode, buf); + return 4; + } + unicode -= 0x010000; + // High ten bits plus 0xD800 + FXSYS_IntToFourHexChars(0xD800 + unicode / 0x400, buf); + // Low ten bits plus 0xDC00 + FXSYS_IntToFourHexChars(0xDC00 + unicode % 0x400, buf + 4); + return 8; +} + void* FX_Random_MT_Start(uint32_t dwSeed) { FX_MTRANDOMCONTEXT* pContext = FX_Alloc(FX_MTRANDOMCONTEXT, 1); pContext->mt[0] = dwSeed; diff --git a/core/fxcrt/fx_extension.h b/core/fxcrt/fx_extension.h index f55153c0ad..255ee2e3df 100644 --- a/core/fxcrt/fx_extension.h +++ b/core/fxcrt/fx_extension.h @@ -76,6 +76,12 @@ inline int FXSYS_DecimalCharToInt(const wchar_t c) { return std::iswdigit(c) ? c - L'0' : 0; } +void FXSYS_IntToTwoHexChars(uint8_t c, char* buf); + +void FXSYS_IntToFourHexChars(uint16_t c, char* buf); + +size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf); + float FXSYS_FractionalScale(size_t scale_factor, int value); int FXSYS_FractionalScaleCount(); diff --git a/core/fxcrt/fx_extension_unittest.cpp b/core/fxcrt/fx_extension_unittest.cpp index 1bc3ec6298..38b66ba2d2 100644 --- a/core/fxcrt/fx_extension_unittest.cpp +++ b/core/fxcrt/fx_extension_unittest.cpp @@ -39,3 +39,53 @@ TEST(fxcrt, FX_HashCode_Wide) { EXPECT_EQ(97u, FX_HashCode_GetW(L"A", true)); EXPECT_EQ(1313 * 65u + 66u, FX_HashCode_GetW(L"AB", false)); } + +TEST(fxcrt, FXSYS_IntToTwoHexChars) { + char buf[3] = {0}; + FXSYS_IntToTwoHexChars(0x0, buf); + EXPECT_STREQ("00", buf); + FXSYS_IntToTwoHexChars(0x9, buf); + EXPECT_STREQ("09", buf); + FXSYS_IntToTwoHexChars(0xA, buf); + EXPECT_STREQ("0A", buf); + FXSYS_IntToTwoHexChars(0x8C, buf); + EXPECT_STREQ("8C", buf); + FXSYS_IntToTwoHexChars(0xBE, buf); + EXPECT_STREQ("BE", buf); + FXSYS_IntToTwoHexChars(0xD0, buf); + EXPECT_STREQ("D0", buf); + FXSYS_IntToTwoHexChars(0xFF, buf); + EXPECT_STREQ("FF", buf); +} + +TEST(fxcrt, FXSYS_IntToFourHexChars) { + char buf[5] = {0}; + FXSYS_IntToFourHexChars(0x0, buf); + EXPECT_STREQ("0000", buf); + FXSYS_IntToFourHexChars(0xA23, buf); + EXPECT_STREQ("0A23", buf); + FXSYS_IntToFourHexChars(0xB701, buf); + EXPECT_STREQ("B701", buf); + FXSYS_IntToFourHexChars(0xFFFF, buf); + EXPECT_STREQ("FFFF", buf); +} + +TEST(fxcrt, FXSYS_ToUTF16BE) { + char buf[9] = {0}; + // Test U+0000 to U+D7FF and U+E000 to U+FFFF + EXPECT_EQ(4U, FXSYS_ToUTF16BE(0x0, buf)); + EXPECT_STREQ("0000", buf); + EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xD7FF, buf)); + EXPECT_STREQ("D7FF", buf); + EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xE000, buf)); + EXPECT_STREQ("E000", buf); + EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xFFFF, buf)); + EXPECT_STREQ("FFFF", buf); + // Test U+10000 to U+10FFFF + EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x10000, buf)); + EXPECT_STREQ("D800DC00", buf); + EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x10FFFF, buf)); + EXPECT_STREQ("DBFFDFFF", buf); + EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x2003E, buf)); + EXPECT_STREQ("D840DC3E", buf); +} diff --git a/fpdfsdk/fpdfedittext.cpp b/fpdfsdk/fpdfedittext.cpp index 9b01775235..54388ef701 100644 --- a/fpdfsdk/fpdfedittext.cpp +++ b/fpdfsdk/fpdfedittext.cpp @@ -19,6 +19,7 @@ #include "core/fpdfapi/parser/cpdf_number.h" #include "core/fpdfapi/parser/cpdf_reference.h" #include "core/fpdfapi/parser/cpdf_stream.h" +#include "core/fxcrt/fx_extension.h" #include "core/fxge/cfx_fontmgr.h" #include "core/fxge/fx_font.h" #include "fpdfsdk/fsdk_define.h" @@ -90,20 +91,27 @@ const char ToUnicodeStart[] = "1 begincodespacerange\n" "<0000> \n"; -const char hex[] = "0123456789ABCDEF"; - -void AddNum(CFX_ByteTextBuf* pBuffer, uint32_t number) { +void AddCharcode(CFX_ByteTextBuf* pBuffer, uint32_t number) { + ASSERT(number <= 0xFFFF); *pBuffer << "<"; char ans[4]; - for (size_t i = 0; i < 4; ++i) { - ans[3 - i] = hex[number % 16]; - number /= 16; - } + FXSYS_IntToFourHexChars(number, ans); for (size_t i = 0; i < 4; ++i) pBuffer->AppendChar(ans[i]); *pBuffer << ">"; } +// PDF spec 1.7 Section 5.9.2: "Unicode character sequences as expressed in +// UTF-16BE encoding." See https://en.wikipedia.org/wiki/UTF-16#Description +void AddUnicode(CFX_ByteTextBuf* pBuffer, uint32_t unicode) { + char ans[8]; + *pBuffer << "<"; + size_t numChars = FXSYS_ToUTF16BE(unicode, ans); + for (size_t i = 0; i < numChars; ++i) + pBuffer->AppendChar(ans[i]); + *pBuffer << ">"; +} + // Loads the charcode to unicode mapping into a stream CPDF_Stream* LoadUnicode(CPDF_Document* pDoc, const std::map& to_unicode) { @@ -173,37 +181,37 @@ CPDF_Stream* LoadUnicode(CPDF_Document* pDoc, } // Add maps to buffer buffer << static_cast(char_to_uni.size()) << " beginbfchar\n"; - for (auto iter : char_to_uni) { - AddNum(&buffer, iter.first); + for (const auto& iter : char_to_uni) { + AddCharcode(&buffer, iter.first); buffer << " "; - AddNum(&buffer, iter.second); + AddUnicode(&buffer, iter.second); buffer << "\n"; } buffer << "endbfchar\n" << static_cast(map_range_vector.size() + map_range.size()) << " beginbfrange\n"; - for (auto iter : map_range_vector) { + for (const auto& iter : map_range_vector) { const std::pair& charcodeRange = iter.first; - AddNum(&buffer, charcodeRange.first); + AddCharcode(&buffer, charcodeRange.first); buffer << " "; - AddNum(&buffer, charcodeRange.second); + AddCharcode(&buffer, charcodeRange.second); buffer << " ["; const std::vector& unicodes = iter.second; for (size_t i = 0; i < unicodes.size(); ++i) { uint32_t uni = unicodes[i]; - AddNum(&buffer, uni); + AddUnicode(&buffer, uni); if (i != unicodes.size() - 1) buffer << " "; } buffer << "]\n"; } - for (auto iter : map_range) { + for (const auto& iter : map_range) { const std::pair& charcodeRange = iter.first; - AddNum(&buffer, charcodeRange.first); + AddCharcode(&buffer, charcodeRange.first); buffer << " "; - AddNum(&buffer, charcodeRange.second); + AddCharcode(&buffer, charcodeRange.second); buffer << " "; - AddNum(&buffer, iter.second); + AddUnicode(&buffer, iter.second); buffer << "\n"; } // TODO(npm): Encrypt / Compress? @@ -389,10 +397,10 @@ DLLEXPORT FPDF_PAGEOBJECT STDCALL FPDFPageObj_NewTextObj(FPDF_DOCUMENT document, DLLEXPORT FPDF_BOOL STDCALL FPDFText_SetText(FPDF_PAGEOBJECT text_object, FPDF_WIDESTRING text) { - if (!text_object) + auto* pTextObj = static_cast(text_object); + if (!pTextObj) return false; - auto* pTextObj = reinterpret_cast(text_object); FX_STRSIZE len = CFX_WideString::WStringLength(text); CFX_WideString encodedText = CFX_WideString::FromUTF16LE(text, len); CFX_ByteString byteText; @@ -428,10 +436,10 @@ DLLEXPORT FPDF_FONT STDCALL FPDFText_LoadFont(FPDF_DOCUMENT document, } DLLEXPORT void STDCALL FPDFFont_Close(FPDF_FONT font) { - if (!font) + CPDF_Font* pFont = static_cast(font); + if (!pFont) return; - CPDF_Font* pFont = reinterpret_cast(font); CPDF_Document* pDoc = pFont->GetDocument(); if (!pDoc) return; @@ -445,14 +453,11 @@ DLLEXPORT FPDF_PAGEOBJECT STDCALL FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, FPDF_FONT font, float font_size) { - if (!font) - return nullptr; - CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document); - if (!pDoc) + CPDF_Font* pFont = static_cast(font); + if (!pDoc || !pFont) return nullptr; - CPDF_Font* pFont = reinterpret_cast(font); auto pTextObj = pdfium::MakeUnique(); pTextObj->m_TextState.SetFont(pDoc->LoadFont(pFont->GetFontDict())); pTextObj->m_TextState.SetFontSize(font_size); -- cgit v1.2.3