From f45ade3a0af908a1d6a51c5cc675f81517c9a22a Mon Sep 17 00:00:00 2001 From: Nicolas Pena Date: Wed, 3 May 2017 10:23:49 -0400 Subject: Add a ToUnicode mapping when loading CID fonts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL adds ToUnicode for CID fonts and adds a test to prove that using it works as intended. The test uses a Linux font for Japanese characters, and tests for other OS will be added in a followup. The ToUnicode works by defining the PDF charcodes as equal to the glyph indices and assuming that the freetype charcodes given by FXFT_Get_Next_Char are in fact the unicode values. Bug: pdfium:667 Change-Id: I419724b87c3936c730a05f771548ae4787a576eb Reviewed-on: https://pdfium-review.googlesource.com/4810 Commit-Queue: Nicolás Peña Reviewed-by: dsinclair --- fpdfsdk/fpdfedit_embeddertest.cpp | 90 +++++++++++--- fpdfsdk/fpdfedittext.cpp | 240 +++++++++++++++++++++++++++++++------- 2 files changed, 270 insertions(+), 60 deletions(-) diff --git a/fpdfsdk/fpdfedit_embeddertest.cpp b/fpdfsdk/fpdfedit_embeddertest.cpp index 01ac01e706..383e207f54 100644 --- a/fpdfsdk/fpdfedit_embeddertest.cpp +++ b/fpdfsdk/fpdfedit_embeddertest.cpp @@ -646,21 +646,7 @@ TEST_F(FPDFEditEmbeddertest, LoadCIDType0Font) { // Check widths CPDF_Array* widths_array = cidfont_dict->GetArrayFor("W"); ASSERT_TRUE(widths_array); - // Note: widths can be described in different ways in the widths array. The - // following checks are specific to our current implementation. - EXPECT_EQ(32, widths_array->GetNumberAt(0)); - CPDF_Array* arr = widths_array->GetArrayAt(1); - ASSERT_TRUE(arr); - // This font support chars 32 to 126 - EXPECT_EQ(95U, arr->GetCount()); - EXPECT_EQ(250, arr->GetNumberAt(0)); - EXPECT_EQ(610, arr->GetNumberAt(44)); - EXPECT_EQ(541, arr->GetNumberAt(94)); - // Next range: 160 - 383 - EXPECT_EQ(160, widths_array->GetNumberAt(2)); - arr = widths_array->GetArrayAt(3); - ASSERT_TRUE(arr); - + EXPECT_GT(widths_array->GetCount(), 1U); CheckCompositeFontWidths(widths_array, typed_font); } @@ -786,3 +772,77 @@ TEST_F(FPDFEditEmbeddertest, AddTrueTypeFontText) { FPDF_ClosePage(new_page); FPDF_CloseDocument(new_doc); } + +// TODO(npm): Add tests using Japanese fonts in other OS. +#if _FXM_PLATFORM_ == _FXM_PLATFORM_LINUX_ +TEST_F(FPDFEditEmbeddertest, AddCIDFontText) { + // Start with a blank page + FPDF_PAGE page = FPDFPage_New(CreateNewDocument(), 0, 612, 792); + CFX_Font CIDfont; + { + // First, get the data from the font + CIDfont.LoadSubst("IPAGothic", 1, 0, 400, 0, 932, 0); + EXPECT_EQ("IPAGothic", CIDfont.GetFaceName()); + const uint8_t* data = CIDfont.GetFontData(); + const uint32_t size = CIDfont.GetSize(); + + // Load the data into a FPDF_Font. + std::unique_ptr font( + FPDFText_LoadFont(document(), data, size, FPDF_FONT_TRUETYPE, 1)); + ASSERT_TRUE(font.get()); + + // Add some text to the page + FPDF_PAGEOBJECT text_object = + FPDFPageObj_CreateTextObj(document(), font.get(), 12.0f); + ASSERT_TRUE(text_object); + std::wstring wstr = L"ABCDEFGhijklmnop."; + std::unique_ptr text = + GetFPDFWideString(wstr); + EXPECT_TRUE(FPDFText_SetText(text_object, text.get())); + FPDFPageObj_Transform(text_object, 1, 0, 0, 1, 200, 200); + FPDFPage_InsertObject(page, text_object); + + // And add some Japanese characters + FPDF_PAGEOBJECT text_object2 = + FPDFPageObj_CreateTextObj(document(), font.get(), 18.0f); + ASSERT_TRUE(text_object2); + std::wstring wstr2 = + L"\u3053\u3093\u306B\u3061\u306f\u4e16\u754C\u3002\u3053\u3053\u306B1" + L"\u756A"; + std::unique_ptr text2 = + GetFPDFWideString(wstr2); + EXPECT_TRUE(FPDFText_SetText(text_object2, text2.get())); + FPDFPageObj_Transform(text_object2, 1, 0, 0, 1, 100, 500); + FPDFPage_InsertObject(page, text_object2); + } + + // Generate contents and check that the text renders properly. + EXPECT_TRUE(FPDFPage_GenerateContent(page)); + FPDF_BITMAP page_bitmap = RenderPage(page); + const char md5[] = "2bc6c1aaa2252e73246a75775ccf38c2"; + CompareBitmap(page_bitmap, 612, 792, md5); + FPDFBitmap_Destroy(page_bitmap); + + // Save the document, close the page. + EXPECT_TRUE(FPDF_SaveAsCopy(document(), this, 0)); + FPDF_ClosePage(page); + std::string new_file = GetString(); + + // Render the saved result + FPDF_FILEACCESS file_access; + memset(&file_access, 0, sizeof(file_access)); + file_access.m_FileLen = new_file.size(); + file_access.m_GetBlock = GetBlockFromString; + file_access.m_Param = &new_file; + FPDF_DOCUMENT new_doc = FPDF_LoadCustomDocument(&file_access, nullptr); + ASSERT_NE(nullptr, new_doc); + EXPECT_EQ(1, FPDF_GetPageCount(new_doc)); + FPDF_PAGE new_page = FPDF_LoadPage(new_doc, 0); + ASSERT_NE(nullptr, new_page); + FPDF_BITMAP new_bitmap = RenderPage(new_page); + CompareBitmap(new_bitmap, 612, 792, md5); + FPDFBitmap_Destroy(new_bitmap); + FPDF_ClosePage(new_page); + FPDF_CloseDocument(new_doc); +} +#endif // _FXM_PLATFORM_ == _FXM_PLATFORM_LINUX_ diff --git a/fpdfsdk/fpdfedittext.cpp b/fpdfsdk/fpdfedittext.cpp index f4e1d66bc1..cfb44f513d 100644 --- a/fpdfsdk/fpdfedittext.cpp +++ b/fpdfsdk/fpdfedittext.cpp @@ -2,8 +2,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#include #include #include +#include #include "core/fpdfapi/cpdf_modulemgr.h" #include "core/fpdfapi/font/cpdf_font.h" @@ -74,6 +76,144 @@ CPDF_Dictionary* LoadFontDesc(CPDF_Document* pDoc, return fontDesc; } +const char ToUnicodeStart[] = + "/CIDInit /ProcSet findresource begin\n" + "12 dict begin\n" + "begincmap\n" + "/CIDSystemInfo\n" + "<> def\n" + "/CMapName /Adobe-Identity-H def\n" + "CMapType 2 def\n" + "1 begincodespacerange\n" + "<0000> \n"; + +const char hex[] = "0123456789ABCDEF"; + +void AddNum(CFX_ByteTextBuf* pBuffer, uint32_t number) { + *pBuffer << "<"; + char ans[4]; + for (size_t i = 0; i < 4; ++i) { + ans[3 - i] = hex[number % 16]; + number /= 16; + } + for (size_t i = 0; i < 4; ++i) + pBuffer->AppendChar(ans[i]); + *pBuffer << ">"; +} + +// Loads the charcode to unicode mapping into a stream +CPDF_Stream* LoadUnicode(CPDF_Document* pDoc, + const std::map& to_unicode) { + CFX_ByteTextBuf buffer; + buffer << ToUnicodeStart; + // A map charcode->unicode + std::map char_to_uni; + // A map to vector v of unicode characters of size (end + // - start + 1). This abbreviates: start->v[0], start+1->v[1], etc. PDF spec + // 1.7 Section 5.9.2 says that only the last byte of the unicode may change. + std::map, std::vector> + map_range_vector; + // A map -> unicode + // This abbreviates: start->unicode, start+1->unicode+1, etc. + // PDF spec 1.7 Section 5.9.2 says that only the last byte of the unicode may + // change. + std::map, uint32_t> map_range; + + // Calculate the maps + for (auto iter = to_unicode.begin(); iter != to_unicode.end(); ++iter) { + uint32_t firstCharcode = iter->first; + uint32_t firstUnicode = iter->second; + if (std::next(iter) == to_unicode.end() || + firstCharcode + 1 != std::next(iter)->first) { + char_to_uni[firstCharcode] = firstUnicode; + continue; + } + ++iter; + uint32_t curCharcode = iter->first; + uint32_t curUnicode = iter->second; + if (curCharcode % 256 == 0) { + char_to_uni[firstCharcode] = firstUnicode; + char_to_uni[curCharcode] = curUnicode; + continue; + } + const size_t maxExtra = 255 - (curCharcode % 256); + auto next_it = std::next(iter); + if (firstUnicode + 1 != curUnicode) { + // Consecutive charcodes mapping to non-consecutive unicodes + std::vector unicodes; + unicodes.push_back(firstUnicode); + unicodes.push_back(curUnicode); + for (size_t i = 0; i < maxExtra; ++i) { + if (next_it == to_unicode.end() || curCharcode + 1 != next_it->first) + break; + ++iter; + ++curCharcode; + unicodes.push_back(iter->second); + next_it = std::next(iter); + } + ASSERT(iter->first - firstCharcode + 1 == unicodes.size()); + map_range_vector[std::make_pair(firstCharcode, iter->first)] = unicodes; + continue; + } + // Consecutive charcodes mapping to consecutive unicodes + for (size_t i = 0; i < maxExtra; ++i) { + if (next_it == to_unicode.end() || curCharcode + 1 != next_it->first || + curUnicode + 1 != next_it->second) { + break; + } + ++iter; + ++curCharcode; + ++curUnicode; + next_it = std::next(iter); + } + map_range[std::make_pair(firstCharcode, curCharcode)] = firstUnicode; + } + // Add maps to buffer + buffer << static_cast(char_to_uni.size()) << " beginbfchar\n"; + for (auto iter : char_to_uni) { + AddNum(&buffer, iter.first); + buffer << " "; + AddNum(&buffer, iter.second); + buffer << "\n"; + } + buffer << "endbfchar\n" + << static_cast(map_range_vector.size() + map_range.size()) + << " beginbfrange\n"; + for (auto iter : map_range_vector) { + const std::pair& charcodeRange = iter.first; + AddNum(&buffer, charcodeRange.first); + buffer << " "; + AddNum(&buffer, charcodeRange.second); + buffer << " ["; + const std::vector& unicodes = iter.second; + for (size_t i = 0; i < unicodes.size(); ++i) { + uint32_t uni = unicodes[i]; + AddNum(&buffer, uni); + if (i != unicodes.size() - 1) + buffer << " "; + } + buffer << "]\n"; + } + for (auto iter : map_range) { + const std::pair& charcodeRange = iter.first; + AddNum(&buffer, charcodeRange.first); + buffer << " "; + AddNum(&buffer, charcodeRange.second); + buffer << " "; + AddNum(&buffer, iter.second); + buffer << "\n"; + } + // TODO(npm): Encrypt / Compress? + uint32_t bufferSize = buffer.GetSize(); + auto pDict = pdfium::MakeUnique(); + pDict->SetNewFor("Length", static_cast(bufferSize)); + return pDoc->NewIndirect(buffer.DetachBuffer(), bufferSize, + std::move(pDict)); +} + void* LoadSimpleFont(CPDF_Document* pDoc, std::unique_ptr pFont, const uint8_t* data, @@ -93,8 +233,7 @@ void* LoadSimpleFont(CPDF_Document* pDoc, fontDict->SetNewFor("FirstChar", currentChar); CPDF_Array* widthsArray = pDoc->NewIndirect(); while (true) { - int width = pFont->GetGlyphWidth(glyphIndex); - widthsArray->AddNew(width); + widthsArray->AddNew(pFont->GetGlyphWidth(glyphIndex)); int nextChar = FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex); // Simple fonts have 1-byte charcodes only. @@ -154,57 +293,67 @@ void* LoadCompositeFont(CPDF_Document* pDoc, uint32_t glyphIndex; int currentChar = FXFT_Get_First_Char(pFont->GetFace(), &glyphIndex); - CPDF_Array* widthsArray = pDoc->NewIndirect(); + // If it doesn't have a single char, just fail + if (glyphIndex == 0) + return nullptr; + + std::map to_unicode; + std::map widths; while (true) { - int width = pFont->GetGlyphWidth(glyphIndex); - int nextChar = + widths[glyphIndex] = pFont->GetGlyphWidth(glyphIndex); + to_unicode[glyphIndex] = currentChar; + currentChar = FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex); - if (glyphIndex == 0) { + if (glyphIndex == 0) + break; + } + CPDF_Array* widthsArray = pDoc->NewIndirect(); + for (auto it = widths.begin(); it != widths.end(); ++it) { + int ch = it->first; + int w = it->second; + if (std::next(it) == widths.end()) { // Only one char left, use format c [w] auto oneW = pdfium::MakeUnique(); - oneW->AddNew(width); - widthsArray->AddNew(currentChar); + oneW->AddNew(w); + widthsArray->AddNew(ch); widthsArray->Add(std::move(oneW)); break; } - int nextWidth = pFont->GetGlyphWidth(glyphIndex); - if (nextChar == currentChar + 1 && nextWidth == width) { + ++it; + int next_ch = it->first; + int next_w = it->second; + if (next_ch == ch + 1 && next_w == w) { // The array can have a group c_first c_last w: all CIDs in the range from // c_first to c_last will have width w - widthsArray->AddNew(currentChar); - currentChar = nextChar; + widthsArray->AddNew(ch); + ch = next_ch; while (true) { - nextChar = - FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex); - if (glyphIndex == 0) - break; - nextWidth = pFont->GetGlyphWidth(glyphIndex); - if (nextChar != currentChar + 1 || nextWidth != width) - break; - currentChar = nextChar; - } - widthsArray->AddNew(currentChar); - widthsArray->AddNew(width); - } else { - // Otherwise we can have a group of the form c [w1 w2 ...]: c has width - // w1, c+1 has width w2, etc. - widthsArray->AddNew(currentChar); - auto curWidthArray = pdfium::MakeUnique(); - curWidthArray->AddNew(width); - while (nextChar == currentChar + 1) { - curWidthArray->AddNew(nextWidth); - currentChar = nextChar; - nextChar = - FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex); - if (glyphIndex == 0) + auto next_it = std::next(it); + if (next_it == widths.end() || next_it->first != it->first + 1 || + next_it->second != it->second) { break; - nextWidth = pFont->GetGlyphWidth(glyphIndex); + } + ++it; + ch = it->first; } - widthsArray->Add(std::move(curWidthArray)); + widthsArray->AddNew(ch); + widthsArray->AddNew(w); + continue; } - if (glyphIndex == 0) - break; - currentChar = nextChar; + // Otherwise we can have a group of the form c [w1 w2 ...]: c has width + // w1, c+1 has width w2, etc. + widthsArray->AddNew(ch); + auto curWidthArray = pdfium::MakeUnique(); + curWidthArray->AddNew(w); + curWidthArray->AddNew(next_w); + while (true) { + auto next_it = std::next(it); + if (next_it == widths.end() || next_it->first != it->first + 1) + break; + ++it; + curWidthArray->AddNew(static_cast(it->second)); + } + widthsArray->Add(std::move(curWidthArray)); } pCIDFont->SetNewFor("W", pDoc, widthsArray->GetObjNum()); // TODO(npm): Support vertical writing @@ -212,7 +361,9 @@ void* LoadCompositeFont(CPDF_Document* pDoc, auto pDescendant = pdfium::MakeUnique(); pDescendant->AddNew(pDoc, pCIDFont->GetObjNum()); fontDict->SetFor("DescendantFonts", std::move(pDescendant)); - // TODO(npm): do we need a ToUnicode? + CPDF_Stream* toUnicodeStream = LoadUnicode(pDoc, to_unicode); + fontDict->SetNewFor("ToUnicode", pDoc, + toUnicodeStream->GetObjNum()); return pDoc->LoadFont(fontDict); } @@ -245,10 +396,9 @@ DLLEXPORT FPDF_BOOL STDCALL FPDFText_SetText(FPDF_PAGEOBJECT text_object, FX_STRSIZE len = CFX_WideString::WStringLength(text); CFX_WideString encodedText = CFX_WideString::FromUTF16LE(text, len); CFX_ByteString byteText; - for (int i = 0; i < encodedText.GetLength(); ++i) { - uint32_t charcode = - pTextObj->GetFont()->CharCodeFromUnicode(encodedText[i]); - pTextObj->GetFont()->AppendChar(&byteText, charcode); + for (wchar_t wc : encodedText) { + pTextObj->GetFont()->AppendChar( + &byteText, pTextObj->GetFont()->CharCodeFromUnicode(wc)); } pTextObj->SetText(byteText); return true; -- cgit v1.2.3