summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nicolas Pena <npm@chromium.org> 2017-05-03 10:23:49 -0400
committer Chromium commit bot <commit-bot@chromium.org> 2017-05-03 14:36:16 +0000
commit f45ade3a0af908a1d6a51c5cc675f81517c9a22a (patch)
tree 91b01053cf00cb584e31b3f8f012074f3d989573
parent 69c743ff1034fc127fabfb9cc6fdb90dc7904e7d (diff)
download pdfium-f45ade3a0af908a1d6a51c5cc675f81517c9a22a.tar.xz
Add a ToUnicode mapping when loading CID fonts
This CL adds ToUnicode for CID fonts and adds a test to prove that using it works as intended. The test uses a Linux font for Japanese characters, and tests for other OSes will be added in a follow-up. The ToUnicode works by defining the PDF charcodes as equal to the glyph indices and assuming that the FreeType charcodes given by FXFT_Get_Next_Char are in fact the unicode values.

Bug: pdfium:667
Change-Id: I419724b87c3936c730a05f771548ae4787a576eb
Reviewed-on: https://pdfium-review.googlesource.com/4810
Commit-Queue: Nicolás Peña <npm@chromium.org>
Reviewed-by: dsinclair <dsinclair@chromium.org>
-rw-r--r-- fpdfsdk/fpdfedit_embeddertest.cpp 90
-rw-r--r-- fpdfsdk/fpdfedittext.cpp 240
2 files changed, 270 insertions, 60 deletions
diff --git a/fpdfsdk/fpdfedit_embeddertest.cpp b/fpdfsdk/fpdfedit_embeddertest.cpp
index 01ac01e706..383e207f54 100644
--- a/fpdfsdk/fpdfedit_embeddertest.cpp
+++ b/fpdfsdk/fpdfedit_embeddertest.cpp
@@ -646,21 +646,7 @@ TEST_F(FPDFEditEmbeddertest, LoadCIDType0Font) {
// Check widths
CPDF_Array* widths_array = cidfont_dict->GetArrayFor("W");
ASSERT_TRUE(widths_array);
- // Note: widths can be described in different ways in the widths array. The
- // following checks are specific to our current implementation.
- EXPECT_EQ(32, widths_array->GetNumberAt(0));
- CPDF_Array* arr = widths_array->GetArrayAt(1);
- ASSERT_TRUE(arr);
- // This font support chars 32 to 126
- EXPECT_EQ(95U, arr->GetCount());
- EXPECT_EQ(250, arr->GetNumberAt(0));
- EXPECT_EQ(610, arr->GetNumberAt(44));
- EXPECT_EQ(541, arr->GetNumberAt(94));
- // Next range: 160 - 383
- EXPECT_EQ(160, widths_array->GetNumberAt(2));
- arr = widths_array->GetArrayAt(3);
- ASSERT_TRUE(arr);
-
+ EXPECT_GT(widths_array->GetCount(), 1U);
CheckCompositeFontWidths(widths_array, typed_font);
}
@@ -786,3 +772,77 @@ TEST_F(FPDFEditEmbeddertest, AddTrueTypeFontText) {
FPDF_ClosePage(new_page);
FPDF_CloseDocument(new_doc);
}
+
+// TODO(npm): Add tests using Japanese fonts in other OS.
+#if _FXM_PLATFORM_ == _FXM_PLATFORM_LINUX_
+TEST_F(FPDFEditEmbeddertest, AddCIDFontText) {
+  // Create a new document and give it one blank 612x792 page.
+  FPDF_PAGE page = FPDFPage_New(CreateNewDocument(), 0, 612, 792);
+  CFX_Font CIDfont;
+  {
+    // Load IPAGothic through the font substitution path and grab its bytes.
+    CIDfont.LoadSubst("IPAGothic", 1, 0, 400, 0, 932, 0);
+    EXPECT_EQ("IPAGothic", CIDfont.GetFaceName());
+    const uint32_t font_size = CIDfont.GetSize();
+    const uint8_t* font_data = CIDfont.GetFontData();
+
+    // Hand the bytes to the public API; the final argument (1) requests the
+    // CID (composite) font path.
+    std::unique_ptr<void, FPDFFontDeleter> loaded_font(FPDFText_LoadFont(
+        document(), font_data, font_size, FPDF_FONT_TRUETYPE, 1));
+    ASSERT_TRUE(loaded_font.get());
+
+    // First text object: plain Latin letters at 12pt, placed at (200, 200).
+    FPDF_PAGEOBJECT latin_object =
+        FPDFPageObj_CreateTextObj(document(), loaded_font.get(), 12.0f);
+    ASSERT_TRUE(latin_object);
+    std::wstring latin_wide = L"ABCDEFGhijklmnop.";
+    std::unique_ptr<unsigned short, pdfium::FreeDeleter> latin_text =
+        GetFPDFWideString(latin_wide);
+    EXPECT_TRUE(FPDFText_SetText(latin_object, latin_text.get()));
+    FPDFPageObj_Transform(latin_object, 1, 0, 0, 1, 200, 200);
+    FPDFPage_InsertObject(page, latin_object);
+
+    // Second text object: Japanese characters at 18pt, placed at (100, 500).
+    FPDF_PAGEOBJECT japanese_object =
+        FPDFPageObj_CreateTextObj(document(), loaded_font.get(), 18.0f);
+    ASSERT_TRUE(japanese_object);
+    std::wstring japanese_wide =
+        L"\u3053\u3093\u306B\u3061\u306f\u4e16\u754C\u3002\u3053\u3053\u306B1"
+        L"\u756A";
+    std::unique_ptr<unsigned short, pdfium::FreeDeleter> japanese_text =
+        GetFPDFWideString(japanese_wide);
+    EXPECT_TRUE(FPDFText_SetText(japanese_object, japanese_text.get()));
+    FPDFPageObj_Transform(japanese_object, 1, 0, 0, 1, 100, 500);
+    FPDFPage_InsertObject(page, japanese_object);
+  }
+
+  // The in-memory page must render to the expected bitmap.
+  const char md5[] = "2bc6c1aaa2252e73246a75775ccf38c2";
+  EXPECT_TRUE(FPDFPage_GenerateContent(page));
+  FPDF_BITMAP rendered = RenderPage(page);
+  CompareBitmap(rendered, 612, 792, md5);
+  FPDFBitmap_Destroy(rendered);
+
+  // Serialize the document, then release the page.
+  EXPECT_TRUE(FPDF_SaveAsCopy(document(), this, 0));
+  FPDF_ClosePage(page);
+  std::string saved_pdf = GetString();
+
+  // Reload the serialized bytes; the round-tripped page must render to the
+  // same bitmap as the in-memory one.
+  FPDF_FILEACCESS reader;
+  memset(&reader, 0, sizeof(reader));
+  reader.m_Param = &saved_pdf;
+  reader.m_GetBlock = GetBlockFromString;
+  reader.m_FileLen = saved_pdf.size();
+  FPDF_DOCUMENT reloaded_doc = FPDF_LoadCustomDocument(&reader, nullptr);
+  ASSERT_NE(nullptr, reloaded_doc);
+  EXPECT_EQ(1, FPDF_GetPageCount(reloaded_doc));
+  FPDF_PAGE reloaded_page = FPDF_LoadPage(reloaded_doc, 0);
+  ASSERT_NE(nullptr, reloaded_page);
+  FPDF_BITMAP reloaded_bitmap = RenderPage(reloaded_page);
+  CompareBitmap(reloaded_bitmap, 612, 792, md5);
+  FPDFBitmap_Destroy(reloaded_bitmap);
+  FPDF_ClosePage(reloaded_page);
+  FPDF_CloseDocument(reloaded_doc);
+}
+#endif // _FXM_PLATFORM_ == _FXM_PLATFORM_LINUX_
diff --git a/fpdfsdk/fpdfedittext.cpp b/fpdfsdk/fpdfedittext.cpp
index f4e1d66bc1..cfb44f513d 100644
--- a/fpdfsdk/fpdfedittext.cpp
+++ b/fpdfsdk/fpdfedittext.cpp
@@ -2,8 +2,10 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <map>
#include <memory>
#include <utility>
+#include <vector>
#include "core/fpdfapi/cpdf_modulemgr.h"
#include "core/fpdfapi/font/cpdf_font.h"
@@ -74,6 +76,144 @@ CPDF_Dictionary* LoadFontDesc(CPDF_Document* pDoc,
return fontDesc;
}
+// Fixed prologue of the generated ToUnicode CMap: the CIDInit boilerplate, an
+// Adobe/Identity CIDSystemInfo, and a single two-byte codespace range
+// <0000>..<FFFF>. LoadUnicode() writes this first and appends the
+// bfchar/bfrange sections after it.
+// Both ends of a codespace range must have the same byte length, /CMapType
+// must be a name (leading slash), and the section must be closed with
+// endcodespacerange for the CMap to parse.
+const char ToUnicodeStart[] =
+    "/CIDInit /ProcSet findresource begin\n"
+    "12 dict begin\n"
+    "begincmap\n"
+    "/CIDSystemInfo\n"
+    "<</Registry (Adobe)\n"
+    "/Ordering (Identity)\n"
+    "/Supplement 0\n"
+    ">> def\n"
+    "/CMapName /Adobe-Identity-H def\n"
+    "/CMapType 2 def\n"
+    "1 begincodespacerange\n"
+    "<0000> <FFFF>\n"
+    "endcodespacerange\n";
+
+const char hex[] = "0123456789ABCDEF";
+
+// Appends |number| to |pBuffer| as a PDF hex string made of 16-bit big-endian
+// units, each written as four uppercase hex digits.
+//  - Values up to 0xFFFF produce one unit, e.g. 0x3053 -> "<3053>".
+//  - Code points in 0x10000..0x10FFFF produce a UTF-16 surrogate pair, e.g.
+//    0x2003E -> "<D840DC3E>", instead of being truncated to their low 16 bits.
+//  - Anything larger is not a Unicode code point; only its low 16 bits are
+//    written, as before.
+// NOTE(review): this helper is also used for charcodes, which are assumed to
+// fit in 16 bits -- confirm against the callers.
+void AddNum(CFX_ByteTextBuf* pBuffer, uint32_t number) {
+  uint32_t units[2];
+  size_t unit_count = 0;
+  if (number > 0xFFFF && number <= 0x10FFFF) {
+    const uint32_t offset = number - 0x10000;
+    units[unit_count++] = 0xD800 + (offset >> 10);    // high surrogate
+    units[unit_count++] = 0xDC00 + (offset & 0x3FF);  // low surrogate
+  } else {
+    units[unit_count++] = number & 0xFFFF;
+  }
+
+  *pBuffer << "<";
+  for (size_t i = 0; i < unit_count; ++i) {
+    // Most significant nibble first.
+    for (int shift = 12; shift >= 0; shift -= 4)
+      pBuffer->AppendChar(hex[(units[i] >> shift) & 0xF]);
+  }
+  *pBuffer << ">";
+}
+
+// Builds the ToUnicode CMap for a font and stores it in a new indirect stream
+// owned by |pDoc|.
+//
+// |to_unicode| maps each charcode to its unicode value. Every entry is written
+// in one of three forms:
+//   bfchar   <code> <unicode>               isolated charcodes
+//   bfrange  <first> <last> <unicode>       consecutive charcodes mapping to
+//                                           consecutive unicodes
+//   bfrange  <first> <last> [<u0> <u1> ..]  consecutive charcodes mapping to
+//                                           arbitrary unicodes
+//
+// The stream starts with ToUnicodeStart and is closed with endbfrange plus the
+// standard CMap trailer. Returns the newly created stream.
+CPDF_Stream* LoadUnicode(CPDF_Document* pDoc,
+                         const std::map<uint32_t, uint32_t>& to_unicode) {
+  // Isolated entries: charcode -> unicode.
+  std::map<uint32_t, uint32_t> char_to_uni;
+  // <first, last> charcode range -> vector v of size (last - first + 1), with
+  // first -> v[0], first + 1 -> v[1], etc.
+  std::map<std::pair<uint32_t, uint32_t>, std::vector<uint32_t>>
+      map_range_vector;
+  // <first, last> charcode range -> unicode of |first|. This abbreviates
+  // first -> unicode, first + 1 -> unicode + 1, etc.
+  std::map<std::pair<uint32_t, uint32_t>, uint32_t> map_range;
+
+  // Calculate the maps
+  for (auto iter = to_unicode.begin(); iter != to_unicode.end(); ++iter) {
+    const uint32_t firstCharcode = iter->first;
+    const uint32_t firstUnicode = iter->second;
+    if (std::next(iter) == to_unicode.end() ||
+        firstCharcode + 1 != std::next(iter)->first) {
+      // No consecutive successor: nothing to group with.
+      char_to_uni[firstCharcode] = firstUnicode;
+      continue;
+    }
+    ++iter;
+    uint32_t curCharcode = iter->first;
+    uint32_t curUnicode = iter->second;
+    if (curCharcode % 256 == 0) {
+      // The successor starts a new high byte, and the charcodes of a range may
+      // differ only in their last byte, so emit both as single entries.
+      char_to_uni[firstCharcode] = firstUnicode;
+      char_to_uni[curCharcode] = curUnicode;
+      continue;
+    }
+    // How many more charcodes fit before the last charcode byte would wrap.
+    const size_t maxExtraCharcodes = 255 - (curCharcode % 256);
+    auto next_it = std::next(iter);
+
+    // The <first> <last> <unicode> form is expanded by incrementing only the
+    // last byte of the unicode (PDF spec 1.7 Section 5.9.2), so it is usable
+    // only while that byte stays <= 0xFF. If the very first step would
+    // already wrap it, use the array form, which lists every unicode.
+    const bool useIncrementForm =
+        firstUnicode + 1 == curUnicode && curUnicode % 256 != 0;
+    if (!useIncrementForm) {
+      std::vector<uint32_t> unicodes;
+      unicodes.push_back(firstUnicode);
+      unicodes.push_back(curUnicode);
+      for (size_t i = 0; i < maxExtraCharcodes; ++i) {
+        if (next_it == to_unicode.end() || curCharcode + 1 != next_it->first)
+          break;
+        ++iter;
+        ++curCharcode;
+        unicodes.push_back(iter->second);
+        next_it = std::next(iter);
+      }
+      ASSERT(iter->first - firstCharcode + 1 == unicodes.size());
+      map_range_vector[std::make_pair(firstCharcode, iter->first)] =
+          std::move(unicodes);
+      continue;
+    }
+
+    // Consecutive charcodes mapping to consecutive unicodes. Stop before
+    // either the charcode's or the unicode's last byte would pass 0xFF.
+    const size_t maxExtraUnicodes = 255 - (curUnicode % 256);
+    const size_t maxExtra = maxExtraUnicodes < maxExtraCharcodes
+                                ? maxExtraUnicodes
+                                : maxExtraCharcodes;
+    for (size_t i = 0; i < maxExtra; ++i) {
+      if (next_it == to_unicode.end() || curCharcode + 1 != next_it->first ||
+          curUnicode + 1 != next_it->second) {
+        break;
+      }
+      ++iter;
+      ++curCharcode;
+      ++curUnicode;
+      next_it = std::next(iter);
+    }
+    map_range[std::make_pair(firstCharcode, curCharcode)] = firstUnicode;
+  }
+
+  // Add maps to buffer
+  CFX_ByteTextBuf buffer;
+  buffer << ToUnicodeStart;
+  buffer << static_cast<uint32_t>(char_to_uni.size()) << " beginbfchar\n";
+  for (const auto& entry : char_to_uni) {
+    AddNum(&buffer, entry.first);
+    buffer << " ";
+    AddNum(&buffer, entry.second);
+    buffer << "\n";
+  }
+  buffer << "endbfchar\n"
+         << static_cast<uint32_t>(map_range_vector.size() + map_range.size())
+         << " beginbfrange\n";
+  for (const auto& entry : map_range_vector) {
+    const std::pair<uint32_t, uint32_t>& charcodeRange = entry.first;
+    AddNum(&buffer, charcodeRange.first);
+    buffer << " ";
+    AddNum(&buffer, charcodeRange.second);
+    buffer << " [";
+    const std::vector<uint32_t>& unicodes = entry.second;
+    for (size_t i = 0; i < unicodes.size(); ++i) {
+      AddNum(&buffer, unicodes[i]);
+      if (i != unicodes.size() - 1)
+        buffer << " ";
+    }
+    buffer << "]\n";
+  }
+  for (const auto& entry : map_range) {
+    const std::pair<uint32_t, uint32_t>& charcodeRange = entry.first;
+    AddNum(&buffer, charcodeRange.first);
+    buffer << " ";
+    AddNum(&buffer, charcodeRange.second);
+    buffer << " ";
+    AddNum(&buffer, entry.second);
+    buffer << "\n";
+  }
+  // Close the bfrange section and the CMap itself; without this trailer the
+  // stream is not a complete CMap.
+  buffer << "endbfrange\n";
+  buffer << "endcmap\n";
+  buffer << "CMapName currentdict /CMap defineresource pop\n";
+  buffer << "end\n";
+  buffer << "end\n";
+
+  // TODO(npm): Encrypt / Compress?
+  uint32_t bufferSize = buffer.GetSize();
+  auto pDict = pdfium::MakeUnique<CPDF_Dictionary>();
+  pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(bufferSize));
+  return pDoc->NewIndirect<CPDF_Stream>(buffer.DetachBuffer(), bufferSize,
+                                        std::move(pDict));
+}
+
void* LoadSimpleFont(CPDF_Document* pDoc,
std::unique_ptr<CFX_Font> pFont,
const uint8_t* data,
@@ -93,8 +233,7 @@ void* LoadSimpleFont(CPDF_Document* pDoc,
fontDict->SetNewFor<CPDF_Number>("FirstChar", currentChar);
CPDF_Array* widthsArray = pDoc->NewIndirect<CPDF_Array>();
while (true) {
- int width = pFont->GetGlyphWidth(glyphIndex);
- widthsArray->AddNew<CPDF_Number>(width);
+ widthsArray->AddNew<CPDF_Number>(pFont->GetGlyphWidth(glyphIndex));
int nextChar =
FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex);
// Simple fonts have 1-byte charcodes only.
@@ -154,57 +293,67 @@ void* LoadCompositeFont(CPDF_Document* pDoc,
uint32_t glyphIndex;
int currentChar = FXFT_Get_First_Char(pFont->GetFace(), &glyphIndex);
- CPDF_Array* widthsArray = pDoc->NewIndirect<CPDF_Array>();
+ // If it doesn't have a single char, just fail
+ if (glyphIndex == 0)
+ return nullptr;
+
+ std::map<uint32_t, uint32_t> to_unicode;
+ std::map<uint32_t, uint32_t> widths;
while (true) {
- int width = pFont->GetGlyphWidth(glyphIndex);
- int nextChar =
+ widths[glyphIndex] = pFont->GetGlyphWidth(glyphIndex);
+ to_unicode[glyphIndex] = currentChar;
+ currentChar =
FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex);
- if (glyphIndex == 0) {
+ if (glyphIndex == 0)
+ break;
+ }
+ CPDF_Array* widthsArray = pDoc->NewIndirect<CPDF_Array>();
+ for (auto it = widths.begin(); it != widths.end(); ++it) {
+ int ch = it->first;
+ int w = it->second;
+ if (std::next(it) == widths.end()) {
// Only one char left, use format c [w]
auto oneW = pdfium::MakeUnique<CPDF_Array>();
- oneW->AddNew<CPDF_Number>(width);
- widthsArray->AddNew<CPDF_Number>(currentChar);
+ oneW->AddNew<CPDF_Number>(w);
+ widthsArray->AddNew<CPDF_Number>(ch);
widthsArray->Add(std::move(oneW));
break;
}
- int nextWidth = pFont->GetGlyphWidth(glyphIndex);
- if (nextChar == currentChar + 1 && nextWidth == width) {
+ ++it;
+ int next_ch = it->first;
+ int next_w = it->second;
+ if (next_ch == ch + 1 && next_w == w) {
// The array can have a group c_first c_last w: all CIDs in the range from
// c_first to c_last will have width w
- widthsArray->AddNew<CPDF_Number>(currentChar);
- currentChar = nextChar;
+ widthsArray->AddNew<CPDF_Number>(ch);
+ ch = next_ch;
while (true) {
- nextChar =
- FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex);
- if (glyphIndex == 0)
- break;
- nextWidth = pFont->GetGlyphWidth(glyphIndex);
- if (nextChar != currentChar + 1 || nextWidth != width)
- break;
- currentChar = nextChar;
- }
- widthsArray->AddNew<CPDF_Number>(currentChar);
- widthsArray->AddNew<CPDF_Number>(width);
- } else {
- // Otherwise we can have a group of the form c [w1 w2 ...]: c has width
- // w1, c+1 has width w2, etc.
- widthsArray->AddNew<CPDF_Number>(currentChar);
- auto curWidthArray = pdfium::MakeUnique<CPDF_Array>();
- curWidthArray->AddNew<CPDF_Number>(width);
- while (nextChar == currentChar + 1) {
- curWidthArray->AddNew<CPDF_Number>(nextWidth);
- currentChar = nextChar;
- nextChar =
- FXFT_Get_Next_Char(pFont->GetFace(), currentChar, &glyphIndex);
- if (glyphIndex == 0)
+ auto next_it = std::next(it);
+ if (next_it == widths.end() || next_it->first != it->first + 1 ||
+ next_it->second != it->second) {
break;
- nextWidth = pFont->GetGlyphWidth(glyphIndex);
+ }
+ ++it;
+ ch = it->first;
}
- widthsArray->Add(std::move(curWidthArray));
+ widthsArray->AddNew<CPDF_Number>(ch);
+ widthsArray->AddNew<CPDF_Number>(w);
+ continue;
}
- if (glyphIndex == 0)
- break;
- currentChar = nextChar;
+ // Otherwise we can have a group of the form c [w1 w2 ...]: c has width
+ // w1, c+1 has width w2, etc.
+ widthsArray->AddNew<CPDF_Number>(ch);
+ auto curWidthArray = pdfium::MakeUnique<CPDF_Array>();
+ curWidthArray->AddNew<CPDF_Number>(w);
+ curWidthArray->AddNew<CPDF_Number>(next_w);
+ while (true) {
+ auto next_it = std::next(it);
+ if (next_it == widths.end() || next_it->first != it->first + 1)
+ break;
+ ++it;
+ curWidthArray->AddNew<CPDF_Number>(static_cast<int>(it->second));
+ }
+ widthsArray->Add(std::move(curWidthArray));
}
pCIDFont->SetNewFor<CPDF_Reference>("W", pDoc, widthsArray->GetObjNum());
// TODO(npm): Support vertical writing
@@ -212,7 +361,9 @@ void* LoadCompositeFont(CPDF_Document* pDoc,
auto pDescendant = pdfium::MakeUnique<CPDF_Array>();
pDescendant->AddNew<CPDF_Reference>(pDoc, pCIDFont->GetObjNum());
fontDict->SetFor("DescendantFonts", std::move(pDescendant));
- // TODO(npm): do we need a ToUnicode?
+ CPDF_Stream* toUnicodeStream = LoadUnicode(pDoc, to_unicode);
+ fontDict->SetNewFor<CPDF_Reference>("ToUnicode", pDoc,
+ toUnicodeStream->GetObjNum());
return pDoc->LoadFont(fontDict);
}
@@ -245,10 +396,9 @@ DLLEXPORT FPDF_BOOL STDCALL FPDFText_SetText(FPDF_PAGEOBJECT text_object,
FX_STRSIZE len = CFX_WideString::WStringLength(text);
CFX_WideString encodedText = CFX_WideString::FromUTF16LE(text, len);
CFX_ByteString byteText;
- for (int i = 0; i < encodedText.GetLength(); ++i) {
- uint32_t charcode =
- pTextObj->GetFont()->CharCodeFromUnicode(encodedText[i]);
- pTextObj->GetFont()->AppendChar(&byteText, charcode);
+ for (wchar_t wc : encodedText) {
+ pTextObj->GetFont()->AppendChar(
+ &byteText, pTextObj->GetFont()->CharCodeFromUnicode(wc));
}
pTextObj->SetText(byteText);
return true;