diff options
author | Ryan Harrison <rharrison@chromium.org> | 2018-03-12 16:12:44 +0000 |
---|---|---|
committer | Chromium commit bot <commit-bot@chromium.org> | 2018-03-12 16:12:44 +0000 |
commit | 64c664387d71ed01d18ab2b23327bbdd9757bd46 (patch) | |
tree | 6bd8c625c78108f269b27fb9ccfa92c10239b937 /fpdfsdk/fpdftext_embeddertest.cpp | |
parent | 735eda96cf24349d10b160c8f5bd363b73d8aba1 (diff) | |
download | pdfium-64c664387d71ed01d18ab2b23327bbdd9757bd46.tar.xz |
Add test that non-ASCII characters do not foul hyphen processing
In this doc there is a hyphen followed by a fi ligature. Older versions of
PDFium were not inserting the appropriate soft hyphen marker (0x0002)
at this point, since they were only checking for ASCII characters. That
bug is fixed in the preceding CLs, so this test is being added to avoid
regressions.
BUG=pdfium:1029
Change-Id: I9c9906de5a13ade1400a589fb18967938150516d
Reviewed-on: https://pdfium-review.googlesource.com/28470
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Reviewed-by: dsinclair <dsinclair@chromium.org>
Diffstat (limited to 'fpdfsdk/fpdftext_embeddertest.cpp')
-rw-r--r-- | fpdfsdk/fpdftext_embeddertest.cpp | 42 |
1 files changed, 42 insertions, 0 deletions
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 86d32a711a..6c30fc6b5b 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -641,3 +641,45 @@ TEST_F(FPDFTextEmbeddertest, ControlCharacters) { FPDFText_ClosePage(textpage); UnloadPage(page); } + +// Testing that hyphen markers (0x0002) are replacing hard hyphens when +// the word contains non-ASCII characters. +TEST_F(FPDFTextEmbeddertest, bug_1029) { + EXPECT_TRUE(OpenDocument("bug_1029.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + + constexpr int page_range_offset = 171; + constexpr int page_range_length = 56; + + // This text is: + // 'METADATA table. When the split has committed, it noti' followed + // by a 'soft hyphen' (0x0002) and then 'fi'. + // + // The original text has a fi ligature, but that is broken up into + // two characters when the PDF is processed. + constexpr unsigned int expected[] = { + 0x004d, 0x0045, 0x0054, 0x0041, 0x0044, 0x0041, 0x0054, 0x0041, + 0x0020, 0x0074, 0x0061, 0x0062, 0x006c, 0x0065, 0x002e, 0x0020, + 0x0057, 0x0068, 0x0065, 0x006e, 0x0020, 0x0074, 0x0068, 0x0065, + 0x0020, 0x0073, 0x0070, 0x006c, 0x0069, 0x0074, 0x0020, 0x0068, + 0x0061, 0x0073, 0x0020, 0x0063, 0x006f, 0x006d, 0x006d, 0x0069, + 0x0074, 0x0074, 0x0065, 0x0064, 0x002c, 0x0020, 0x0069, 0x0074, + 0x0020, 0x006e, 0x006f, 0x0074, 0x0069, 0x0002, 0x0066, 0x0069}; + static_assert(page_range_length == FX_ArraySize(expected), + "Expected should be the same size as the range being " + "extracted from page."); + EXPECT_LT(page_range_offset + page_range_length, + FPDFText_CountChars(textpage)); + + for (int i = 0; i < page_range_length; ++i) { + EXPECT_EQ(expected[i], + FPDFText_GetUnicode(textpage, page_range_offset + i)); + } + + FPDFText_ClosePage(textpage); + UnloadPage(page); +} |