From 64c664387d71ed01d18ab2b23327bbdd9757bd46 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Mon, 12 Mar 2018 16:12:44 +0000 Subject: Add test that non-ASCII characters to not foul hyphen processing In this doc there is a hyphen followed a fi ligature. Older versions of PDFium were not inserting the appropriate soft hyphen marker (0x0002) at this point, since they were only checking for ASCII characters. That bug is fixed in the preceding CLs, so this test is being added to avoid regressions. BUG=pdfium:1029 Change-Id: I9c9906de5a13ade1400a589fb18967938150516d Reviewed-on: https://pdfium-review.googlesource.com/28470 Commit-Queue: Ryan Harrison Reviewed-by: Henrique Nakashima Reviewed-by: dsinclair --- fpdfsdk/fpdftext_embeddertest.cpp | 42 ++++++++++++++++++++++++++++++++++++++ testing/resources/bug_1029.pdf | Bin 0 -> 54666 bytes 2 files changed, 42 insertions(+) create mode 100644 testing/resources/bug_1029.pdf diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 86d32a711a..6c30fc6b5b 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -641,3 +641,45 @@ TEST_F(FPDFTextEmbeddertest, ControlCharacters) { FPDFText_ClosePage(textpage); UnloadPage(page); } + +// Testing that hyphen makers (0x0002) are replacing hard hyphens when +// the word contains non-ASCII characters. +TEST_F(FPDFTextEmbeddertest, bug_1029) { + EXPECT_TRUE(OpenDocument("bug_1029.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + + constexpr int page_range_offset = 171; + constexpr int page_range_length = 56; + + // This text is: + // 'METADATA table. When the split has committed, it noti' followed + // by a 'soft hyphen' (0x0002) and then 'fi'. + // + // The original text has a fi ligature, but that is broken up into + // two characters when the PDF is processed. + constexpr unsigned int expected[] = { + 0x004d, 0x0045, 0x0054, 0x0041, 0x0044, 0x0041, 0x0054, 0x0041, + 0x0020, 0x0074, 0x0061, 0x0062, 0x006c, 0x0065, 0x002e, 0x0020, + 0x0057, 0x0068, 0x0065, 0x006e, 0x0020, 0x0074, 0x0068, 0x0065, + 0x0020, 0x0073, 0x0070, 0x006c, 0x0069, 0x0074, 0x0020, 0x0068, + 0x0061, 0x0073, 0x0020, 0x0063, 0x006f, 0x006d, 0x006d, 0x0069, + 0x0074, 0x0074, 0x0065, 0x0064, 0x002c, 0x0020, 0x0069, 0x0074, + 0x0020, 0x006e, 0x006f, 0x0074, 0x0069, 0x0002, 0x0066, 0x0069}; + static_assert(page_range_length == FX_ArraySize(expected), + "Expected should be the same size as the range being " + "extracted from page."); + EXPECT_LT(page_range_offset + page_range_length, + FPDFText_CountChars(textpage)); + + for (int i = 0; i < page_range_length; ++i) { + EXPECT_EQ(expected[i], + FPDFText_GetUnicode(textpage, page_range_offset + i)); + } + + FPDFText_ClosePage(textpage); + UnloadPage(page); +} diff --git a/testing/resources/bug_1029.pdf b/testing/resources/bug_1029.pdf new file mode 100644 index 0000000000..c03f03de5b Binary files /dev/null and b/testing/resources/bug_1029.pdf differ -- cgit v1.2.3