Add test that non-ASCII characters do not foul hyphen processing

In this doc there is a hyphen followed by a fi ligature. Older versions of PDFium were not inserting the appropriate soft hyphen marker (0x0002) at this point, since they were only checking for ASCII characters. That bug is fixed in the preceding CLs, so this test is being added to avoid regressions. BUG=pdfium:1029 Change-Id: I9c9906de5a13ade1400a589fb18967938150516d Reviewed-on: https://pdfium-review.googlesource.com/28470 Commit-Queue: Ryan Harrison <rharrison@chromium.org> Reviewed-by: Henrique Nakashima <hnakashima@chromium.org> Reviewed-by: dsinclair <dsinclair@chromium.org>
author: Ryan Harrison <rharrison@chromium.org> 2018-03-12 16:12:44 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2018-03-12 16:12:44 +0000
commit: 64c664387d71ed01d18ab2b23327bbdd9757bd46 (patch)
tree: 6bd8c625c78108f269b27fb9ccfa92c10239b937
parent: 735eda96cf24349d10b160c8f5bd363b73d8aba1 (diff)
download: pdfium-64c664387d71ed01d18ab2b23327bbdd9757bd46.tar.xz
2 files changed, 42 insertions, 0 deletions
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 86d32a711a..6c30fc6b5b 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -641,3 +641,45 @@ TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
   FPDFText_ClosePage(textpage);
   UnloadPage(page);
 }
+
+// Tests that soft hyphen markers (0x0002) replace hard hyphens when the
+// word being hyphenated contains non-ASCII characters.
+TEST_F(FPDFTextEmbeddertest, bug_1029) {
+  ASSERT_TRUE(OpenDocument("bug_1029.pdf"));  // Fatal: nothing to check.
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);  // Fatal: every later step needs the page.
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);  // Non-fatal so the cleanup below still runs.
+
+  constexpr int page_range_offset = 171;
+  constexpr int page_range_length = 56;
+
+  // This text is:
+  // 'METADATA table. When the split has committed, it noti' followed
+  // by a 'soft hyphen' (0x0002) and then 'fi'.
+  //
+  // The original text has a fi ligature, but that is broken up into
+  // two characters when the PDF is processed.
+  constexpr unsigned int expected[] = {
+      0x004d, 0x0045, 0x0054, 0x0041, 0x0044, 0x0041, 0x0054, 0x0041,
+      0x0020, 0x0074, 0x0061, 0x0062, 0x006c, 0x0065, 0x002e, 0x0020,
+      0x0057, 0x0068, 0x0065, 0x006e, 0x0020, 0x0074, 0x0068, 0x0065,
+      0x0020, 0x0073, 0x0070, 0x006c, 0x0069, 0x0074, 0x0020, 0x0068,
+      0x0061, 0x0073, 0x0020, 0x0063, 0x006f, 0x006d, 0x006d, 0x0069,
+      0x0074, 0x0074, 0x0065, 0x0064, 0x002c, 0x0020, 0x0069, 0x0074,
+      0x0020, 0x006e, 0x006f, 0x0074, 0x0069, 0x0002, 0x0066, 0x0069};
+  static_assert(page_range_length == FX_ArraySize(expected),
+                "Expected should be the same size as the range being "
+                "extracted from page.");
+  EXPECT_LT(page_range_offset + page_range_length,
+            FPDFText_CountChars(textpage));
+
+  for (int i = 0; i < page_range_length; ++i) {
+    EXPECT_EQ(expected[i],
+              FPDFText_GetUnicode(textpage, page_range_offset + i));
+  }
+
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
diff --git a/testing/resources/bug_1029.pdf b/testing/resources/bug_1029.pdf
new file mode 100644
index 0000000000..c03f03de5b
--- /dev/null
+++ b/testing/resources/bug_1029.pdf
author	Ryan Harrison <rharrison@chromium.org>	2018-03-12 16:12:44 +0000
committer	Chromium commit bot <commit-bot@chromium.org>	2018-03-12 16:12:44 +0000
commit	64c664387d71ed01d18ab2b23327bbdd9757bd46 (patch)
tree	6bd8c625c78108f269b27fb9ccfa92c10239b937
parent	735eda96cf24349d10b160c8f5bd363b73d8aba1 (diff)
download	pdfium-64c664387d71ed01d18ab2b23327bbdd9757bd46.tar.xz