From 9d0d7c8f35b8f3063d33a41186ea677d01748e45 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Mon, 20 Nov 2017 19:32:50 +0000 Subject: Add regression tests for issues with correctly removing hyphens There was a regression due to a refactor, where the public API was no longer removing soft hyphens for line broken words. This was causing issues with find and copy/paste operations that depend on selecting a region of text. This change is covered by FPDFTextEmbeddertest.GetTextWithHyphen. FPDFTextEmbeddertest.bug_782596 is a regression test for a bug that was introduced by the original fix. It only fails when running the test under ASAN. BUG=pdfium:935 Change-Id: I26096583c35f9246a3662e702f89b742f1146780 Reviewed-on: https://pdfium-review.googlesource.com/18610 Reviewed-by: Lei Zhang Reviewed-by: dsinclair Commit-Queue: Ryan Harrison --- fpdfsdk/fpdftext_embeddertest.cpp | 60 ++++++++++++++++++++++++++++++++++++++ testing/resources/bug_781804.pdf | Bin 0 -> 15690 bytes testing/resources/bug_782596.pdf | Bin 0 -> 93 bytes 3 files changed, 60 insertions(+) create mode 100644 testing/resources/bug_781804.pdf create mode 100644 testing/resources/bug_782596.pdf diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 1536c8679b..a4431097f1 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -525,3 +525,63 @@ TEST_F(FPDFTextEmbeddertest, Bug_921) { FPDFText_ClosePage(textpage); UnloadPage(page); } + +TEST_F(FPDFTextEmbeddertest, GetTextWithHyphen) { + EXPECT_TRUE(OpenDocument("bug_781804.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + + // Check that soft hyphens are not included + // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was in + // the original text. This is a weird thing that Adobe does, which we + // replicate. 
+ constexpr unsigned short soft_expected[] = { + 0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe, + 0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000}; + { + constexpr int expected_count = FX_ArraySize(soft_expected); + unsigned short buffer[expected_count]; + memset(buffer, 0, sizeof(buffer)); + + EXPECT_EQ(expected_count, + FPDFText_GetText(textpage, 0, expected_count, buffer)); + for (int i = 0; i < expected_count; i++) + EXPECT_EQ(soft_expected[i], buffer[i]); + } + + // Check that hard hyphens are included + { + // There isn't the \0 in the actual doc, but there is a \r\n, so need to + // add 1 to get aligned. + constexpr size_t offset = FX_ArraySize(soft_expected) + 1; + // Expecting 'User-\r\ngenerated', the - is a unicode character, so cannot + // store in a char[]. + constexpr unsigned short hard_expected[] = { + 0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d, 0x000a, 0x0067, 0x0065, + 0x006e, 0x0065, 0x0072, 0x0061, 0x0074, 0x0065, 0x0064, 0x0000}; + constexpr int expected_count = FX_ArraySize(hard_expected); + unsigned short buffer[expected_count]; + + EXPECT_EQ(expected_count, + FPDFText_GetText(textpage, offset, expected_count, buffer)); + for (int i = 0; i < expected_count; i++) + EXPECT_EQ(hard_expected[i], buffer[i]); + } + + FPDFText_ClosePage(textpage); + UnloadPage(page); +} + +TEST_F(FPDFTextEmbeddertest, bug_782596) { + // If there is a regression in this test, it will only fail under ASAN + EXPECT_TRUE(OpenDocument("bug_782596.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + FPDFText_ClosePage(textpage); + UnloadPage(page); +} diff --git a/testing/resources/bug_781804.pdf b/testing/resources/bug_781804.pdf new file mode 100644 index 0000000000..29304f157e Binary files /dev/null and b/testing/resources/bug_781804.pdf differ diff --git a/testing/resources/bug_782596.pdf b/testing/resources/bug_782596.pdf new file mode 100644 index 
0000000000..e28d897ada Binary files /dev/null and b/testing/resources/bug_782596.pdf differ -- cgit v1.2.3