From 4d92af5ace491a2e87a6c73e5afa9ed53ac86bd0 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Tue, 28 Aug 2018 20:22:32 +0000 Subject: Handle non-printing characters at beginning of extraction region Currently, if a text extraction region begins on a non-printing character, then "" will be returned. This is incorrect behaviour; instead, the call should scan ahead until a printing character is found and start extracting from there. Also proactively adds a similar check and scan for the end of the extraction region. BUG=pdfium:1139 Change-Id: Ia2001ac89740f3d31d2bb69e8000773f8b01091b Reviewed-on: https://pdfium-review.googlesource.com/41532 Reviewed-by: Henrique Nakashima Commit-Queue: Ryan Harrison --- core/fpdftext/cpdf_textpage.cpp | 29 +++++++++++++--- fpdfsdk/fpdf_text_embeddertest.cpp | 23 +++++++++++++ testing/resources/bug_1139.in | 55 ++++++++++++++++++++++++++++++ testing/resources/bug_1139.pdf | 68 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 testing/resources/bug_1139.in create mode 100644 testing/resources/bug_1139.pdf diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 5465297263..8c110b6814 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -510,15 +510,36 @@ WideString CPDF_TextPage::GetPageText(int start, int count) const { return L""; } + const int count_chars = CountChars(); int text_start = TextIndexFromCharIndex(start); - if (text_start < 0) - return L""; - count = std::min(count, CountChars() - start); + // If the character at |start| is a non-printing character, then + // TextIndexFromCharIndex will return -1, so scan ahead to the first printing + // character. 
+ while (text_start < 0) { + if (start >= count_chars) + return L""; + start++; + text_start = TextIndexFromCharIndex(start); + } + + count = std::min(count, count_chars - start); int last = start + count - 1; int text_last = TextIndexFromCharIndex(last); - if (text_last < 0 || text_last < text_start) + + // If the character at |last| is a non-printing character, then + // TextIndexFromCharIndex will return -1, so scan back to the last printing + // character. + while (text_last < 0) { + if (last < text_start) + return L""; + + last--; + text_last = TextIndexFromCharIndex(last); + } + + if (text_last < text_start) return L""; int text_count = text_last - text_start + 1; diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp index 83b43d95d0..1f29589653 100644 --- a/fpdfsdk/fpdf_text_embeddertest.cpp +++ b/fpdfsdk/fpdf_text_embeddertest.cpp @@ -944,3 +944,26 @@ TEST_F(FPDFTextEmbeddertest, CroppedText) { UnloadPage(page); } } + +TEST_F(FPDFTextEmbeddertest, Bug_1139) { + ASSERT_TRUE(OpenDocument("bug_1139.pdf")); + FPDF_PAGE page = LoadPage(0); + ASSERT_TRUE(page); + + FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page); + ASSERT_TRUE(text_page); + + // -1 for CountChars not including the \0, but +1 for the extra control + // character. + EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page)); + + // There is an extra control character at the beginning of the string, but it + // should not appear in the output nor prevent extracting the text. 
+ unsigned short buffer[128]; + int num_chars = FPDFText_GetText(text_page, 0, 128, buffer); + ASSERT_EQ(kHelloGoodbyeTextSize, num_chars); + EXPECT_TRUE( + check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize)); + FPDFText_ClosePage(text_page); + UnloadPage(page); +} diff --git a/testing/resources/bug_1139.in b/testing/resources/bug_1139.in new file mode 100644 index 0000000000..d5603f08aa --- /dev/null +++ b/testing/resources/bug_1139.in @@ -0,0 +1,55 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +endobj +{{object 2 0}} << + /Type /Pages + /MediaBox [ 0 0 200 200 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +{{object 3 0}} << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +{{object 4 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +{{object 5 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +{{object 6 0}} << +>> +stream +stream +BT +20 50 Td +/F1 12 Tf +(\003Hello, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) Tj +ET +endstream +endobj +{{xref}} +{{trailer}} +{{startxref}} +%%EOF diff --git a/testing/resources/bug_1139.pdf b/testing/resources/bug_1139.pdf new file mode 100644 index 0000000000..d89695e82b --- /dev/null +++ b/testing/resources/bug_1139.pdf @@ -0,0 +1,68 @@ +%PDF-1.7 +% ò¤ô +1 0 obj << + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj << + /Type /Pages + /MediaBox [ 0 0 200 200 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +3 0 obj << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +4 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +5 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +6 0 obj << +>> +stream +stream +BT +20 50 Td +/F1 12 Tf +(\003Hello, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) 
Tj +ET +endstream +endobj +xref +0 7 +0000000000 65535 f +0000000015 00000 n +0000000068 00000 n +0000000161 00000 n +0000000303 00000 n +0000000381 00000 n +0000000457 00000 n +trailer << + /Root 1 0 R + /Size 7 +>> +startxref +589 +%%EOF -- cgit v1.2.3