From 4d92af5ace491a2e87a6c73e5afa9ed53ac86bd0 Mon Sep 17 00:00:00 2001
From: Ryan Harrison
Date: Tue, 28 Aug 2018 20:22:32 +0000
Subject: Handle non-printing characters at beginning of extraction region

Currently if a text extraction region begins on a non-printing character
then "" will be returned. This is the incorrect behaviour, instead the
call should scan ahead until a printing character is found and start
extracting from there.

Also proactively adds a similar check and scan for the end of the
extraction region.

BUG=pdfium:1139

Change-Id: Ia2001ac89740f3d31d2bb69e8000773f8b01091b
Reviewed-on: https://pdfium-review.googlesource.com/41532
Reviewed-by: Henrique Nakashima
Commit-Queue: Ryan Harrison
---
 fpdfsdk/fpdf_text_embeddertest.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'fpdfsdk')

diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 83b43d95d0..1f29589653 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -944,3 +944,26 @@ TEST_F(FPDFTextEmbeddertest, CroppedText) {
     UnloadPage(page);
   }
 }
+
+TEST_F(FPDFTextEmbeddertest, Bug_1139) {
+  ASSERT_TRUE(OpenDocument("bug_1139.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+  ASSERT_TRUE(text_page);
+
+  // -1 for CountChars not including the \0, but +1 for the extra control
+  // character.
+  EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page));
+
+  // There is an extra control character at the beginning of the string, but it
+  // should not appear in the output nor prevent extracting the text.
+  unsigned short buffer[128];
+  int num_chars = FPDFText_GetText(text_page, 0, 128, buffer);
+  ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
+  EXPECT_TRUE(
+      check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize));
+  FPDFText_ClosePage(text_page);
+  UnloadPage(page);
+}
--
cgit v1.2.3