Rewrite lower level details of extracting text from page

The current implementation of text extraction was difficult to understand, duplicated logic that existed in other methods, and wasn't clear about the units the inputs were in. It also didn't handle control characters correctly. The new implementation leans on the methods for converting indices between the text buffer index and character list index spaces to avoid duplication of code. It also makes it clear to the reader that inputs are in the character list index space. Finally, it fixes issues being seen in Chrome with respect to ranges being slightly off. This CL also adds a test for extracting text that has control characters. BUG=pdfium:942,chromium:654578 Change-Id: Id9d1f360c2d7492c7b5a48d6c9ae29f530892742 Reviewed-on: https://pdfium-review.googlesource.com/20014 Commit-Queue: Ryan Harrison <rharrison@chromium.org> Reviewed-by: dsinclair <dsinclair@chromium.org> Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
author: Ryan Harrison <rharrison@chromium.org> 2017-11-30 21:02:41 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2017-11-30 21:02:41 +0000
commit: 8b357e7504ea804293983453540ae91c9fc57922 (patch)
tree: 7b8f611eac73034f9149b014fb547d6886e0d5b7 /fpdfsdk
parent: 0ae8e03cc2d310ba0ba19b878ea448f17a577cdb (diff)
download: pdfium-8b357e7504ea804293983453540ae91c9fc57922.tar.xz
2 files changed, 38 insertions, 18 deletions
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 5a2deb9a81..d9f7d572e9 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -179,25 +179,10 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
     return 1;
   }
 
-  // char_* values are for a data structure that includes non-printing unicode
-  // characters, where the text_* values are from a data structure that doesn't
-  // include these characters, so translation is needed.
-  int text_start = textpage->TextIndexFromCharIndex(char_start);
-  if (text_start == -1)
-    return 0;
-
-  int char_last = char_start + char_count - 1;
-  int text_last = textpage->TextIndexFromCharIndex(char_last);
-  if (text_last == -1)
-    return 0;
-
-  int text_count = text_last - text_start + 1;
-  if (text_count < 1)
-    return 0;
+  WideString str = textpage->GetPageText(char_start, char_count);
 
-  WideString str = textpage->GetPageText(text_start, text_count);
-  if (str.GetLength() > static_cast<size_t>(text_count))
-    str = str.Left(static_cast<size_t>(text_count));
+  if (str.GetLength() > static_cast<size_t>(char_count))
+    str = str.Left(static_cast<size_t>(char_count));
 
   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
   // the number of items to stay the same.
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 51216b9818..60654057b9 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -590,3 +590,38 @@ TEST_F(FPDFTextEmbeddertest, bug_782596) {
   FPDFText_ClosePage(textpage);
   UnloadPage(page);
 }
+
+TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
+  EXPECT_TRUE(OpenDocument("control_characters.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  EXPECT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+
+  // Should not include the control characters in the output
+  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
+  unsigned short fixed_buffer[128];
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
+
+  // Attempting to get a chunk of text after the control characters
+  static const char expected_substring[] = "Goodbye, world!";
+  // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
+  // original stream
+  static const int offset = 17;
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
+                                    sizeof(expected_substring)));
+
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
author	Ryan Harrison <rharrison@chromium.org>	2017-11-30 21:02:41 +0000
committer	Chromium commit bot <commit-bot@chromium.org>	2017-11-30 21:02:41 +0000
commit	8b357e7504ea804293983453540ae91c9fc57922 (patch)
tree	7b8f611eac73034f9149b014fb547d6886e0d5b7 /fpdfsdk
parent	0ae8e03cc2d310ba0ba19b878ea448f17a577cdb (diff)
download	pdfium-8b357e7504ea804293983453540ae91c9fc57922.tar.xz