Convert from character stream to visible text indices in GetText

Most of the FPDFText API methods operate on indices in terms of the underlying stream of characters. This stream includes non-printing control characters, which are not part of the visible text. The majority of files do not appear to have these hidden characters, so for them there is a 1:1 correspondence between character-stream indices and visible-text indices. When hidden characters are present, a conversion needs to occur to make sure that GetText doesn't attempt to retrieve text for a span that is out of range.

BUG=chromium:788103,chromium:788220
Change-Id: I4c9fa403ea65754ba94e3f15ded49fe0641e9db5
Reviewed-on: https://pdfium-review.googlesource.com/19550
Reviewed-by: dsinclair <dsinclair@chromium.org>
Commit-Queue: dsinclair <dsinclair@chromium.org>
author: Ryan Harrison <rharrison@chromium.org> 2017-11-28 18:19:57 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2017-11-28 18:19:57 +0000
commit: 0f1ca17e45bee4f1cb04d33437eba010b1afcd72 (patch)
tree: 6c8d990703abc659d252e3a33a9438449ee3be7d /fpdfsdk/fpdftext.cpp
parent: 99acb1c816dc88dbda5d03ef28dbdaa928f7e22a (diff)
download: pdfium-0f1ca17e45bee4f1cb04d33437eba010b1afcd72.tar.xz
1 files changed, 35 insertions, 11 deletions
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 9d7d56311f..022b7caa8d 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -160,27 +160,51 @@ FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
                 static_cast<float>(yTolerance)));
 }
 
-FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page,
-                                               int start,
-                                               int count,
+FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
+                                               int char_start,
+                                               int char_count,
                                                unsigned short* result) {
-  if (start < 0 || count < 0 || !result || !text_page)
+  if (char_start < 0 || char_count < 0 || !result || !page)
     return 0;
 
-  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
-  if (start >= textpage->CountChars())
+  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
+  if (char_start >= textpage->CountChars())
+    return 0;
+
+  char_count = std::min(char_count, textpage->CountChars() - char_start);
+  if (char_count == 0) {
+    // Writing out ""
+    *result = 0;
+    return 1;
+  }
+
+  int char_last = char_start + char_count - 1;
+
+  // char_* values index a data structure that includes non-printing Unicode
+  // characters, whereas text_* values index a data structure that doesn't
+  // include these characters, so translation is needed.
+  int text_start = textpage->TextIndexFromCharIndex(char_start);
+  if (text_start == -1)
+    return 0;
+
+  int text_last = textpage->TextIndexFromCharIndex(char_last);
+  if (text_last == -1)
+    return 0;
+
+  if (text_start > text_last)
     return 0;
+  int text_count = text_last - text_start + 1;
 
-  WideString str = textpage->GetPageText(start, count);
-  if (str.GetLength() > static_cast<size_t>(count))
-    str = str.Left(static_cast<size_t>(count));
+  WideString str = textpage->GetPageText(text_start, text_count);
+  if (str.GetLength() > static_cast<size_t>(text_count))
+    str = str.Left(static_cast<size_t>(text_count));
 
   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
   // the number of items to stay the same.
   ByteString byte_str = str.UTF16LE_Encode();
   ASSERT((byte_str.GetLength()) / kBytesPerCharacter <=
-         static_cast<size_t>(count + 1));  // +1 to account for the string
-                                           // terminator
+         static_cast<size_t>(char_count + 1));  // +1 to account for the string
+                                                // terminator
   memcpy(result, byte_str.GetBuffer(byte_str.GetLength()),
          byte_str.GetLength());
   return (byte_str.GetLength() / kBytesPerCharacter);
author	Ryan Harrison <rharrison@chromium.org>	2017-11-28 18:19:57 +0000
committer	Chromium commit bot <commit-bot@chromium.org>	2017-11-28 18:19:57 +0000
commit	0f1ca17e45bee4f1cb04d33437eba010b1afcd72 (patch)
tree	6c8d990703abc659d252e3a33a9438449ee3be7d /fpdfsdk/fpdftext.cpp
parent	99acb1c816dc88dbda5d03ef28dbdaa928f7e22a (diff)
download	pdfium-0f1ca17e45bee4f1cb04d33437eba010b1afcd72.tar.xz