From 0f1ca17e45bee4f1cb04d33437eba010b1afcd72 Mon Sep 17 00:00:00 2001
From: Ryan Harrison <rharrison@chromium.org>
Date: Tue, 28 Nov 2017 18:19:57 +0000
Subject: Convert from character stream to visible text indices in GetText

Most of the FPDFText API methods operate on indices in terms of the
underlying stream of characters. This stream includes non-printing
control characters, which are not part of the visible text. The
majority of files do not appear to have these hidden characters, so
the character indices and visible text indices correspond 1:1. When
they are present, conversion needs to occur to make sure that GetText
doesn't attempt to retrieve text for a span that is out of range.

BUG=chromium:788103,chromium:788220

Change-Id: I4c9fa403ea65754ba94e3f15ded49fe0641e9db5
Reviewed-on: https://pdfium-review.googlesource.com/19550
Reviewed-by: dsinclair <dsinclair@chromium.org>
Commit-Queue: dsinclair <dsinclair@chromium.org>
---
 fpdfsdk/fpdftext.cpp | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 9d7d56311f..022b7caa8d 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -160,27 +160,51 @@ FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
                 static_cast<float>(yTolerance)));
 }
 
-FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page,
-                                               int start,
-                                               int count,
+FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
+                                               int char_start,
+                                               int char_count,
                                                unsigned short* result) {
-  if (start < 0 || count < 0 || !result || !text_page)
+  if (char_start < 0 || char_count < 0 || !result || !page)
     return 0;
 
-  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
-  if (start >= textpage->CountChars())
+  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
+  if (char_start >= textpage->CountChars())
+    return 0;
+
+  char_count = std::min(char_count, textpage->CountChars() - char_start);
+  if (char_count == 0) {
+    // Writing out ""
+    *result = 0;
+    return 1;
+  }
+
+  int char_last = char_start + char_count - 1;
+
+  // char_* values are for a data structure that includes non-printing unicode
+  // characters, where the text_* values are from a data structure that doesn't
+  // include these characters, so translation is needed.
+  int text_start = textpage->TextIndexFromCharIndex(char_start);
+  if (text_start == -1)
+    return 0;
+
+  int text_last = textpage->TextIndexFromCharIndex(char_last);
+  if (text_last == -1)
+    return 0;
+
+  if (text_start > text_last)
     return 0;
+  int text_count = text_last - text_start + 1;
 
-  WideString str = textpage->GetPageText(start, count);
-  if (str.GetLength() > static_cast<size_t>(count))
-    str = str.Left(static_cast<size_t>(count));
+  WideString str = textpage->GetPageText(text_start, text_count);
+  if (str.GetLength() > static_cast<size_t>(text_count))
+    str = str.Left(static_cast<size_t>(text_count));
 
   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
   // the number of items to stay the same.
   ByteString byte_str = str.UTF16LE_Encode();
   ASSERT((byte_str.GetLength()) / kBytesPerCharacter <=
-         static_cast<size_t>(count + 1));  // +1 to account for the string
-                                           // terminator
+         static_cast<size_t>(char_count + 1));  // +1 to account for the string
+                                                // terminator
   memcpy(result, byte_str.GetBuffer(byte_str.GetLength()),
          byte_str.GetLength());
   return (byte_str.GetLength() / kBytesPerCharacter);
-- 
cgit v1.2.3