8 files changed, 183 insertions, 61 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 3a38343721..05cbdfb3a7 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -114,7 +114,7 @@ void CPDF_LinkExtract::ExtractLinks() {
   if (!m_pTextPage->IsParsed())
     return;
 
-  m_strPageText = m_pTextPage->GetPageText(0, -1);
+  m_strPageText = m_pTextPage->GetAllPageText();
   if (m_strPageText.IsEmpty())
     return;
 
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 7ea2061c26..8ef5522bae 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -436,49 +436,27 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
   }
 }
 
-WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
-  if (!m_bIsParsed || nCount == 0)
+WideString CPDF_TextPage::GetPageText(int start, int count) const {
+  if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed ||
+      m_CharList.empty() || m_TextBuf.GetLength() == 0) {
     return L"";
-
-  if (start < 0)
-    start = 0;
-
-  if (nCount == -1) {
-    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
-    WideStringView wsTextBuf = m_TextBuf.AsStringView();
-    return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start));
   }
-  if (nCount <= 0 || m_CharList.empty())
-    return L"";
-  if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
-    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
-  if (nCount <= 0)
+
+  int text_start = TextIndexFromCharIndex(start);
+  if (text_start < 0)
     return L"";
-  CheckMarkedContentObject(start, nCount);
-  int startindex = 0;
-  PAGECHAR_INFO charinfo = m_CharList[start];
-  int startOffset = 0;
-  while (charinfo.m_Index == -1) {
-    startOffset++;
-    if (startOffset > nCount ||
-        start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
-      return L"";
-    }
-    charinfo = m_CharList[start + startOffset];
-  }
-  startindex = charinfo.m_Index;
-  charinfo = m_CharList[start + nCount - 1];
-  int nCountOffset = 0;
-  while (charinfo.m_Index == -1) {
-    nCountOffset++;
-    if (nCountOffset >= nCount)
-      return L"";
-    charinfo = m_CharList[start + nCount - nCountOffset - 1];
-  }
-  nCount = start + nCount - nCountOffset - startindex;
-  if (nCount <= 0)
+
+  count = std::min(count, CountChars() - start);
+
+  int last = start + count - 1;
+  int text_last = TextIndexFromCharIndex(last);
+  if (text_last < 0 || text_last < text_start)
     return L"";
-  return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount));
+
+  int text_count = text_last - text_start + 1;
+
+  return WideString(m_TextBuf.AsStringView().Mid(
+      static_cast<size_t>(text_start), static_cast<size_t>(text_count)));
 }
 
 int CPDF_TextPage::CountRects(int start, int nCount) {
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index e8ab82ac2c..cd30ace3ad 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -103,7 +103,13 @@ class CPDF_TextPage {
   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
   WideString GetTextByRect(const CFX_FloatRect& rect) const;
-  WideString GetPageText(int start = 0, int nCount = -1) const;
+
+  // Returns a string with the text from |m_TextBuf| that is covered by the
+  // input range. |start| and |count| are in terms of |m_CharIndex|, so the
+  // range will be converted into the appropriate indices.
+  WideString GetPageText(int start, int count) const;
+  WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
+
   int CountRects(int start, int nCount);
   void GetRect(int rectIndex,
                float& left,
diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp
index a874521326..9f243a0aee 100644
--- a/core/fpdftext/cpdf_textpagefind.cpp
+++ b/core/fpdftext/cpdf_textpagefind.cpp
@@ -41,7 +41,7 @@ CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
       m_resStart(0),
       m_resEnd(-1),
       m_IsFind(false) {
-  m_strText = m_pTextPage->GetPageText();
+  m_strText = m_pTextPage->GetAllPageText();
   int nCount = pTextPage->CountChars();
   if (nCount)
     m_CharIndex.push_back(0);
@@ -85,7 +85,7 @@ bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
   if (!m_pTextPage)
     return false;
   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
-    m_strText = m_pTextPage->GetPageText();
+    m_strText = m_pTextPage->GetAllPageText();
   WideString findwhatStr = findwhat;
   m_findWhat = findwhatStr;
   m_flags = flags;
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 5a2deb9a81..d9f7d572e9 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -179,25 +179,10 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
     return 1;
   }
 
-  // char_* values are for a data structure that includes non-printing unicode
-  // characters, where the text_* values are from a data structure that doesn't
-  // include these characters, so translation is needed.
-  int text_start = textpage->TextIndexFromCharIndex(char_start);
-  if (text_start == -1)
-    return 0;
-
-  int char_last = char_start + char_count - 1;
-  int text_last = textpage->TextIndexFromCharIndex(char_last);
-  if (text_last == -1)
-    return 0;
-
-  int text_count = text_last - text_start + 1;
-  if (text_count < 1)
-    return 0;
+  WideString str = textpage->GetPageText(char_start, char_count);
 
-  WideString str = textpage->GetPageText(text_start, text_count);
-  if (str.GetLength() > static_cast<size_t>(text_count))
-    str = str.Left(static_cast<size_t>(text_count));
+  if (str.GetLength() > static_cast<size_t>(char_count))
+    str = str.Left(static_cast<size_t>(char_count));
 
   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
   // the number of items to stay the same.
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 51216b9818..60654057b9 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -590,3 +590,38 @@ TEST_F(FPDFTextEmbeddertest, bug_782596) {
   FPDFText_ClosePage(textpage);
   UnloadPage(page);
 }
+
+TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
+  EXPECT_TRUE(OpenDocument("control_characters.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  EXPECT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+
+  // The output should not include the control characters.
+  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
+  unsigned short fixed_buffer[128];
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
+
+  // Attempt to get a chunk of text that starts after the control characters.
+  static const char expected_substring[] = "Goodbye, world!";
+  // Offset is the length of 'Hello, world!\r\n' (15 chars) plus the 2 control
+  // characters in the original stream.
+  static const int offset = 17;
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
+                                    sizeof(expected_substring)));
+
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
diff --git a/testing/resources/control_characters.in b/testing/resources/control_characters.in
new file mode 100644
index 0000000000..ca7827fe11
--- /dev/null
+++ b/testing/resources/control_characters.in
@@ -0,0 +1,54 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/control_characters.pdf b/testing/resources/control_characters.pdf
new file mode 100644
index 0000000000..535009733f
--- /dev/null
+++ b/testing/resources/control_characters.pdf
@@ -0,0 +1,64 @@
+%PDF-1.7
+%���
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000161 00000 n 
+0000000303 00000 n 
+0000000381 00000 n 
+0000000457 00000 n 
+trailer<< /Root 1 0 R /Size 7 >>
+startxref
+582
+%%EOF