summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--core/fpdftext/cpdf_linkextract.cpp2
-rw-r--r--core/fpdftext/cpdf_textpage.cpp56
-rw-r--r--core/fpdftext/cpdf_textpage.h8
-rw-r--r--core/fpdftext/cpdf_textpagefind.cpp4
-rw-r--r--fpdfsdk/fpdftext.cpp21
-rw-r--r--fpdfsdk/fpdftext_embeddertest.cpp35
-rw-r--r--testing/resources/control_characters.in54
-rw-r--r--testing/resources/control_characters.pdf64
8 files changed, 183 insertions, 61 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 3a38343721..05cbdfb3a7 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -114,7 +114,7 @@ void CPDF_LinkExtract::ExtractLinks() {
if (!m_pTextPage->IsParsed())
return;
- m_strPageText = m_pTextPage->GetPageText(0, -1);
+ m_strPageText = m_pTextPage->GetAllPageText();
if (m_strPageText.IsEmpty())
return;
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 7ea2061c26..8ef5522bae 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -436,49 +436,27 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
}
}
-WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
- if (!m_bIsParsed || nCount == 0)
+WideString CPDF_TextPage::GetPageText(int start, int count) const {
+ if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed ||
+ m_CharList.empty() || m_TextBuf.GetLength() == 0) {
return L"";
-
- if (start < 0)
- start = 0;
-
- if (nCount == -1) {
- nCount = pdfium::CollectionSize<int>(m_CharList) - start;
- WideStringView wsTextBuf = m_TextBuf.AsStringView();
- return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start));
}
- if (nCount <= 0 || m_CharList.empty())
- return L"";
- if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
- nCount = pdfium::CollectionSize<int>(m_CharList) - start;
- if (nCount <= 0)
+
+ int text_start = TextIndexFromCharIndex(start);
+ if (text_start < 0)
return L"";
- CheckMarkedContentObject(start, nCount);
- int startindex = 0;
- PAGECHAR_INFO charinfo = m_CharList[start];
- int startOffset = 0;
- while (charinfo.m_Index == -1) {
- startOffset++;
- if (startOffset > nCount ||
- start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
- return L"";
- }
- charinfo = m_CharList[start + startOffset];
- }
- startindex = charinfo.m_Index;
- charinfo = m_CharList[start + nCount - 1];
- int nCountOffset = 0;
- while (charinfo.m_Index == -1) {
- nCountOffset++;
- if (nCountOffset >= nCount)
- return L"";
- charinfo = m_CharList[start + nCount - nCountOffset - 1];
- }
- nCount = start + nCount - nCountOffset - startindex;
- if (nCount <= 0)
+
+ count = std::min(count, CountChars() - start);
+
+ int last = start + count - 1;
+ int text_last = TextIndexFromCharIndex(last);
+ if (text_last < 0 || text_last < text_start)
return L"";
- return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount));
+
+ int text_count = text_last - text_start + 1;
+
+ return WideString(m_TextBuf.AsStringView().Mid(
+ static_cast<size_t>(text_start), static_cast<size_t>(text_count)));
}
int CPDF_TextPage::CountRects(int start, int nCount) {
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index e8ab82ac2c..cd30ace3ad 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -103,7 +103,13 @@ class CPDF_TextPage {
std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
WideString GetTextByRect(const CFX_FloatRect& rect) const;
- WideString GetPageText(int start = 0, int nCount = -1) const;
+
+ // Returns string with the text from |m_TextBuf| that are covered by the input
+ // range. |start| and |count| are in terms of the m_CharIndex, so the range
+ // will be converted into appropriate indices.
+ WideString GetPageText(int start, int count) const;
+ WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
+
int CountRects(int start, int nCount);
void GetRect(int rectIndex,
float& left,
diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp
index a874521326..9f243a0aee 100644
--- a/core/fpdftext/cpdf_textpagefind.cpp
+++ b/core/fpdftext/cpdf_textpagefind.cpp
@@ -41,7 +41,7 @@ CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
m_resStart(0),
m_resEnd(-1),
m_IsFind(false) {
- m_strText = m_pTextPage->GetPageText();
+ m_strText = m_pTextPage->GetAllPageText();
int nCount = pTextPage->CountChars();
if (nCount)
m_CharIndex.push_back(0);
@@ -85,7 +85,7 @@ bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
if (!m_pTextPage)
return false;
if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
- m_strText = m_pTextPage->GetPageText();
+ m_strText = m_pTextPage->GetAllPageText();
WideString findwhatStr = findwhat;
m_findWhat = findwhatStr;
m_flags = flags;
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 5a2deb9a81..d9f7d572e9 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -179,25 +179,10 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page,
return 1;
}
- // char_* values are for a data structure that includes non-printing unicode
- // characters, where the text_* values are from a data structure that doesn't
- // include these characters, so translation is needed.
- int text_start = textpage->TextIndexFromCharIndex(char_start);
- if (text_start == -1)
- return 0;
-
- int char_last = char_start + char_count - 1;
- int text_last = textpage->TextIndexFromCharIndex(char_last);
- if (text_last == -1)
- return 0;
-
- int text_count = text_last - text_start + 1;
- if (text_count < 1)
- return 0;
+ WideString str = textpage->GetPageText(char_start, char_count);
- WideString str = textpage->GetPageText(text_start, text_count);
- if (str.GetLength() > static_cast<size_t>(text_count))
- str = str.Left(static_cast<size_t>(text_count));
+ if (str.GetLength() > static_cast<size_t>(char_count))
+ str = str.Left(static_cast<size_t>(char_count));
// UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
// the number of items to stay the same.
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 51216b9818..60654057b9 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -590,3 +590,38 @@ TEST_F(FPDFTextEmbeddertest, bug_782596) {
FPDFText_ClosePage(textpage);
UnloadPage(page);
}
+
+TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
+ EXPECT_TRUE(OpenDocument("control_characters.pdf"));
+ FPDF_PAGE page = LoadPage(0);
+ EXPECT_TRUE(page);
+
+ FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+ EXPECT_TRUE(textpage);
+
+ // Should not include the control characters in the output
+ static const char expected[] = "Hello, world!\r\nGoodbye, world!";
+ unsigned short fixed_buffer[128];
+ memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+ int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
+
+ ASSERT_GE(num_chars, 0);
+ EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
+ EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
+
+ // Attempting to get a chunk of text after the control characters
+ static const char expected_substring[] = "Goodbye, world!";
+ // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
+ // original stream
+ static const int offset = 17;
+ memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+ num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);
+
+ ASSERT_GE(num_chars, 0);
+ EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
+ EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
+ sizeof(expected_substring)));
+
+ FPDFText_ClosePage(textpage);
+ UnloadPage(page);
+}
diff --git a/testing/resources/control_characters.in b/testing/resources/control_characters.in
new file mode 100644
index 0000000000..ca7827fe11
--- /dev/null
+++ b/testing/resources/control_characters.in
@@ -0,0 +1,54 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /MediaBox [ 0 0 200 200 ]
+ /Count 1
+ /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ /F2 5 0 R
+ >>
+ >>
+ /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/control_characters.pdf b/testing/resources/control_characters.pdf
new file mode 100644
index 0000000000..535009733f
--- /dev/null
+++ b/testing/resources/control_characters.pdf
@@ -0,0 +1,64 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /MediaBox [ 0 0 200 200 ]
+ /Count 1
+ /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ /F2 5 0 R
+ >>
+ >>
+ /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000161 00000 n
+0000000303 00000 n
+0000000381 00000 n
+0000000457 00000 n
+trailer<< /Root 1 0 R /Size 7 >>
+startxref
+582
+%%EOF