summaryrefslogtreecommitdiff
path: root/core/fpdftext/cpdf_textpage.cpp
diff options
context:
space:
mode:
author: Ryan Harrison <rharrison@chromium.org> 2017-11-30 21:02:41 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2017-11-30 21:02:41 +0000
commit: 8b357e7504ea804293983453540ae91c9fc57922 (patch)
tree: 7b8f611eac73034f9149b014fb547d6886e0d5b7 /core/fpdftext/cpdf_textpage.cpp
parent: 0ae8e03cc2d310ba0ba19b878ea448f17a577cdb (diff)
download: pdfium-8b357e7504ea804293983453540ae91c9fc57922.tar.xz
Rewrite lower level details of extracting text from page
The current implementation of text extraction was difficult to understand, duplicated logic that existed in other methods, and wasn't clear about the units the inputs were in. It also didn't handle control characters correctly. The new implementation leans on the methods for converting indices between the text buffer index and character list index spaces to avoid duplication of code. It also makes it clear to the reader that inputs are in the character list index space. Finally, it fixes issues being seen in Chrome with respect to ranges being slightly off. This CL also adds a test for extracting text that has control characters. BUG=pdfium:942,chromium:654578 Change-Id: Id9d1f360c2d7492c7b5a48d6c9ae29f530892742 Reviewed-on: https://pdfium-review.googlesource.com/20014 Commit-Queue: Ryan Harrison <rharrison@chromium.org> Reviewed-by: dsinclair <dsinclair@chromium.org> Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Diffstat (limited to 'core/fpdftext/cpdf_textpage.cpp')
-rw-r--r--core/fpdftext/cpdf_textpage.cpp56
1 files changed, 17 insertions, 39 deletions
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 7ea2061c26..8ef5522bae 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -436,49 +436,27 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
}
}
-WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
- if (!m_bIsParsed || nCount == 0)
+WideString CPDF_TextPage::GetPageText(int start, int count) const {
+ if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed ||
+ m_CharList.empty() || m_TextBuf.GetLength() == 0) {
return L"";
-
- if (start < 0)
- start = 0;
-
- if (nCount == -1) {
- nCount = pdfium::CollectionSize<int>(m_CharList) - start;
- WideStringView wsTextBuf = m_TextBuf.AsStringView();
- return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start));
}
- if (nCount <= 0 || m_CharList.empty())
- return L"";
- if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
- nCount = pdfium::CollectionSize<int>(m_CharList) - start;
- if (nCount <= 0)
+
+ int text_start = TextIndexFromCharIndex(start);
+ if (text_start < 0)
return L"";
- CheckMarkedContentObject(start, nCount);
- int startindex = 0;
- PAGECHAR_INFO charinfo = m_CharList[start];
- int startOffset = 0;
- while (charinfo.m_Index == -1) {
- startOffset++;
- if (startOffset > nCount ||
- start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
- return L"";
- }
- charinfo = m_CharList[start + startOffset];
- }
- startindex = charinfo.m_Index;
- charinfo = m_CharList[start + nCount - 1];
- int nCountOffset = 0;
- while (charinfo.m_Index == -1) {
- nCountOffset++;
- if (nCountOffset >= nCount)
- return L"";
- charinfo = m_CharList[start + nCount - nCountOffset - 1];
- }
- nCount = start + nCount - nCountOffset - startindex;
- if (nCount <= 0)
+
+ count = std::min(count, CountChars() - start);
+
+ int last = start + count - 1;
+ int text_last = TextIndexFromCharIndex(last);
+ if (text_last < 0 || text_last < text_start)
return L"";
- return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount));
+
+ int text_count = text_last - text_start + 1;
+
+ return WideString(m_TextBuf.AsStringView().Mid(
+ static_cast<size_t>(text_start), static_cast<size_t>(text_count)));
}
int CPDF_TextPage::CountRects(int start, int nCount) {