diff options
author | Ryan Harrison <rharrison@chromium.org> | 2017-11-30 21:02:41 +0000 |
---|---|---|
committer | Chromium commit bot <commit-bot@chromium.org> | 2017-11-30 21:02:41 +0000 |
commit | 8b357e7504ea804293983453540ae91c9fc57922 (patch) | |
tree | 7b8f611eac73034f9149b014fb547d6886e0d5b7 /core/fpdftext | |
parent | 0ae8e03cc2d310ba0ba19b878ea448f17a577cdb (diff) | |
download | pdfium-8b357e7504ea804293983453540ae91c9fc57922.tar.xz |
Rewrite lower level details of extracting text from page
The current implementation of text extraction was difficult to
understand, duplicated logic that existed in other methods, and wasn't
clear about the units the inputs were in. It also didn't handle
control characters correctly.
The new implementation leans on the methods for converting indices
between the text buffer index and character list index spaces to avoid
duplication of code. It also makes it clear to the reader that inputs
are in the character list index space. Finally, it fixes issues being
seen in Chrome with respect to ranges being slightly off.
This CL also adds a test for extracting text that has control
characters.
BUG=pdfium:942,chromium:654578
Change-Id: Id9d1f360c2d7492c7b5a48d6c9ae29f530892742
Reviewed-on: https://pdfium-review.googlesource.com/20014
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: dsinclair <dsinclair@chromium.org>
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Diffstat (limited to 'core/fpdftext')
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 2 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpage.cpp | 56 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpage.h | 8 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpagefind.cpp | 4 |
4 files changed, 27 insertions, 43 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 3a38343721..05cbdfb3a7 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -114,7 +114,7 @@ void CPDF_LinkExtract::ExtractLinks() { if (!m_pTextPage->IsParsed()) return; - m_strPageText = m_pTextPage->GetPageText(0, -1); + m_strPageText = m_pTextPage->GetAllPageText(); if (m_strPageText.IsEmpty()) return; diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 7ea2061c26..8ef5522bae 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -436,49 +436,27 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start, } } -WideString CPDF_TextPage::GetPageText(int start, int nCount) const { - if (!m_bIsParsed || nCount == 0) +WideString CPDF_TextPage::GetPageText(int start, int count) const { + if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed || + m_CharList.empty() || m_TextBuf.GetLength() == 0) { return L""; - - if (start < 0) - start = 0; - - if (nCount == -1) { - nCount = pdfium::CollectionSize<int>(m_CharList) - start; - WideStringView wsTextBuf = m_TextBuf.AsStringView(); - return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start)); } - if (nCount <= 0 || m_CharList.empty()) - return L""; - if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1) - nCount = pdfium::CollectionSize<int>(m_CharList) - start; - if (nCount <= 0) + + int text_start = TextIndexFromCharIndex(start); + if (text_start < 0) return L""; - CheckMarkedContentObject(start, nCount); - int startindex = 0; - PAGECHAR_INFO charinfo = m_CharList[start]; - int startOffset = 0; - while (charinfo.m_Index == -1) { - startOffset++; - if (startOffset > nCount || - start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) { - return L""; - } - charinfo = m_CharList[start + startOffset]; - } - startindex = charinfo.m_Index; - charinfo = m_CharList[start + nCount - 1]; - int 
nCountOffset = 0; - while (charinfo.m_Index == -1) { - nCountOffset++; - if (nCountOffset >= nCount) - return L""; - charinfo = m_CharList[start + nCount - nCountOffset - 1]; - } - nCount = start + nCount - nCountOffset - startindex; - if (nCount <= 0) + + count = std::min(count, CountChars() - start); + + int last = start + count - 1; + int text_last = TextIndexFromCharIndex(last); + if (text_last < 0 || text_last < text_start) return L""; - return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount)); + + int text_count = text_last - text_start + 1; + + return WideString(m_TextBuf.AsStringView().Mid( + static_cast<size_t>(text_start), static_cast<size_t>(text_count))); } int CPDF_TextPage::CountRects(int start, int nCount) { diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index e8ab82ac2c..cd30ace3ad 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -103,7 +103,13 @@ class CPDF_TextPage { std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const; int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; WideString GetTextByRect(const CFX_FloatRect& rect) const; - WideString GetPageText(int start = 0, int nCount = -1) const; + + // Returns string with the text from |m_TextBuf| that are covered by the input + // range. |start| and |count| are in terms of the m_CharIndex, so the range + // will be converted into appropriate indices. 
+ WideString GetPageText(int start, int count) const; + WideString GetAllPageText() const { return GetPageText(0, CountChars()); } + int CountRects(int start, int nCount); void GetRect(int rectIndex, float& left, diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp index a874521326..9f243a0aee 100644 --- a/core/fpdftext/cpdf_textpagefind.cpp +++ b/core/fpdftext/cpdf_textpagefind.cpp @@ -41,7 +41,7 @@ CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) m_resStart(0), m_resEnd(-1), m_IsFind(false) { - m_strText = m_pTextPage->GetPageText(); + m_strText = m_pTextPage->GetAllPageText(); int nCount = pTextPage->CountChars(); if (nCount) m_CharIndex.push_back(0); @@ -85,7 +85,7 @@ bool CPDF_TextPageFind::FindFirst(const WideString& findwhat, if (!m_pTextPage) return false; if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) - m_strText = m_pTextPage->GetPageText(); + m_strText = m_pTextPage->GetAllPageText(); WideString findwhatStr = findwhat; m_findWhat = findwhatStr; m_flags = flags; |