diff options
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 2 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpage.cpp | 56 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpage.h | 8 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpagefind.cpp | 4 | ||||
-rw-r--r-- | fpdfsdk/fpdftext.cpp | 21 | ||||
-rw-r--r-- | fpdfsdk/fpdftext_embeddertest.cpp | 35 | ||||
-rw-r--r-- | testing/resources/control_characters.in | 54 | ||||
-rw-r--r-- | testing/resources/control_characters.pdf | 64 |
8 files changed, 183 insertions, 61 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 3a38343721..05cbdfb3a7 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -114,7 +114,7 @@ void CPDF_LinkExtract::ExtractLinks() { if (!m_pTextPage->IsParsed()) return; - m_strPageText = m_pTextPage->GetPageText(0, -1); + m_strPageText = m_pTextPage->GetAllPageText(); if (m_strPageText.IsEmpty()) return; diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 7ea2061c26..8ef5522bae 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -436,49 +436,27 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start, } } -WideString CPDF_TextPage::GetPageText(int start, int nCount) const { - if (!m_bIsParsed || nCount == 0) +WideString CPDF_TextPage::GetPageText(int start, int count) const { + if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed || + m_CharList.empty() || m_TextBuf.GetLength() == 0) { return L""; - - if (start < 0) - start = 0; - - if (nCount == -1) { - nCount = pdfium::CollectionSize<int>(m_CharList) - start; - WideStringView wsTextBuf = m_TextBuf.AsStringView(); - return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start)); } - if (nCount <= 0 || m_CharList.empty()) - return L""; - if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1) - nCount = pdfium::CollectionSize<int>(m_CharList) - start; - if (nCount <= 0) + + int text_start = TextIndexFromCharIndex(start); + if (text_start < 0) return L""; - CheckMarkedContentObject(start, nCount); - int startindex = 0; - PAGECHAR_INFO charinfo = m_CharList[start]; - int startOffset = 0; - while (charinfo.m_Index == -1) { - startOffset++; - if (startOffset > nCount || - start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) { - return L""; - } - charinfo = m_CharList[start + startOffset]; - } - startindex = charinfo.m_Index; - charinfo = m_CharList[start + nCount - 1]; - int nCountOffset = 0; - while (charinfo.m_Index == -1) { - nCountOffset++; - if (nCountOffset >= nCount) - return L""; - charinfo = m_CharList[start + nCount - nCountOffset - 1]; - } - nCount = start + nCount - nCountOffset - startindex; - if (nCount <= 0) + + count = std::min(count, CountChars() - start); + + int last = start + count - 1; + int text_last = TextIndexFromCharIndex(last); + if (text_last < 0 || text_last < text_start) return L""; - return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount)); + + int text_count = text_last - text_start + 1; + + return WideString(m_TextBuf.AsStringView().Mid( + static_cast<size_t>(text_start), static_cast<size_t>(text_count))); } int CPDF_TextPage::CountRects(int start, int nCount) { diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index e8ab82ac2c..cd30ace3ad 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -103,7 +103,13 @@ class CPDF_TextPage { std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const; int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; WideString GetTextByRect(const CFX_FloatRect& rect) const; - WideString GetPageText(int start = 0, int nCount = -1) const; + + // Returns string with the text from |m_TextBuf| that are covered by the input + // range. |start| and |count| are in terms of the m_CharIndex, so the range + // will be converted into appropriate indices. + WideString GetPageText(int start, int count) const; + WideString GetAllPageText() const { return GetPageText(0, CountChars()); } + int CountRects(int start, int nCount); void GetRect(int rectIndex, float& left, diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp index a874521326..9f243a0aee 100644 --- a/core/fpdftext/cpdf_textpagefind.cpp +++ b/core/fpdftext/cpdf_textpagefind.cpp @@ -41,7 +41,7 @@ CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) m_resStart(0), m_resEnd(-1), m_IsFind(false) { - m_strText = m_pTextPage->GetPageText(); + m_strText = m_pTextPage->GetAllPageText(); int nCount = pTextPage->CountChars(); if (nCount) m_CharIndex.push_back(0); @@ -85,7 +85,7 @@ bool CPDF_TextPageFind::FindFirst(const WideString& findwhat, if (!m_pTextPage) return false; if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) - m_strText = m_pTextPage->GetPageText(); + m_strText = m_pTextPage->GetAllPageText(); WideString findwhatStr = findwhat; m_findWhat = findwhatStr; m_flags = flags; diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp index 5a2deb9a81..d9f7d572e9 100644 --- a/fpdfsdk/fpdftext.cpp +++ b/fpdfsdk/fpdftext.cpp @@ -179,25 +179,10 @@ FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page, return 1; } - // char_* values are for a data structure that includes non-printing unicode - // characters, where the text_* values are from a data structure that doesn't - // include these characters, so translation is needed. - int text_start = textpage->TextIndexFromCharIndex(char_start); - if (text_start == -1) - return 0; - - int char_last = char_start + char_count - 1; - int text_last = textpage->TextIndexFromCharIndex(char_last); - if (text_last == -1) - return 0; - - int text_count = text_last - text_start + 1; - if (text_count < 1) - return 0; + WideString str = textpage->GetPageText(char_start, char_count); - WideString str = textpage->GetPageText(text_start, text_count); - if (str.GetLength() > static_cast<size_t>(text_count)) - str = str.Left(static_cast<size_t>(text_count)); + if (str.GetLength() > static_cast<size_t>(char_count)) + str = str.Left(static_cast<size_t>(char_count)); // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected // the number of items to stay the same. diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 51216b9818..60654057b9 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -590,3 +590,38 @@ TEST_F(FPDFTextEmbeddertest, bug_782596) { FPDFText_ClosePage(textpage); UnloadPage(page); } + +TEST_F(FPDFTextEmbeddertest, ControlCharacters) { + EXPECT_TRUE(OpenDocument("control_characters.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + + // Should not include the control characters in the output + static const char expected[] = "Hello, world!\r\nGoodbye, world!"; + unsigned short fixed_buffer[128]; + memset(fixed_buffer, 0xbd, sizeof(fixed_buffer)); + int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer); + + ASSERT_GE(num_chars, 0); + EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars)); + EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected))); + + // Attempting to get a chunk of text after the control characters + static const char expected_substring[] = "Goodbye, world!"; + // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the + // original stream + static const int offset = 17; + memset(fixed_buffer, 0xbd, sizeof(fixed_buffer)); + num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer); + + ASSERT_GE(num_chars, 0); + EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars)); + EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer, + sizeof(expected_substring))); + + FPDFText_ClosePage(textpage); + UnloadPage(page); +} diff --git a/testing/resources/control_characters.in b/testing/resources/control_characters.in new file mode 100644 index 0000000000..ca7827fe11 --- /dev/null +++ b/testing/resources/control_characters.in @@ -0,0 +1,54 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +endobj +{{object 2 0}} << + /Type /Pages + /MediaBox [ 0 0 200 200 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +{{object 3 0}} << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +{{object 4 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +{{object 5 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +{{object 6 0}} << +>> +stream +BT +20 50 Td +/F1 12 Tf +(Hello\2\3, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) Tj +ET +endstream +endobj +{{xref}} +{{trailer}} +{{startxref}} +%%EOF diff --git a/testing/resources/control_characters.pdf b/testing/resources/control_characters.pdf new file mode 100644 index 0000000000..535009733f --- /dev/null +++ b/testing/resources/control_characters.pdf @@ -0,0 +1,64 @@ +%PDF-1.7 +% ò¤ô +1 0 obj << + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj << + /Type /Pages + /MediaBox [ 0 0 200 200 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +3 0 obj << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +4 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +5 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +6 0 obj << +>> +stream +BT +20 50 Td +/F1 12 Tf +(Hello\2\3, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) Tj +ET +endstream +endobj +xref +0 7 +0000000000 65535 f +0000000015 00000 n +0000000068 00000 n +0000000161 00000 n +0000000303 00000 n +0000000381 00000 n +0000000457 00000 n +trailer<< /Root 1 0 R /Size 7 >> +startxref +582 +%%EOF |