From 3bee9c60f013b8b7e99c39ee35699d132b330334 Mon Sep 17 00:00:00 2001 From: Miklos Vajna Date: Tue, 7 Aug 2018 21:45:34 +0000 Subject: Add FPDFTextObj_GetText() API Generalize CPDF_TextPage::GetTextByRect() so that it's possible to get the text from a text page using a predicate; that way we can easily get the text that belongs to a single text object as well. Change-Id: Ia457af0f41184694dc1481709be72b35685bce7f Reviewed-on: https://pdfium-review.googlesource.com/39530 Reviewed-by: Henrique Nakashima Reviewed-by: Lei Zhang Commit-Queue: Lei Zhang --- core/fpdftext/cpdf_textpage.cpp | 18 +++++++++++++-- core/fpdftext/cpdf_textpage.h | 4 ++++ fpdfsdk/fpdf_edittext.cpp | 18 +++++++++++++++ fpdfsdk/fpdf_text_embeddertest.cpp | 45 ++++++++++++++++++++++++++++++++++++++ fpdfsdk/fpdf_view_c_api_test.c | 1 + public/fpdf_edit.h | 20 +++++++++++++++++ 6 files changed, 104 insertions(+), 2 deletions(-) diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 2894160437..ed7f36fb6c 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -426,7 +426,8 @@ int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point, return pos < nCount ? 
pos : NearPos; } -WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { +WideString CPDF_TextPage::GetTextByPredicate( + const std::function<bool(const PAGECHAR_INFO&)>& predicate) const { if (!m_bIsParsed) return WideString(); @@ -435,7 +436,7 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { bool IsAddLineFeed = false; WideString strText; for (const auto& charinfo : m_CharList) { - if (IsRectIntersect(rect, charinfo.m_CharBox)) { + if (predicate(charinfo)) { if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar && IsAddLineFeed) { posy = charinfo.m_Origin.y; @@ -460,6 +461,19 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { return strText; } +WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { + return GetTextByPredicate([&rect](const PAGECHAR_INFO& charinfo) { + return IsRectIntersect(rect, charinfo.m_CharBox); + }); +} + +WideString CPDF_TextPage::GetTextByObject( + const CPDF_TextObject* pTextObj) const { + return GetTextByPredicate([pTextObj](const PAGECHAR_INFO& charinfo) { + return charinfo.m_pTextObj == pTextObj; + }); +} + void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const { if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index)) return; diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index 36d01854f5..90b45bd96b 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -8,6 +8,7 @@ #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ #include +#include <functional> #include #include "core/fpdfapi/page/cpdf_pageobjectlist.h" @@ -97,6 +98,7 @@ class CPDF_TextPage { std::vector GetRectArray(int start, int nCount) const; int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; WideString GetTextByRect(const CFX_FloatRect& rect) const; + WideString GetTextByObject(const CPDF_TextObject* pTextObj) const; // Returns string with the text from |m_TextBuf| that are covered by the input // range. 
|start| and |count| are in terms of the |m_CharIndex|, so the range @@ -151,6 +153,8 @@ class CPDF_TextPage { TextOrientation FindTextlineFlowOrientation() const; void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix); void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); + WideString GetTextByPredicate( + const std::function<bool(const PAGECHAR_INFO&)>& predicate) const; UnownedPtr const m_pPage; std::vector m_CharIndex; diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp index 6aa44b3b20..2773763b9a 100644 --- a/fpdfsdk/fpdf_edittext.cpp +++ b/fpdfsdk/fpdf_edittext.cpp @@ -22,6 +22,7 @@ #include "core/fpdfapi/parser/cpdf_number.h" #include "core/fpdfapi/parser/cpdf_reference.h" #include "core/fpdfapi/parser/cpdf_stream.h" +#include "core/fpdftext/cpdf_textpage.h" #include "core/fxcrt/fx_extension.h" #include "core/fxge/cfx_fontmgr.h" #include "core/fxge/fx_font.h" @@ -564,6 +565,23 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text, return dwStringLen; } +FPDF_EXPORT unsigned long FPDF_CALLCONV +FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, + FPDF_TEXTPAGE text_page, + void* buffer, + unsigned long length) { + CPDF_TextObject* pTextObj = CPDFTextObjectFromFPDFPageObject(text_object); + if (!pTextObj) + return 0; + + CPDF_TextPage* pTextPage = CPDFTextPageFromFPDFTextPage(text_page); + if (!pTextPage) + return 0; + + WideString text = pTextPage->GetTextByObject(pTextObj); + return Utf16EncodeMaybeCopyAndReturnLength(text, buffer, length); +} + FPDF_EXPORT void FPDF_CALLCONV FPDFFont_Close(FPDF_FONT font) { CPDF_Font* pFont = CPDFFontFromFPDFFont(font); if (!pFont) diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp index eafe1a2d11..bf064d672a 100644 --- a/fpdfsdk/fpdf_text_embeddertest.cpp +++ b/fpdfsdk/fpdf_text_embeddertest.cpp @@ -5,6 +5,7 @@ #include #include #include +#include <vector> #include "core/fxcrt/fx_memory.h" #include "public/cpp/fpdf_scopers.h" @@ -762,6 +763,50 @@ 
TEST_F(FPDFTextEmbeddertest, CountRects) { UnloadPage(page); } +TEST_F(FPDFTextEmbeddertest, GetText) { + ASSERT_TRUE(OpenDocument("hello_world.pdf")); + FPDF_PAGE page = LoadPage(0); + ASSERT_TRUE(page); + + FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page); + ASSERT_TRUE(text_page); + + EXPECT_EQ(2, FPDFPage_CountObjects(page)); + FPDF_PAGEOBJECT text_object = FPDFPage_GetObject(page, 0); + ASSERT_TRUE(text_object); + + // Positive testing. + constexpr char kHelloText[] = "Hello, world!"; + // Return value includes the terminating NUL that is provided. + constexpr unsigned long kHelloUTF16Size = FX_ArraySize(kHelloText) * 2; + constexpr wchar_t kHelloWideText[] = L"Hello, world!"; + unsigned long size = FPDFTextObj_GetText(text_object, text_page, nullptr, 0); + ASSERT_EQ(kHelloUTF16Size, size); + + std::vector<unsigned short> buffer(size); + ASSERT_EQ(size, + FPDFTextObj_GetText(text_object, text_page, buffer.data(), size)); + ASSERT_EQ(kHelloWideText, GetPlatformWString(buffer.data())); + + // Negative testing. + ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, text_page, nullptr, 0)); + ASSERT_EQ(0U, FPDFTextObj_GetText(text_object, nullptr, nullptr, 0)); + ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, nullptr, nullptr, 0)); + + // Buffer is too small, ensure it's not modified. 
+ buffer.resize(2); + buffer[0] = 'x'; + buffer[1] = '\0'; + size = + FPDFTextObj_GetText(text_object, text_page, buffer.data(), buffer.size()); + ASSERT_EQ(kHelloUTF16Size, size); + ASSERT_EQ('x', buffer[0]); + ASSERT_EQ('\0', buffer[1]); + + FPDFText_ClosePage(text_page); + UnloadPage(page); +} + TEST_F(FPDFTextEmbeddertest, CroppedText) { static constexpr int kPageCount = 4; static constexpr FS_RECTF kBoxes[kPageCount] = { diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c index 91d24fd233..56e9d7abed 100644 --- a/fpdfsdk/fpdf_view_c_api_test.c +++ b/fpdfsdk/fpdf_view_c_api_test.c @@ -206,6 +206,7 @@ int CheckPDFiumCApi() { CHK(FPDFPath_SetStrokeWidth); CHK(FPDFTextObj_GetFontName); CHK(FPDFTextObj_GetFontSize); + CHK(FPDFTextObj_GetText); CHK(FPDFText_GetMatrix); CHK(FPDFText_GetTextRenderMode); CHK(FPDFText_LoadFont); diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h index 4d5aa9c48a..83fedba905 100644 --- a/public/fpdf_edit.h +++ b/public/fpdf_edit.h @@ -1274,6 +1274,26 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text, void* buffer, unsigned long length); +// Experimental API. +// Get the text of a text object. +// +// text_object - the handle to the text object. +// text_page - the handle to the text page. +// buffer - the address of a buffer that receives the text. +// length - the size, in bytes, of |buffer|. +// +// Returns the number of bytes in the text (including the trailing NUL +// character) on success, 0 on error. +// +// Regardless of the platform, the |buffer| is always in UTF16-LE encoding. +// If |length| is less than the returned length, or |buffer| is NULL, |buffer| +// will not be modified. +FPDF_EXPORT unsigned long FPDF_CALLCONV +FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, + FPDF_TEXTPAGE text_page, + void* buffer, + unsigned long length); + // Experimental API. // Get number of page objects inside |form_object|. // -- cgit v1.2.3