summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMiklos Vajna <vmiklos@collabora.co.uk>2018-08-07 21:45:34 +0000
committerChromium commit bot <commit-bot@chromium.org>2018-08-07 21:45:34 +0000
commit3bee9c60f013b8b7e99c39ee35699d132b330334 (patch)
treefc00b16bf5c6b84af3c4683e43a2652b80db173b
parente1c2f6d7fe7a50280161832799550a3ee8f98088 (diff)
downloadpdfium-3bee9c60f013b8b7e99c39ee35699d132b330334.tar.xz
Add FPDFTextObj_GetText() API
Generalize CPDF_TextPage::GetTextByRect() so that it's possible to get the text from a text page using a predicate. That way we can easily get the text that belongs to a single text object as well.
Change-Id: Ia457af0f41184694dc1481709be72b35685bce7f
Reviewed-on: https://pdfium-review.googlesource.com/39530
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
-rw-r--r--core/fpdftext/cpdf_textpage.cpp18
-rw-r--r--core/fpdftext/cpdf_textpage.h4
-rw-r--r--fpdfsdk/fpdf_edittext.cpp18
-rw-r--r--fpdfsdk/fpdf_text_embeddertest.cpp45
-rw-r--r--fpdfsdk/fpdf_view_c_api_test.c1
-rw-r--r--public/fpdf_edit.h20
6 files changed, 104 insertions, 2 deletions
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 2894160437..ed7f36fb6c 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -426,7 +426,8 @@ int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
return pos < nCount ? pos : NearPos;
}
-WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+WideString CPDF_TextPage::GetTextByPredicate(
+ const std::function<bool(const PAGECHAR_INFO&)>& predicate) const {
if (!m_bIsParsed)
return WideString();
@@ -435,7 +436,7 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
bool IsAddLineFeed = false;
WideString strText;
for (const auto& charinfo : m_CharList) {
- if (IsRectIntersect(rect, charinfo.m_CharBox)) {
+ if (predicate(charinfo)) {
if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
IsAddLineFeed) {
posy = charinfo.m_Origin.y;
@@ -460,6 +461,19 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
return strText;
}
+WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+ return GetTextByPredicate([&rect](const PAGECHAR_INFO& charinfo) {
+ return IsRectIntersect(rect, charinfo.m_CharBox);
+ });
+}
+
+WideString CPDF_TextPage::GetTextByObject(
+ const CPDF_TextObject* pTextObj) const {
+ return GetTextByPredicate([pTextObj](const PAGECHAR_INFO& charinfo) {
+ return charinfo.m_pTextObj == pTextObj;
+ });
+}
+
void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index))
return;
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 36d01854f5..90b45bd96b 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -8,6 +8,7 @@
#define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
#include <deque>
+#include <functional>
#include <vector>
#include "core/fpdfapi/page/cpdf_pageobjectlist.h"
@@ -97,6 +98,7 @@ class CPDF_TextPage {
std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
WideString GetTextByRect(const CFX_FloatRect& rect) const;
+ WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
// Returns string with the text from |m_TextBuf| that are covered by the input
// range. |start| and |count| are in terms of the |m_CharIndex|, so the range
@@ -151,6 +153,8 @@ class CPDF_TextPage {
TextOrientation FindTextlineFlowOrientation() const;
void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
+ WideString GetTextByPredicate(
+ const std::function<bool(const PAGECHAR_INFO&)>& predicate) const;
UnownedPtr<const CPDF_Page> const m_pPage;
std::vector<uint16_t> m_CharIndex;
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index 6aa44b3b20..2773763b9a 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -22,6 +22,7 @@
#include "core/fpdfapi/parser/cpdf_number.h"
#include "core/fpdfapi/parser/cpdf_reference.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdftext/cpdf_textpage.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxge/cfx_fontmgr.h"
#include "core/fxge/fx_font.h"
@@ -564,6 +565,23 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text,
return dwStringLen;
}
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ FPDF_TEXTPAGE text_page,
+ void* buffer,
+ unsigned long length) {
+ CPDF_TextObject* pTextObj = CPDFTextObjectFromFPDFPageObject(text_object);
+ if (!pTextObj)
+ return 0;
+
+ CPDF_TextPage* pTextPage = CPDFTextPageFromFPDFTextPage(text_page);
+ if (!pTextPage)
+ return 0;
+
+ WideString text = pTextPage->GetTextByObject(pTextObj);
+ return Utf16EncodeMaybeCopyAndReturnLength(text, buffer, length);
+}
+
FPDF_EXPORT void FPDF_CALLCONV FPDFFont_Close(FPDF_FONT font) {
CPDF_Font* pFont = CPDFFontFromFPDFFont(font);
if (!pFont)
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index eafe1a2d11..bf064d672a 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -5,6 +5,7 @@
#include <algorithm>
#include <memory>
#include <utility>
+#include <vector>
#include "core/fxcrt/fx_memory.h"
#include "public/cpp/fpdf_scopers.h"
@@ -762,6 +763,50 @@ TEST_F(FPDFTextEmbeddertest, CountRects) {
UnloadPage(page);
}
+TEST_F(FPDFTextEmbeddertest, GetText) {
+ ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+ FPDF_PAGE page = LoadPage(0);
+ ASSERT_TRUE(page);
+
+ FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+ ASSERT_TRUE(text_page);
+
+ EXPECT_EQ(2, FPDFPage_CountObjects(page));
+ FPDF_PAGEOBJECT text_object = FPDFPage_GetObject(page, 0);
+ ASSERT_TRUE(text_object);
+
+ // Positive testing.
+ constexpr char kHelloText[] = "Hello, world!";
+ // The returned size is in bytes and includes the terminating NUL character.
+ constexpr unsigned long kHelloUTF16Size = FX_ArraySize(kHelloText) * 2;
+ constexpr wchar_t kHelloWideText[] = L"Hello, world!";
+ unsigned long size = FPDFTextObj_GetText(text_object, text_page, nullptr, 0);
+ ASSERT_EQ(kHelloUTF16Size, size);
+
+ std::vector<unsigned short> buffer(size);
+ ASSERT_EQ(size,
+ FPDFTextObj_GetText(text_object, text_page, buffer.data(), size));
+ ASSERT_EQ(kHelloWideText, GetPlatformWString(buffer.data()));
+
+ // Negative testing.
+ ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, text_page, nullptr, 0));
+ ASSERT_EQ(0U, FPDFTextObj_GetText(text_object, nullptr, nullptr, 0));
+ ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, nullptr, nullptr, 0));
+
+ // Buffer is too small, ensure it's not modified.
+ buffer.resize(2);
+ buffer[0] = 'x';
+ buffer[1] = '\0';
+ size =
+ FPDFTextObj_GetText(text_object, text_page, buffer.data(), buffer.size());
+ ASSERT_EQ(kHelloUTF16Size, size);
+ ASSERT_EQ('x', buffer[0]);
+ ASSERT_EQ('\0', buffer[1]);
+
+ FPDFText_ClosePage(text_page);
+ UnloadPage(page);
+}
+
TEST_F(FPDFTextEmbeddertest, CroppedText) {
static constexpr int kPageCount = 4;
static constexpr FS_RECTF kBoxes[kPageCount] = {
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 91d24fd233..56e9d7abed 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -206,6 +206,7 @@ int CheckPDFiumCApi() {
CHK(FPDFPath_SetStrokeWidth);
CHK(FPDFTextObj_GetFontName);
CHK(FPDFTextObj_GetFontSize);
+ CHK(FPDFTextObj_GetText);
CHK(FPDFText_GetMatrix);
CHK(FPDFText_GetTextRenderMode);
CHK(FPDFText_LoadFont);
diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h
index 4d5aa9c48a..83fedba905 100644
--- a/public/fpdf_edit.h
+++ b/public/fpdf_edit.h
@@ -1275,6 +1275,26 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text,
unsigned long length);
// Experimental API.
+// Get the text of a text object.
+//
+// text_object - the handle to the text object.
+// text_page - the handle to the text page.
+// buffer - the address of a buffer that receives the text.
+// length - the size, in bytes, of |buffer|.
+//
+// Returns the number of bytes in the text (including the trailing NUL
+// character) on success, 0 on error.
+//
+// Regardless of the platform, the |buffer| is always in UTF16-LE encoding.
+// If |length| is less than the returned length, or |buffer| is NULL, |buffer|
+// will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+ FPDF_TEXTPAGE text_page,
+ void* buffer,
+ unsigned long length);
+
+// Experimental API.
// Get number of page objects inside |form_object|.
//
// form_object - handle to a form object.