summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--core/include/fpdftext/fpdf_text.h12
-rw-r--r--core/src/fpdftext/fpdf_text_int.cpp179
-rw-r--r--core/src/fpdftext/text_int.h23
3 files changed, 81 insertions, 133 deletions
diff --git a/core/include/fpdftext/fpdf_text.h b/core/include/fpdftext/fpdf_text.h
index f27f1db7e6..a14552ef24 100644
--- a/core/include/fpdftext/fpdf_text.h
+++ b/core/include/fpdftext/fpdf_text.h
@@ -72,23 +72,20 @@ class CPDFText_ParseOptions {
FX_BOOL m_bNormalizeObjs;
FX_BOOL m_bOutputHyphen;
};
+
class IPDF_TextPage {
public:
- virtual ~IPDF_TextPage() {}
- static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage,
- CPDFText_ParseOptions ParserOptions);
static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0);
- static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs,
- int flags = 0);
static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage);
+ virtual ~IPDF_TextPage() {}
+
virtual void NormalizeObjects(FX_BOOL bNormalize) = 0;
virtual FX_BOOL ParseTextPage() = 0;
- virtual FX_BOOL IsParsered() const = 0;
+ virtual bool IsParsed() const = 0;
- public:
virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
@@ -139,6 +136,7 @@ class IPDF_TextPage {
virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0;
};
+
#define FPDFTEXT_MATCHCASE 0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004
diff --git a/core/src/fpdftext/fpdf_text_int.cpp b/core/src/fpdftext/fpdf_text_int.cpp
index 462f1369dd..b81d967776 100644
--- a/core/src/fpdftext/fpdf_text_int.cpp
+++ b/core/src/fpdftext/fpdf_text_int.cpp
@@ -81,29 +81,21 @@ CPDFText_ParseOptions::CPDFText_ParseOptions()
: m_bGetCharCodeOnly(FALSE),
m_bNormalizeObjs(TRUE),
m_bOutputHyphen(FALSE) {}
-IPDF_TextPage* IPDF_TextPage::CreateTextPage(
- const CPDF_Page* pPage,
- CPDFText_ParseOptions ParserOptions) {
- return new CPDF_TextPage(pPage, ParserOptions);
-}
+
IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
int flags) {
return new CPDF_TextPage(pPage, flags);
}
-IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs,
- int flags) {
- return new CPDF_TextPage(pObjs, flags);
-}
+
IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
const IPDF_TextPage* pTextPage) {
- if (!pTextPage) {
- return NULL;
- }
- return new CPDF_TextPageFind(pTextPage);
+ return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
}
+
IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
return new CPDF_LinkExtract();
}
+
#define TEXT_BLANK_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
@@ -112,47 +104,21 @@ IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
#define TEXT_CHARRATIO_GAPDELTA 0.070
+
CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
- : m_charList(512),
- m_TempCharList(50),
- m_pPreTextObj(NULL),
- m_IsParsered(FALSE),
- m_TextlineDir(-1),
- m_CurlineRect(0, 0, 0, 0) {
- m_pPage = pPage;
- m_parserflag = flags;
- m_TextBuf.EstimateSize(0, 10240);
- pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
- (int)pPage->GetPageHeight(), 0);
-}
-CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage,
- CPDFText_ParseOptions ParserOptions)
- : m_ParseOptions(ParserOptions),
+ : m_pPage(pPage),
m_charList(512),
m_TempCharList(50),
- m_pPreTextObj(NULL),
- m_IsParsered(FALSE),
+ m_parserflag(flags),
+ m_pPreTextObj(nullptr),
+ m_bIsParsed(false),
m_TextlineDir(-1),
m_CurlineRect(0, 0, 0, 0) {
- m_pPage = pPage;
- m_parserflag = 0;
m_TextBuf.EstimateSize(0, 10240);
pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
(int)pPage->GetPageHeight(), 0);
}
-CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags)
- : m_charList(512),
- m_TempCharList(50),
- m_pPreTextObj(NULL),
- m_IsParsered(FALSE),
- m_TextlineDir(-1),
- m_CurlineRect(0, 0, 0, 0) {
- m_pPage = pPage;
- m_parserflag = flags;
- m_TextBuf.EstimateSize(0, 10240);
- CFX_FloatRect pageRect = pPage->CalcBoundingBox();
- m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top);
-}
+
void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
m_ParseOptions.m_bNormalizeObjs = bNormalize;
}
@@ -172,16 +138,15 @@ bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
}
}
FX_BOOL CPDF_TextPage::ParseTextPage() {
- if (!m_pPage) {
- m_IsParsered = FALSE;
+ m_bIsParsed = false;
+ if (!m_pPage)
return FALSE;
- }
- m_IsParsered = FALSE;
+
m_TextBuf.Clear();
m_charList.RemoveAll();
m_pPreTextObj = NULL;
ProcessObject();
- m_IsParsered = TRUE;
+ m_bIsParsed = true;
if (!m_ParseOptions.m_bGetCharCodeOnly) {
m_CharIndex.RemoveAll();
int nCount = m_charList.GetSize();
@@ -269,7 +234,7 @@ void CPDF_TextPage::GetRectArray(int start,
if (start < 0 || nCount == 0) {
return;
}
- if (!m_IsParsered) {
+ if (!m_bIsParsed) {
return;
}
PAGECHAR_INFO info_curchar;
@@ -352,12 +317,9 @@ void CPDF_TextPage::GetRectArray(int start,
int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
FX_FLOAT xTolerance,
FX_FLOAT yTolerance) const {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
- return -3;
- }
- if (!m_IsParsered) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return -3;
- }
+
int pos = 0;
int NearPos = -1;
double xdif = 5000, ydif = 5000;
@@ -400,9 +362,9 @@ int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
}
CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
CFX_WideString strText;
- if (m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return strText;
- }
+
int nCount = m_charList.GetSize();
int pos = 0;
FX_FLOAT posy = 0;
@@ -438,12 +400,9 @@ CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
}
void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
CFX_RectArray& resRectArray) const {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return;
- }
- if (!m_IsParsered) {
- return;
- }
+
CFX_FloatRect curRect;
FX_BOOL flagNewRect = TRUE;
CPDF_TextObject* pCurObj = NULL;
@@ -498,15 +457,12 @@ int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
return GetIndexAtPos(point, xTolerance, yTolerance);
}
void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO& info) const {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return;
- }
- if (!m_IsParsered) {
- return;
- }
- if (index < 0 || index >= m_charList.GetSize()) {
+
+ if (index < 0 || index >= m_charList.GetSize())
return;
- }
+
PAGECHAR_INFO charinfo;
charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
info.m_Charcode = charinfo.m_CharCode;
@@ -561,12 +517,12 @@ void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
}
}
CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
- if (!m_IsParsered || nCount == 0) {
+ if (!m_bIsParsed || nCount == 0)
return L"";
- }
- if (start < 0) {
+
+ if (start < 0)
start = 0;
- }
+
if (nCount == -1) {
nCount = m_charList.GetSize() - start;
return m_TextBuf.GetWideString().Mid(start,
@@ -610,15 +566,9 @@ CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
return m_TextBuf.GetWideString().Mid(startindex, nCount);
}
int CPDF_TextPage::CountRects(int start, int nCount) {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0)
return -1;
- }
- if (!m_IsParsered) {
- return -1;
- }
- if (start < 0) {
- return -1;
- }
+
if (nCount == -1 || nCount + start > m_charList.GetSize()) {
nCount = m_charList.GetSize() - start;
}
@@ -631,12 +581,12 @@ void CPDF_TextPage::GetRect(int rectIndex,
FX_FLOAT& top,
FX_FLOAT& right,
FX_FLOAT& bottom) const {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return;
- }
- if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) {
+
+ if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize())
return;
- }
+
left = m_SelRects.GetAt(rectIndex).left;
top = m_SelRects.GetAt(rectIndex).top;
right = m_SelRects.GetAt(rectIndex).right;
@@ -703,12 +653,12 @@ FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
return GetBaselineRotate(start, end, Rotate);
}
FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return FALSE;
- }
- if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) {
+
+ if (rectIndex < 0 || rectIndex > m_SelRects.GetSize())
return FALSE;
- }
+
CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
return GetBaselineRotate(rect, Rotate);
}
@@ -717,13 +667,13 @@ int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
FX_FLOAT right,
FX_FLOAT bottom,
FX_BOOL bContains) {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly)
return -1;
- }
+
m_Segment.RemoveAll();
- if (!m_IsParsered) {
+ if (!m_bIsParsed)
return -1;
- }
+
CFX_FloatRect rect(left, bottom, right, top);
rect.Normalize();
int nCount = m_charList.GetSize();
@@ -803,18 +753,15 @@ void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
count = m_Segment.GetAt(index).m_nCount;
}
int CPDF_TextPage::GetWordBreak(int index, int direction) const {
- if (m_ParseOptions.m_bGetCharCodeOnly) {
+ if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
return -1;
- }
- if (!m_IsParsered) {
- return -1;
- }
- if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) {
+
+ if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT)
return -1;
- }
- if (index < 0 || index >= m_charList.GetSize()) {
+
+ if (index < 0 || index >= m_charList.GetSize())
return -1;
- }
+
PAGECHAR_INFO charinfo;
charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
@@ -2562,24 +2509,30 @@ int CPDF_TextPageFind::GetMatchedCount() const {
int resEnd = GetCharIndex(m_resEnd);
return resEnd - resStart + 1;
}
-CPDF_LinkExtract::CPDF_LinkExtract() : m_pTextPage(NULL), m_IsParserd(FALSE) {}
+
+CPDF_LinkExtract::CPDF_LinkExtract()
+ : m_pTextPage(nullptr), m_bIsParsed(false) {
+}
+
CPDF_LinkExtract::~CPDF_LinkExtract() {
DeleteLinkList();
}
+
FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
- if (!pTextPage || !pTextPage->IsParsered()) {
+ if (!pTextPage || !pTextPage->IsParsed())
return FALSE;
- }
+
m_pTextPage = (const CPDF_TextPage*)pTextPage;
m_strPageText = m_pTextPage->GetPageText(0, -1);
DeleteLinkList();
if (m_strPageText.IsEmpty()) {
return FALSE;
}
- parserLink();
- m_IsParserd = TRUE;
+ ParseLink();
+ m_bIsParsed = true;
return TRUE;
}
+
void CPDF_LinkExtract::DeleteLinkList() {
while (m_LinkList.GetSize()) {
CPDF_LinkExt* linkinfo = NULL;
@@ -2590,12 +2543,12 @@ void CPDF_LinkExtract::DeleteLinkList() {
m_LinkList.RemoveAll();
}
int CPDF_LinkExtract::CountLinks() const {
- if (!m_IsParserd) {
+ if (!m_bIsParsed) {
return -1;
}
return m_LinkList.GetSize();
}
-void CPDF_LinkExtract::parserLink() {
+void CPDF_LinkExtract::ParseLink() {
int start = 0, pos = 0;
int TotalChar = m_pTextPage->CountChars();
while (pos < TotalChar) {
@@ -2748,7 +2701,7 @@ FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start,
return TRUE;
}
CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
- if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
+ if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
return L"";
}
CPDF_LinkExt* link = NULL;
@@ -2761,7 +2714,7 @@ CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
void CPDF_LinkExtract::GetBoundedSegment(int index,
int& start,
int& count) const {
- if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
+ if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
return;
}
CPDF_LinkExt* link = NULL;
@@ -2773,7 +2726,7 @@ void CPDF_LinkExtract::GetBoundedSegment(int index,
count = link->m_Count;
}
void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
- if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
+ if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
return;
}
CPDF_LinkExt* link = NULL;
diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h
index 481c48294f..0b3b9c8ae0 100644
--- a/core/src/fpdftext/text_int.h
+++ b/core/src/fpdftext/text_int.h
@@ -51,15 +51,13 @@ typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
class CPDF_TextPage : public IPDF_TextPage {
public:
- CPDF_TextPage(const CPDF_Page* pPage, int flags = 0);
- CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0);
- CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
+ CPDF_TextPage(const CPDF_Page* pPage, int flags);
~CPDF_TextPage() override {}
// IPDF_TextPage
FX_BOOL ParseTextPage() override;
void NormalizeObjects(FX_BOOL bNormalize) override;
- FX_BOOL IsParsered() const override { return m_IsParsered; }
+ bool IsParsed() const override { return m_bIsParsed; }
int CharIndexFromTextIndex(int TextIndex) const override;
int TextIndexFromCharIndex(int CharIndex) const override;
int CountChars() const override;
@@ -132,20 +130,18 @@ class CPDF_TextPage : public IPDF_TextPage {
const CPDF_Font* pFont,
int nItems) const;
- protected:
CPDFText_ParseOptions m_ParseOptions;
CFX_WordArray m_CharIndex;
- const CPDF_PageObjects* m_pPage;
+ const CPDF_PageObjects* const m_pPage;
PAGECHAR_InfoArray m_charList;
CFX_WideTextBuf m_TextBuf;
PAGECHAR_InfoArray m_TempCharList;
CFX_WideTextBuf m_TempTextBuf;
- int m_parserflag;
+ const int m_parserflag;
CPDF_TextObject* m_pPreTextObj;
CFX_AffineMatrix m_perMatrix;
- FX_BOOL m_IsParsered;
+ bool m_bIsParsed;
CFX_AffineMatrix m_DisplayMatrix;
-
SEGMENT_Array m_Segment;
CFX_RectArray m_SelRects;
LINEOBJ m_LineObj;
@@ -155,7 +151,7 @@ class CPDF_TextPage : public IPDF_TextPage {
class CPDF_TextPageFind : public IPDF_TextPageFind {
public:
- CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
+ explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
~CPDF_TextPageFind() override {}
// IPDF_TextPageFind
@@ -200,6 +196,7 @@ class CPDF_TextPageFind : public IPDF_TextPageFind {
CFX_RectArray m_resArray;
FX_BOOL m_IsFind;
};
+
class CPDF_LinkExt {
public:
CPDF_LinkExt() {}
@@ -223,10 +220,10 @@ class CPDF_LinkExtract : public IPDF_LinkExtract {
void GetBoundedSegment(int index, int& start, int& count) const override;
void GetRects(int index, CFX_RectArray& rects) const override;
- FX_BOOL IsExtract() const { return m_IsParserd; }
+ FX_BOOL IsExtract() const { return m_bIsParsed; }
protected:
- void parserLink();
+ void ParseLink();
void DeleteLinkList();
FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
FX_BOOL CheckMailLink(CFX_WideString& str);
@@ -236,7 +233,7 @@ class CPDF_LinkExtract : public IPDF_LinkExtract {
LINK_InfoArray m_LinkList;
const CPDF_TextPage* m_pTextPage;
CFX_WideString m_strPageText;
- FX_BOOL m_IsParserd;
+ bool m_bIsParsed;
};
FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);