summary refs log tree commit diff
path: root/core/fpdftext
diff options
context:
space:
mode:
Diffstat (limited to 'core/fpdftext')
-rw-r--r-- core/fpdftext/fpdf_text_int.cpp 103
-rw-r--r-- core/fpdftext/fpdf_text_int.h 247
-rw-r--r-- core/fpdftext/fpdf_text_int_unittest.cpp 2
-rw-r--r-- core/fpdftext/include/cpdf_linkextract.h 54
-rw-r--r-- core/fpdftext/include/cpdf_textpage.h 157
-rw-r--r-- core/fpdftext/include/cpdf_textpagefind.h 65
-rw-r--r-- core/fpdftext/include/ipdf_linkextract.h 26
-rw-r--r-- core/fpdftext/include/ipdf_textpage.h 70
-rw-r--r-- core/fpdftext/include/ipdf_textpagefind.h 29
-rw-r--r-- core/fpdftext/unicodenormalization.cpp 50
-rw-r--r-- core/fpdftext/unicodenormalization.h 14
11 files changed, 344 insertions, 473 deletions
diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/fpdf_text_int.cpp
index 8e8686c4a1..741331fb77 100644
--- a/core/fpdftext/fpdf_text_int.cpp
+++ b/core/fpdftext/fpdf_text_int.cpp
@@ -4,8 +4,6 @@
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-#include "core/fpdftext/fpdf_text_int.h"
-
#include <algorithm>
#include <cctype>
#include <cwctype>
@@ -14,15 +12,17 @@
#include <vector>
#include "core/fpdfapi/fpdf_font/include/cpdf_font.h"
+#include "core/fpdfapi/fpdf_page/include/cpdf_form.h"
#include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h"
+#include "core/fpdfapi/fpdf_page/include/cpdf_page.h"
#include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h"
#include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"
#include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"
#include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"
-#include "core/fpdftext/include/ipdf_linkextract.h"
-#include "core/fpdftext/include/ipdf_textpage.h"
-#include "core/fpdftext/include/ipdf_textpagefind.h"
-#include "core/fpdftext/unicodenormalization.h"
+#include "core/fpdftext/include/cpdf_linkextract.h"
+#include "core/fpdftext/include/cpdf_textpage.h"
+#include "core/fpdftext/include/cpdf_textpagefind.h"
+#include "core/fpdftext/unicodenormalizationdata.h"
#include "core/fxcrt/fx_bidi.h"
#include "core/fxcrt/include/fx_ext.h"
#include "core/fxcrt/include/fx_ucd.h"
@@ -36,9 +36,24 @@
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004
+#define FPDFTEXT_CHAR_ERROR -1
+#define FPDFTEXT_CHAR_NORMAL 0
+#define FPDFTEXT_CHAR_GENERATED 1
+#define FPDFTEXT_CHAR_UNUNICODE 2
+#define FPDFTEXT_CHAR_HYPHEN 3
+#define FPDFTEXT_CHAR_PIECE 4
+#define FPDFTEXT_MC_PASS 0
+#define FPDFTEXT_MC_DONE 1
+#define FPDFTEXT_MC_DELAY 2
+
namespace {
-FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
+const FX_FLOAT kDefaultFontSize = 1.0f;
+const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
+ nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
+ g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
+
+FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
if (curChar < 255) {
return FALSE;
}
@@ -55,7 +70,7 @@ FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
return TRUE;
}
-FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
+FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
if (threshold < 300) {
return threshold / 2.0f;
}
@@ -68,8 +83,8 @@ FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
return threshold / 6.0f;
}
-FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
- const CFX_Matrix& matrix) {
+FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj,
+ const CFX_Matrix& matrix) {
FX_FLOAT baseSpace = 0.0;
const int nItems = pTextObj->CountItems();
if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
@@ -94,23 +109,39 @@ FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
return baseSpace;
}
-const FX_FLOAT kDefaultFontSize = 1.0f;
-
-} // namespace
-
-IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
- int flags) {
- return new CPDF_TextPage(pPage, flags);
-}
-
-IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
- const IPDF_TextPage* pTextPage) {
- return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
+FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) {
+ wch = wch & 0xFFFF;
+ FX_WCHAR wFind = g_UnicodeData_Normalization[wch];
+ if (!wFind) {
+ if (pDst) {
+ *pDst = wch;
+ }
+ return 1;
+ }
+ if (wFind >= 0x8000) {
+ wch = wFind - 0x8000;
+ wFind = 1;
+ } else {
+ wch = wFind & 0x0FFF;
+ wFind >>= 12;
+ }
+ const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
+ if (pMap == g_UnicodeData_Normalization_Map4) {
+ pMap = g_UnicodeData_Normalization_Map4 + wch;
+ wFind = (FX_WCHAR)(*pMap++);
+ } else {
+ pMap += wch;
+ }
+ if (pDst) {
+ FX_WCHAR n = wFind;
+ while (n--) {
+ *pDst++ = *pMap++;
+ }
+ }
+ return (FX_STRSIZE)wFind;
}
-IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
- return new CPDF_LinkExtract();
-}
+} // namespace
#define TEXT_BLANK_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
@@ -932,10 +963,10 @@ void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar,
info.m_Index = m_TextBuf.GetLength();
if (wChar >= 0xFB00 && wChar <= 0xFB06) {
FX_WCHAR* pDst = NULL;
- FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
+ FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
if (nCount >= 1) {
pDst = FX_Alloc(FX_WCHAR, nCount);
- FX_Unicode_GetNormalization(wChar, pDst);
+ Unicode_GetNormalization(wChar, pDst);
for (int nIndex = 0; nIndex < nCount; nIndex++) {
PAGECHAR_INFO info2 = info;
info2.m_Unicode = pDst[nIndex];
@@ -960,10 +991,10 @@ void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar,
info.m_Index = m_TextBuf.GetLength();
wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
FX_WCHAR* pDst = NULL;
- FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
+ FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst);
if (nCount >= 1) {
pDst = FX_Alloc(FX_WCHAR, nCount);
- FX_Unicode_GetNormalization(wChar, pDst);
+ Unicode_GetNormalization(wChar, pDst);
for (int nIndex = 0; nIndex < nCount; nIndex++) {
PAGECHAR_INFO info2 = info;
info2.m_Unicode = pDst[nIndex];
@@ -1377,7 +1408,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
m_pPreTextObj = pTextObj;
m_perMatrix.Copy(formMatrix);
int nItems = pTextObj->CountItems();
- FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
+ FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix);
const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
const FX_BOOL bIsBidiAndMirrorInverse =
@@ -1430,7 +1461,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
threshold = this_width > last_width ? (FX_FLOAT)this_width
: (FX_FLOAT)last_width;
- threshold = _NormalizeThreshold(threshold);
+ threshold = NormalizeThreshold(threshold);
threshold = fontsize_h * threshold / 1000;
}
if (threshold && (spacing && spacing >= threshold)) {
@@ -1898,7 +1929,7 @@ FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
return TRUE;
}
-CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
+CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
: m_pTextPage(pTextPage),
m_flags(0),
m_findNextStart(-1),
@@ -2054,8 +2085,8 @@ FX_BOOL CPDF_TextPageFind::FindNext() {
CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
if (nStartPos == nResultPos &&
- !(_IsIgnoreSpaceCharacter(lastChar) ||
- _IsIgnoreSpaceCharacter(curChar))) {
+ !(IsIgnoreSpaceCharacter(lastChar) ||
+ IsIgnoreSpaceCharacter(curChar))) {
bMatch = FALSE;
}
for (int d = PreResEndPos; d < nResultPos; d++) {
@@ -2174,7 +2205,7 @@ void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
while (pos < csWord.GetLength()) {
CFX_WideString curStr = csWord.Mid(pos, 1);
FX_WCHAR curChar = csWord.GetAt(pos);
- if (_IsIgnoreSpaceCharacter(curChar)) {
+ if (IsIgnoreSpaceCharacter(curChar)) {
if (pos > 0 && curChar == 0x2019) {
pos++;
continue;
@@ -2306,7 +2337,7 @@ CPDF_LinkExtract::~CPDF_LinkExtract() {
DeleteLinkList();
}
-FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
+FX_BOOL CPDF_LinkExtract::ExtractLinks(const CPDF_TextPage* pTextPage) {
if (!pTextPage || !pTextPage->IsParsed())
return FALSE;
diff --git a/core/fpdftext/fpdf_text_int.h b/core/fpdftext/fpdf_text_int.h
deleted file mode 100644
index 7acab55ccb..0000000000
--- a/core/fpdftext/fpdf_text_int.h
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FPDFTEXT_FPDF_TEXT_INT_H_
-#define CORE_FPDFTEXT_FPDF_TEXT_INT_H_
-
-#include <deque>
-#include <vector>
-
-#include "core/fpdfapi/fpdf_page/cpdf_pageobjectlist.h"
-#include "core/fpdfapi/fpdf_page/include/cpdf_form.h"
-#include "core/fpdfapi/fpdf_page/include/cpdf_page.h"
-#include "core/fpdftext/include/ipdf_linkextract.h"
-#include "core/fpdftext/include/ipdf_textpage.h"
-#include "core/fpdftext/include/ipdf_textpagefind.h"
-#include "core/fxcrt/include/fx_basic.h"
-
-class CFX_BidiChar;
-class CPDF_FormObject;
-class CPDF_LinkExtract;
-class CPDF_TextPageFind;
-class CPDF_Font;
-
-#define FPDFTEXT_CHAR_ERROR -1
-#define FPDFTEXT_CHAR_NORMAL 0
-#define FPDFTEXT_CHAR_GENERATED 1
-#define FPDFTEXT_CHAR_UNUNICODE 2
-#define FPDFTEXT_CHAR_HYPHEN 3
-#define FPDFTEXT_CHAR_PIECE 4
-#define FPDFTEXT_MC_PASS 0
-#define FPDFTEXT_MC_DONE 1
-#define FPDFTEXT_MC_DELAY 2
-
-struct PAGECHAR_INFO {
- int m_CharCode;
- FX_WCHAR m_Unicode;
- FX_FLOAT m_OriginX;
- FX_FLOAT m_OriginY;
- int32_t m_Flag;
- CFX_FloatRect m_CharBox;
- CPDF_TextObject* m_pTextObj;
- CFX_Matrix m_Matrix;
- int m_Index;
-};
-
-struct FPDF_SEGMENT {
- int m_Start;
- int m_nCount;
-};
-
-struct PDFTEXT_Obj {
- CPDF_TextObject* m_pTextObj;
- CFX_Matrix m_formMatrix;
-};
-
-class CPDF_TextPage : public IPDF_TextPage {
- public:
- CPDF_TextPage(const CPDF_Page* pPage, int flags);
- ~CPDF_TextPage() override {}
-
- // IPDF_TextPage:
- void ParseTextPage() override;
- bool IsParsed() const override { return m_bIsParsed; }
- int CharIndexFromTextIndex(int TextIndex) const override;
- int TextIndexFromCharIndex(int CharIndex) const override;
- int CountChars() const override;
- void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override;
- void GetRectArray(int start,
- int nCount,
- CFX_RectArray& rectArray) const override;
- int GetIndexAtPos(CFX_FloatPoint point,
- FX_FLOAT xTolerance,
- FX_FLOAT yTolerance) const override;
- int GetIndexAtPos(FX_FLOAT x,
- FX_FLOAT y,
- FX_FLOAT xTolerance,
- FX_FLOAT yTolerance) const override;
- CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override;
- void GetRectsArrayByRect(const CFX_FloatRect& rect,
- CFX_RectArray& resRectArray) const override;
- CFX_WideString GetPageText(int start = 0, int nCount = -1) const override;
- int CountRects(int start, int nCount) override;
- void GetRect(int rectIndex,
- FX_FLOAT& left,
- FX_FLOAT& top,
- FX_FLOAT& right,
- FX_FLOAT& bottom) const override;
- FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override;
- FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override;
- int CountBoundedSegments(FX_FLOAT left,
- FX_FLOAT top,
- FX_FLOAT right,
- FX_FLOAT bottom,
- FX_BOOL bContains = FALSE) override;
- void GetBoundedSegment(int index, int& start, int& count) const override;
- int GetWordBreak(int index, int direction) const override;
-
- static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1,
- const CFX_FloatRect& rect2);
- static FX_BOOL IsLetter(FX_WCHAR unicode);
-
- private:
- FX_BOOL IsHyphen(FX_WCHAR curChar);
- bool IsControlChar(const PAGECHAR_INFO& charInfo);
- FX_BOOL GetBaselineRotate(int start, int end, int& Rotate);
- void ProcessObject();
- void ProcessFormObject(CPDF_FormObject* pFormObj,
- const CFX_Matrix& formMatrix);
- void ProcessTextObject(PDFTEXT_Obj pObj);
- void ProcessTextObject(CPDF_TextObject* pTextObj,
- const CFX_Matrix& formMatrix,
- const CPDF_PageObjectList* pObjList,
- CPDF_PageObjectList::const_iterator ObjPos);
- int ProcessInsertObject(const CPDF_TextObject* pObj,
- const CFX_Matrix& formMatrix);
- FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
- FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
- const CPDF_PageObjectList* pObjList,
- CPDF_PageObjectList::const_iterator ObjPos);
- FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1,
- CPDF_TextObject* pTextObj2);
- int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
- void CloseTempLine();
- void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str);
- int32_t PreMarkedContent(PDFTEXT_Obj pObj);
- void ProcessMarkedContent(PDFTEXT_Obj pObj);
- void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
- void FindPreviousTextObject(void);
- void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
- void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
- int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
- int32_t FindTextlineFlowDirection();
-
- void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
- FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
- const CPDF_Font* pFont,
- int nItems) const;
-
- const CPDF_Page* const m_pPage;
- std::vector<uint16_t> m_CharIndex;
- std::deque<PAGECHAR_INFO> m_CharList;
- std::deque<PAGECHAR_INFO> m_TempCharList;
- CFX_WideTextBuf m_TextBuf;
- CFX_WideTextBuf m_TempTextBuf;
- const int m_parserflag;
- CPDF_TextObject* m_pPreTextObj;
- CFX_Matrix m_perMatrix;
- bool m_bIsParsed;
- CFX_Matrix m_DisplayMatrix;
- CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments;
- CFX_RectArray m_SelRects;
- CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj;
- int32_t m_TextlineDir;
- CFX_FloatRect m_CurlineRect;
-};
-
-class CPDF_TextPageFind : public IPDF_TextPageFind {
- public:
- explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
- ~CPDF_TextPageFind() override {}
-
- // IPDF_TextPageFind
- FX_BOOL FindFirst(const CFX_WideString& findwhat,
- int flags,
- int startPos = 0) override;
- FX_BOOL FindNext() override;
- FX_BOOL FindPrev() override;
- void GetRectArray(CFX_RectArray& rects) const override;
- int GetCurOrder() const override;
- int GetMatchedCount() const override;
-
- protected:
- void ExtractFindWhat(const CFX_WideString& findwhat);
- FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText,
- int startPos,
- int endPos);
- FX_BOOL ExtractSubString(CFX_WideString& rString,
- const FX_WCHAR* lpszFullString,
- int iSubString,
- FX_WCHAR chSep);
- CFX_WideString MakeReverse(const CFX_WideString& str);
- int ReverseFind(const CFX_WideString& csPageText,
- const CFX_WideString& csWord,
- int nStartPos,
- int& WordLength);
- int GetCharIndex(int index) const;
-
- private:
- std::vector<uint16_t> m_CharIndex;
- const IPDF_TextPage* m_pTextPage;
- CFX_WideString m_strText;
- CFX_WideString m_findWhat;
- int m_flags;
- std::vector<CFX_WideString> m_csFindWhatArray;
- int m_findNextStart;
- int m_findPreStart;
- FX_BOOL m_bMatchCase;
- FX_BOOL m_bMatchWholeWord;
- int m_resStart;
- int m_resEnd;
- CFX_RectArray m_resArray;
- FX_BOOL m_IsFind;
-};
-
-class CPDF_LinkExt {
- public:
- CPDF_LinkExt() {}
- int m_Start;
- int m_Count;
- CFX_WideString m_strUrl;
- virtual ~CPDF_LinkExt() {}
-};
-
-typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
-
-class CPDF_LinkExtract : public IPDF_LinkExtract {
- public:
- CPDF_LinkExtract();
- ~CPDF_LinkExtract() override;
-
- // IPDF_LinkExtract
- FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override;
- int CountLinks() const override;
- CFX_WideString GetURL(int index) const override;
- void GetBoundedSegment(int index, int& start, int& count) const override;
- void GetRects(int index, CFX_RectArray& rects) const override;
-
- FX_BOOL IsExtract() const { return m_bIsParsed; }
-
- protected:
- void ParseLink();
- void DeleteLinkList();
- FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
- bool CheckMailLink(CFX_WideString& str);
- void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
-
- private:
- LINK_InfoArray m_LinkList;
- const CPDF_TextPage* m_pTextPage;
- CFX_WideString m_strPageText;
- bool m_bIsParsed;
-};
-
-#endif // CORE_FPDFTEXT_FPDF_TEXT_INT_H_
diff --git a/core/fpdftext/fpdf_text_int_unittest.cpp b/core/fpdftext/fpdf_text_int_unittest.cpp
index e62e885d4b..e1dd0f7504 100644
--- a/core/fpdftext/fpdf_text_int_unittest.cpp
+++ b/core/fpdftext/fpdf_text_int_unittest.cpp
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "core/fpdftext/fpdf_text_int.h"
+#include "core/fpdftext/include/cpdf_linkextract.h"
#include "testing/gtest/include/gtest/gtest.h"
diff --git a/core/fpdftext/include/cpdf_linkextract.h b/core/fpdftext/include/cpdf_linkextract.h
new file mode 100644
index 0000000000..263768ee5d
--- /dev/null
+++ b/core/fpdftext/include/cpdf_linkextract.h
@@ -0,0 +1,54 @@
+// Copyright 2014 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_
+#define CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_
+
+#include "core/fxcrt/include/fx_basic.h"
+#include "core/fxcrt/include/fx_coordinates.h"
+#include "core/fxcrt/include/fx_string.h"
+#include "core/fxcrt/include/fx_system.h"
+
+class CPDF_TextPage;
+
+class CPDF_LinkExt {
+ public:
+ CPDF_LinkExt() {}
+ ~CPDF_LinkExt() {}
+
+ int m_Start;
+ int m_Count;
+ CFX_WideString m_strUrl;
+};
+
+class CPDF_LinkExtract {
+ public:
+ CPDF_LinkExtract();
+ ~CPDF_LinkExtract();
+
+ FX_BOOL ExtractLinks(const CPDF_TextPage* pTextPage);
+ int CountLinks() const;
+ CFX_WideString GetURL(int index) const;
+ void GetBoundedSegment(int index, int& start, int& count) const;
+ void GetRects(int index, CFX_RectArray& rects) const;
+
+ FX_BOOL IsExtract() const { return m_bIsParsed; }
+
+ protected:
+ void ParseLink();
+ void DeleteLinkList();
+ FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
+ bool CheckMailLink(CFX_WideString& str);
+ void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
+
+ private:
+ CFX_ArrayTemplate<CPDF_LinkExt*> m_LinkList;
+ const CPDF_TextPage* m_pTextPage;
+ CFX_WideString m_strPageText;
+ bool m_bIsParsed;
+};
+
+#endif // CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_
diff --git a/core/fpdftext/include/cpdf_textpage.h b/core/fpdftext/include/cpdf_textpage.h
new file mode 100644
index 0000000000..19e8791b5a
--- /dev/null
+++ b/core/fpdftext/include/cpdf_textpage.h
@@ -0,0 +1,157 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_
+#define CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_
+
+#include <deque>
+#include <vector>
+
+#include "core/fpdfapi/fpdf_page/cpdf_pageobjectlist.h"
+#include "core/fxcrt/include/fx_basic.h"
+#include "core/fxcrt/include/fx_coordinates.h"
+#include "core/fxcrt/include/fx_string.h"
+
+class CFX_BidiChar;
+class CPDF_Font;
+class CPDF_FormObject;
+class CPDF_Page;
+class CPDF_TextObject;
+
+struct FPDF_CHAR_INFO {
+ FX_WCHAR m_Unicode;
+ FX_WCHAR m_Charcode;
+ int32_t m_Flag;
+ FX_FLOAT m_FontSize;
+ FX_FLOAT m_OriginX;
+ FX_FLOAT m_OriginY;
+ CFX_FloatRect m_CharBox;
+ CPDF_TextObject* m_pTextObj;
+ CFX_Matrix m_Matrix;
+};
+
+struct FPDF_SEGMENT {
+ int m_Start;
+ int m_nCount;
+};
+
+struct PAGECHAR_INFO {
+ int m_CharCode;
+ FX_WCHAR m_Unicode;
+ FX_FLOAT m_OriginX;
+ FX_FLOAT m_OriginY;
+ int32_t m_Flag;
+ CFX_FloatRect m_CharBox;
+ CPDF_TextObject* m_pTextObj;
+ CFX_Matrix m_Matrix;
+ int m_Index;
+};
+
+struct PDFTEXT_Obj {
+ CPDF_TextObject* m_pTextObj;
+ CFX_Matrix m_formMatrix;
+};
+
+class CPDF_TextPage {
+ public:
+ CPDF_TextPage(const CPDF_Page* pPage, int flags);
+ ~CPDF_TextPage() {}
+
+ // Parsing and text query API:
+ void ParseTextPage();
+ bool IsParsed() const { return m_bIsParsed; }
+ int CharIndexFromTextIndex(int TextIndex) const;
+ int TextIndexFromCharIndex(int CharIndex) const;
+ int CountChars() const;
+ void GetCharInfo(int index, FPDF_CHAR_INFO* info) const;
+ void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const;
+ int GetIndexAtPos(CFX_FloatPoint point,
+ FX_FLOAT xTolerance,
+ FX_FLOAT yTolerance) const;
+ int GetIndexAtPos(FX_FLOAT x,
+ FX_FLOAT y,
+ FX_FLOAT xTolerance,
+ FX_FLOAT yTolerance) const;
+ CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const;
+ void GetRectsArrayByRect(const CFX_FloatRect& rect,
+ CFX_RectArray& resRectArray) const;
+ CFX_WideString GetPageText(int start = 0, int nCount = -1) const;
+ int CountRects(int start, int nCount);
+ void GetRect(int rectIndex,
+ FX_FLOAT& left,
+ FX_FLOAT& top,
+ FX_FLOAT& right,
+ FX_FLOAT& bottom) const;
+ FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate);
+ FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate);
+ int CountBoundedSegments(FX_FLOAT left,
+ FX_FLOAT top,
+ FX_FLOAT right,
+ FX_FLOAT bottom,
+ FX_BOOL bContains = FALSE);
+ void GetBoundedSegment(int index, int& start, int& count) const;
+ int GetWordBreak(int index, int direction) const;
+
+ static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1,
+ const CFX_FloatRect& rect2);
+ static FX_BOOL IsLetter(FX_WCHAR unicode);
+
+ private:
+ FX_BOOL IsHyphen(FX_WCHAR curChar);
+ bool IsControlChar(const PAGECHAR_INFO& charInfo);
+ FX_BOOL GetBaselineRotate(int start, int end, int& Rotate);
+ void ProcessObject();
+ void ProcessFormObject(CPDF_FormObject* pFormObj,
+ const CFX_Matrix& formMatrix);
+ void ProcessTextObject(PDFTEXT_Obj pObj);
+ void ProcessTextObject(CPDF_TextObject* pTextObj,
+ const CFX_Matrix& formMatrix,
+ const CPDF_PageObjectList* pObjList,
+ CPDF_PageObjectList::const_iterator ObjPos);
+ int ProcessInsertObject(const CPDF_TextObject* pObj,
+ const CFX_Matrix& formMatrix);
+ FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
+ FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
+ const CPDF_PageObjectList* pObjList,
+ CPDF_PageObjectList::const_iterator ObjPos);
+ FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1,
+ CPDF_TextObject* pTextObj2);
+ int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
+ void CloseTempLine();
+ void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str);
+ int32_t PreMarkedContent(PDFTEXT_Obj pObj);
+ void ProcessMarkedContent(PDFTEXT_Obj pObj);
+ void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
+ void FindPreviousTextObject(void);
+ void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
+ void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
+ int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
+ int32_t FindTextlineFlowDirection();
+
+ void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
+ FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
+ const CPDF_Font* pFont,
+ int nItems) const;
+
+ const CPDF_Page* const m_pPage;
+ std::vector<uint16_t> m_CharIndex;
+ std::deque<PAGECHAR_INFO> m_CharList;
+ std::deque<PAGECHAR_INFO> m_TempCharList;
+ CFX_WideTextBuf m_TextBuf;
+ CFX_WideTextBuf m_TempTextBuf;
+ const int m_parserflag;
+ CPDF_TextObject* m_pPreTextObj;
+ CFX_Matrix m_perMatrix;
+ bool m_bIsParsed;
+ CFX_Matrix m_DisplayMatrix;
+ CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments;
+ CFX_RectArray m_SelRects;
+ CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj;
+ int32_t m_TextlineDir;
+ CFX_FloatRect m_CurlineRect;
+};
+
+#endif // CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_
diff --git a/core/fpdftext/include/cpdf_textpagefind.h b/core/fpdftext/include/cpdf_textpagefind.h
new file mode 100644
index 0000000000..ec739e4896
--- /dev/null
+++ b/core/fpdftext/include/cpdf_textpagefind.h
@@ -0,0 +1,65 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_
+#define CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_
+
+#include <vector>
+
+#include "core/fxcrt/include/fx_coordinates.h"
+#include "core/fxcrt/include/fx_string.h"
+#include "core/fxcrt/include/fx_system.h"
+
+class CPDF_TextPage;
+
+class CPDF_TextPageFind {
+ public:
+ explicit CPDF_TextPageFind(const CPDF_TextPage* pTextPage);
+ ~CPDF_TextPageFind() {}
+
+ FX_BOOL FindFirst(const CFX_WideString& findwhat,
+ int flags,
+ int startPos = 0);
+ FX_BOOL FindNext();
+ FX_BOOL FindPrev();
+ void GetRectArray(CFX_RectArray& rects) const;
+ int GetCurOrder() const;
+ int GetMatchedCount() const;
+
+ protected:
+ void ExtractFindWhat(const CFX_WideString& findwhat);
+ FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText,
+ int startPos,
+ int endPos);
+ FX_BOOL ExtractSubString(CFX_WideString& rString,
+ const FX_WCHAR* lpszFullString,
+ int iSubString,
+ FX_WCHAR chSep);
+ CFX_WideString MakeReverse(const CFX_WideString& str);
+ int ReverseFind(const CFX_WideString& csPageText,
+ const CFX_WideString& csWord,
+ int nStartPos,
+ int& WordLength);
+ int GetCharIndex(int index) const;
+
+ private:
+ std::vector<uint16_t> m_CharIndex;
+ const CPDF_TextPage* m_pTextPage;
+ CFX_WideString m_strText;
+ CFX_WideString m_findWhat;
+ int m_flags;
+ std::vector<CFX_WideString> m_csFindWhatArray;
+ int m_findNextStart;
+ int m_findPreStart;
+ FX_BOOL m_bMatchCase;
+ FX_BOOL m_bMatchWholeWord;
+ int m_resStart;
+ int m_resEnd;
+ CFX_RectArray m_resArray;
+ FX_BOOL m_IsFind;
+};
+
+#endif // CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_
diff --git a/core/fpdftext/include/ipdf_linkextract.h b/core/fpdftext/include/ipdf_linkextract.h
deleted file mode 100644
index c1a5f2f04e..0000000000
--- a/core/fpdftext/include/ipdf_linkextract.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2016 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_
-#define CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_
-
-#include "core/fpdftext/include/ipdf_textpage.h"
-#include "core/fxcrt/include/fx_coordinates.h"
-#include "core/fxcrt/include/fx_system.h"
-
-class IPDF_LinkExtract {
- public:
- static IPDF_LinkExtract* CreateLinkExtract();
- virtual ~IPDF_LinkExtract() {}
-
- virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0;
- virtual int CountLinks() const = 0;
- virtual CFX_WideString GetURL(int index) const = 0;
- virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
- virtual void GetRects(int index, CFX_RectArray& rects) const = 0;
-};
-
-#endif // CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_
diff --git a/core/fpdftext/include/ipdf_textpage.h b/core/fpdftext/include/ipdf_textpage.h
deleted file mode 100644
index 3849cd4004..0000000000
--- a/core/fpdftext/include/ipdf_textpage.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2016 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_
-#define CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_
-
-#include "core/fxcrt/include/fx_coordinates.h"
-#include "core/fxcrt/include/fx_system.h"
-
-class CPDF_TextObject;
-class CPDF_Page;
-
-struct FPDF_CHAR_INFO {
- FX_WCHAR m_Unicode;
- FX_WCHAR m_Charcode;
- int32_t m_Flag;
- FX_FLOAT m_FontSize;
- FX_FLOAT m_OriginX;
- FX_FLOAT m_OriginY;
- CFX_FloatRect m_CharBox;
- CPDF_TextObject* m_pTextObj;
- CFX_Matrix m_Matrix;
-};
-
-class IPDF_TextPage {
- public:
- static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0);
- virtual ~IPDF_TextPage() {}
-
- virtual void ParseTextPage() = 0;
- virtual bool IsParsed() const = 0;
- virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
- virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
- virtual int CountChars() const = 0;
- virtual void GetCharInfo(int index, FPDF_CHAR_INFO* info) const = 0;
- virtual void GetRectArray(int start,
- int nCount,
- CFX_RectArray& rectArray) const = 0;
- virtual int GetIndexAtPos(CFX_FloatPoint point,
- FX_FLOAT xTolerance,
- FX_FLOAT yTolerance) const = 0;
- virtual int GetIndexAtPos(FX_FLOAT x,
- FX_FLOAT y,
- FX_FLOAT xTolerance,
- FX_FLOAT yTolerance) const = 0;
- virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0;
- virtual void GetRectsArrayByRect(const CFX_FloatRect& rect,
- CFX_RectArray& resRectArray) const = 0;
- virtual int CountRects(int start, int nCount) = 0;
- virtual void GetRect(int rectIndex,
- FX_FLOAT& left,
- FX_FLOAT& top,
- FX_FLOAT& right,
- FX_FLOAT& bottom) const = 0;
- virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0;
- virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0;
- virtual int CountBoundedSegments(FX_FLOAT left,
- FX_FLOAT top,
- FX_FLOAT right,
- FX_FLOAT bottom,
- FX_BOOL bContains = FALSE) = 0;
- virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
- virtual int GetWordBreak(int index, int direction) const = 0;
- virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0;
-};
-
-#endif // CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_
diff --git a/core/fpdftext/include/ipdf_textpagefind.h b/core/fpdftext/include/ipdf_textpagefind.h
deleted file mode 100644
index b13432b59b..0000000000
--- a/core/fpdftext/include/ipdf_textpagefind.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2016 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_
-#define CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_
-
-#include "core/fpdftext/include/ipdf_textpage.h"
-#include "core/fxcrt/include/fx_coordinates.h"
-#include "core/fxcrt/include/fx_string.h"
-
-class IPDF_TextPageFind {
- public:
- static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage);
- virtual ~IPDF_TextPageFind() {}
-
- virtual FX_BOOL FindFirst(const CFX_WideString& findwhat,
- int flags,
- int startPos = 0) = 0;
- virtual FX_BOOL FindNext() = 0;
- virtual FX_BOOL FindPrev() = 0;
- virtual void GetRectArray(CFX_RectArray& rects) const = 0;
- virtual int GetCurOrder() const = 0;
- virtual int GetMatchedCount() const = 0;
-};
-
-#endif // CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_
diff --git a/core/fpdftext/unicodenormalization.cpp b/core/fpdftext/unicodenormalization.cpp
deleted file mode 100644
index 67ab57cb5d..0000000000
--- a/core/fpdftext/unicodenormalization.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fpdftext/unicodenormalization.h"
-
-#include "core/fpdftext/unicodenormalizationdata.h"
-#include "core/fxcrt/include/fx_string.h"
-
-namespace {
-
-const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
- nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
- g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
-
-} // namespace
-
-FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) {
- wch = wch & 0xFFFF;
- FX_WCHAR wFind = g_UnicodeData_Normalization[wch];
- if (!wFind) {
- if (pDst) {
- *pDst = wch;
- }
- return 1;
- }
- if (wFind >= 0x8000) {
- wch = wFind - 0x8000;
- wFind = 1;
- } else {
- wch = wFind & 0x0FFF;
- wFind >>= 12;
- }
- const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind];
- if (pMap == g_UnicodeData_Normalization_Map4) {
- pMap = g_UnicodeData_Normalization_Map4 + wch;
- wFind = (FX_WCHAR)(*pMap++);
- } else {
- pMap += wch;
- }
- if (pDst) {
- FX_WCHAR n = wFind;
- while (n--) {
- *pDst++ = *pMap++;
- }
- }
- return (FX_STRSIZE)wFind;
-}
diff --git a/core/fpdftext/unicodenormalization.h b/core/fpdftext/unicodenormalization.h
deleted file mode 100644
index ee3c8b2024..0000000000
--- a/core/fpdftext/unicodenormalization.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2014 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FPDFTEXT_UNICODENORMALIZATION_H_
-#define CORE_FPDFTEXT_UNICODENORMALIZATION_H_
-
-#include "core/fxcrt/include/fx_system.h"
-
-FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);
-
-#endif // CORE_FPDFTEXT_UNICODENORMALIZATION_H_