diff options
Diffstat (limited to 'core')
-rw-r--r-- | core/fpdftext/fpdf_text_int.cpp | 103 | ||||
-rw-r--r-- | core/fpdftext/fpdf_text_int.h | 247 | ||||
-rw-r--r-- | core/fpdftext/fpdf_text_int_unittest.cpp | 2 | ||||
-rw-r--r-- | core/fpdftext/include/cpdf_linkextract.h | 54 | ||||
-rw-r--r-- | core/fpdftext/include/cpdf_textpage.h | 157 | ||||
-rw-r--r-- | core/fpdftext/include/cpdf_textpagefind.h | 65 | ||||
-rw-r--r-- | core/fpdftext/include/ipdf_linkextract.h | 26 | ||||
-rw-r--r-- | core/fpdftext/include/ipdf_textpage.h | 70 | ||||
-rw-r--r-- | core/fpdftext/include/ipdf_textpagefind.h | 29 | ||||
-rw-r--r-- | core/fpdftext/unicodenormalization.cpp | 50 | ||||
-rw-r--r-- | core/fpdftext/unicodenormalization.h | 14 |
11 files changed, 344 insertions, 473 deletions
diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/fpdf_text_int.cpp index 8e8686c4a1..741331fb77 100644 --- a/core/fpdftext/fpdf_text_int.cpp +++ b/core/fpdftext/fpdf_text_int.cpp @@ -4,8 +4,6 @@ // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com -#include "core/fpdftext/fpdf_text_int.h" - #include <algorithm> #include <cctype> #include <cwctype> @@ -14,15 +12,17 @@ #include <vector> #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" +#include "core/fpdfapi/fpdf_page/include/cpdf_form.h" #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" +#include "core/fpdfapi/fpdf_page/include/cpdf_page.h" #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" -#include "core/fpdftext/include/ipdf_linkextract.h" -#include "core/fpdftext/include/ipdf_textpage.h" -#include "core/fpdftext/include/ipdf_textpagefind.h" -#include "core/fpdftext/unicodenormalization.h" +#include "core/fpdftext/include/cpdf_linkextract.h" +#include "core/fpdftext/include/cpdf_textpage.h" +#include "core/fpdftext/include/cpdf_textpagefind.h" +#include "core/fpdftext/unicodenormalizationdata.h" #include "core/fxcrt/fx_bidi.h" #include "core/fxcrt/include/fx_ext.h" #include "core/fxcrt/include/fx_ucd.h" @@ -36,9 +36,24 @@ #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 #define FPDFTEXT_CONSECUTIVE 0x00000004 +#define FPDFTEXT_CHAR_ERROR -1 +#define FPDFTEXT_CHAR_NORMAL 0 +#define FPDFTEXT_CHAR_GENERATED 1 +#define FPDFTEXT_CHAR_UNUNICODE 2 +#define FPDFTEXT_CHAR_HYPHEN 3 +#define FPDFTEXT_CHAR_PIECE 4 +#define FPDFTEXT_MC_PASS 0 +#define FPDFTEXT_MC_DONE 1 +#define FPDFTEXT_MC_DELAY 2 + namespace { -FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { +const FX_FLOAT kDefaultFontSize = 1.0f; +const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { + nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, + g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; + +FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { if (curChar < 255) { return FALSE; } @@ -55,7 +70,7 @@ FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { return TRUE; } -FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { +FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { if (threshold < 300) { return threshold / 2.0f; } @@ -68,8 +83,8 @@ FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { return threshold / 6.0f; } -FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, - const CFX_Matrix& matrix) { +FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj, + const CFX_Matrix& matrix) { FX_FLOAT baseSpace = 0.0; const int nItems = pTextObj->CountItems(); if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { @@ -94,23 +109,39 @@ FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, return baseSpace; } -const FX_FLOAT kDefaultFontSize = 1.0f; - -} // namespace - -IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, - int flags) { - return new CPDF_TextPage(pPage, flags); -} - -IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind( - const IPDF_TextPage* pTextPage) { - return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr; +FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) { + wch = wch & 0xFFFF; + FX_WCHAR wFind = g_UnicodeData_Normalization[wch]; + if (!wFind) { + if (pDst) { + *pDst = wch; + } + return 1; + } + if (wFind >= 0x8000) { + wch = wFind - 0x8000; + wFind = 1; + } else { + wch = wFind & 0x0FFF; + wFind >>= 12; + } + const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind]; + if (pMap == g_UnicodeData_Normalization_Map4) { + pMap = g_UnicodeData_Normalization_Map4 + wch; + wFind = (FX_WCHAR)(*pMap++); + } else { + pMap += wch; + } + if (pDst) { + FX_WCHAR n = wFind; + while (n--) { + *pDst++ = *pMap++; + } + } + return (FX_STRSIZE)wFind; } -IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() { - return new CPDF_LinkExtract(); -} +} // namespace #define TEXT_BLANK_CHAR L' ' #define TEXT_LINEFEED_CHAR L'\n' @@ -932,10 +963,10 @@ void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar, info.m_Index = m_TextBuf.GetLength(); if (wChar >= 0xFB00 && wChar <= 0xFB06) { FX_WCHAR* pDst = NULL; - FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); + FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); if (nCount >= 1) { pDst = FX_Alloc(FX_WCHAR, nCount); - FX_Unicode_GetNormalization(wChar, pDst); + Unicode_GetNormalization(wChar, pDst); for (int nIndex = 0; nIndex < nCount; nIndex++) { PAGECHAR_INFO info2 = info; info2.m_Unicode = pDst[nIndex]; @@ -960,10 +991,10 @@ void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar, info.m_Index = m_TextBuf.GetLength(); wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); FX_WCHAR* pDst = NULL; - FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); + FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); if (nCount >= 1) { pDst = FX_Alloc(FX_WCHAR, nCount); - FX_Unicode_GetNormalization(wChar, pDst); + Unicode_GetNormalization(wChar, pDst); for (int nIndex = 0; nIndex < nCount; nIndex++) { PAGECHAR_INFO info2 = info; info2.m_Unicode = pDst[nIndex]; @@ -1377,7 +1408,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { m_pPreTextObj = pTextObj; m_perMatrix.Copy(formMatrix); int nItems = pTextObj->CountItems(); - FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); + FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix); const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); const FX_BOOL bIsBidiAndMirrorInverse = @@ -1430,7 +1461,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; - threshold = _NormalizeThreshold(threshold); + threshold = NormalizeThreshold(threshold); threshold = fontsize_h * threshold / 1000; } if (threshold && (spacing && spacing >= threshold)) { @@ -1898,7 +1929,7 @@ FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) { return TRUE; } -CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) +CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage), m_flags(0), m_findNextStart(-1), @@ -2054,8 +2085,8 @@ FX_BOOL CPDF_TextPageFind::FindNext() { CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); if (nStartPos == nResultPos && - !(_IsIgnoreSpaceCharacter(lastChar) || - _IsIgnoreSpaceCharacter(curChar))) { + !(IsIgnoreSpaceCharacter(lastChar) || + IsIgnoreSpaceCharacter(curChar))) { bMatch = FALSE; } for (int d = PreResEndPos; d < nResultPos; d++) { @@ -2174,7 +2205,7 @@ void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { while (pos < csWord.GetLength()) { CFX_WideString curStr = csWord.Mid(pos, 1); FX_WCHAR curChar = csWord.GetAt(pos); - if (_IsIgnoreSpaceCharacter(curChar)) { + if (IsIgnoreSpaceCharacter(curChar)) { if (pos > 0 && curChar == 0x2019) { pos++; continue; @@ -2306,7 +2337,7 @@ CPDF_LinkExtract::~CPDF_LinkExtract() { DeleteLinkList(); } -FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) { +FX_BOOL CPDF_LinkExtract::ExtractLinks(const CPDF_TextPage* pTextPage) { if (!pTextPage || !pTextPage->IsParsed()) return FALSE; diff --git a/core/fpdftext/fpdf_text_int.h b/core/fpdftext/fpdf_text_int.h deleted file mode 100644 index 7acab55ccb..0000000000 --- a/core/fpdftext/fpdf_text_int.h +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FPDFTEXT_FPDF_TEXT_INT_H_ -#define CORE_FPDFTEXT_FPDF_TEXT_INT_H_ - -#include <deque> -#include <vector> - -#include "core/fpdfapi/fpdf_page/cpdf_pageobjectlist.h" -#include "core/fpdfapi/fpdf_page/include/cpdf_form.h" -#include "core/fpdfapi/fpdf_page/include/cpdf_page.h" -#include "core/fpdftext/include/ipdf_linkextract.h" -#include "core/fpdftext/include/ipdf_textpage.h" -#include "core/fpdftext/include/ipdf_textpagefind.h" -#include "core/fxcrt/include/fx_basic.h" - -class CFX_BidiChar; -class CPDF_FormObject; -class CPDF_LinkExtract; -class CPDF_TextPageFind; -class CPDF_Font; - -#define FPDFTEXT_CHAR_ERROR -1 -#define FPDFTEXT_CHAR_NORMAL 0 -#define FPDFTEXT_CHAR_GENERATED 1 -#define FPDFTEXT_CHAR_UNUNICODE 2 -#define FPDFTEXT_CHAR_HYPHEN 3 -#define FPDFTEXT_CHAR_PIECE 4 -#define FPDFTEXT_MC_PASS 0 -#define FPDFTEXT_MC_DONE 1 -#define FPDFTEXT_MC_DELAY 2 - -struct PAGECHAR_INFO { - int m_CharCode; - FX_WCHAR m_Unicode; - FX_FLOAT m_OriginX; - FX_FLOAT m_OriginY; - int32_t m_Flag; - CFX_FloatRect m_CharBox; - CPDF_TextObject* m_pTextObj; - CFX_Matrix m_Matrix; - int m_Index; -}; - -struct FPDF_SEGMENT { - int m_Start; - int m_nCount; -}; - -struct PDFTEXT_Obj { - CPDF_TextObject* m_pTextObj; - CFX_Matrix m_formMatrix; -}; - -class CPDF_TextPage : public IPDF_TextPage { - public: - CPDF_TextPage(const CPDF_Page* pPage, int flags); - ~CPDF_TextPage() override {} - - // IPDF_TextPage: - void ParseTextPage() override; - bool IsParsed() const override { return m_bIsParsed; } - int CharIndexFromTextIndex(int TextIndex) const override; - int TextIndexFromCharIndex(int CharIndex) const override; - int CountChars() const override; - void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override; - void GetRectArray(int start, - int nCount, - CFX_RectArray& rectArray) const override; - int GetIndexAtPos(CFX_FloatPoint point, - FX_FLOAT xTolerance, - FX_FLOAT yTolerance) const override; - int GetIndexAtPos(FX_FLOAT x, - FX_FLOAT y, - FX_FLOAT xTolerance, - FX_FLOAT yTolerance) const override; - CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override; - void GetRectsArrayByRect(const CFX_FloatRect& rect, - CFX_RectArray& resRectArray) const override; - CFX_WideString GetPageText(int start = 0, int nCount = -1) const override; - int CountRects(int start, int nCount) override; - void GetRect(int rectIndex, - FX_FLOAT& left, - FX_FLOAT& top, - FX_FLOAT& right, - FX_FLOAT& bottom) const override; - FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override; - FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override; - int CountBoundedSegments(FX_FLOAT left, - FX_FLOAT top, - FX_FLOAT right, - FX_FLOAT bottom, - FX_BOOL bContains = FALSE) override; - void GetBoundedSegment(int index, int& start, int& count) const override; - int GetWordBreak(int index, int direction) const override; - - static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, - const CFX_FloatRect& rect2); - static FX_BOOL IsLetter(FX_WCHAR unicode); - - private: - FX_BOOL IsHyphen(FX_WCHAR curChar); - bool IsControlChar(const PAGECHAR_INFO& charInfo); - FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); - void ProcessObject(); - void ProcessFormObject(CPDF_FormObject* pFormObj, - const CFX_Matrix& formMatrix); - void ProcessTextObject(PDFTEXT_Obj pObj); - void ProcessTextObject(CPDF_TextObject* pTextObj, - const CFX_Matrix& formMatrix, - const CPDF_PageObjectList* pObjList, - CPDF_PageObjectList::const_iterator ObjPos); - int ProcessInsertObject(const CPDF_TextObject* pObj, - const CFX_Matrix& formMatrix); - FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); - FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, - const CPDF_PageObjectList* pObjList, - CPDF_PageObjectList::const_iterator ObjPos); - FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, - CPDF_TextObject* pTextObj2); - int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const; - void CloseTempLine(); - void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str); - int32_t PreMarkedContent(PDFTEXT_Obj pObj); - void ProcessMarkedContent(PDFTEXT_Obj pObj); - void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; - void FindPreviousTextObject(void); - void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info); - void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info); - int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); - int32_t FindTextlineFlowDirection(); - - void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); - FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, - const CPDF_Font* pFont, - int nItems) const; - - const CPDF_Page* const m_pPage; - std::vector<uint16_t> m_CharIndex; - std::deque<PAGECHAR_INFO> m_CharList; - std::deque<PAGECHAR_INFO> m_TempCharList; - CFX_WideTextBuf m_TextBuf; - CFX_WideTextBuf m_TempTextBuf; - const int m_parserflag; - CPDF_TextObject* m_pPreTextObj; - CFX_Matrix m_perMatrix; - bool m_bIsParsed; - CFX_Matrix m_DisplayMatrix; - CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments; - CFX_RectArray m_SelRects; - CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj; - int32_t m_TextlineDir; - CFX_FloatRect m_CurlineRect; -}; - -class CPDF_TextPageFind : public IPDF_TextPageFind { - public: - explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage); - ~CPDF_TextPageFind() override {} - - // IPDF_TextPageFind - FX_BOOL FindFirst(const CFX_WideString& findwhat, - int flags, - int startPos = 0) override; - FX_BOOL FindNext() override; - FX_BOOL FindPrev() override; - void GetRectArray(CFX_RectArray& rects) const override; - int GetCurOrder() const override; - int GetMatchedCount() const override; - - protected: - void ExtractFindWhat(const CFX_WideString& findwhat); - FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, - int startPos, - int endPos); - FX_BOOL ExtractSubString(CFX_WideString& rString, - const FX_WCHAR* lpszFullString, - int iSubString, - FX_WCHAR chSep); - CFX_WideString MakeReverse(const CFX_WideString& str); - int ReverseFind(const CFX_WideString& csPageText, - const CFX_WideString& csWord, - int nStartPos, - int& WordLength); - int GetCharIndex(int index) const; - - private: - std::vector<uint16_t> m_CharIndex; - const IPDF_TextPage* m_pTextPage; - CFX_WideString m_strText; - CFX_WideString m_findWhat; - int m_flags; - std::vector<CFX_WideString> m_csFindWhatArray; - int m_findNextStart; - int m_findPreStart; - FX_BOOL m_bMatchCase; - FX_BOOL m_bMatchWholeWord; - int m_resStart; - int m_resEnd; - CFX_RectArray m_resArray; - FX_BOOL m_IsFind; -}; - -class CPDF_LinkExt { - public: - CPDF_LinkExt() {} - int m_Start; - int m_Count; - CFX_WideString m_strUrl; - virtual ~CPDF_LinkExt() {} -}; - -typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; - -class CPDF_LinkExtract : public IPDF_LinkExtract { - public: - CPDF_LinkExtract(); - ~CPDF_LinkExtract() override; - - // IPDF_LinkExtract - FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override; - int CountLinks() const override; - CFX_WideString GetURL(int index) const override; - void GetBoundedSegment(int index, int& start, int& count) const override; - void GetRects(int index, CFX_RectArray& rects) const override; - - FX_BOOL IsExtract() const { return m_bIsParsed; } - - protected: - void ParseLink(); - void DeleteLinkList(); - FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); - bool CheckMailLink(CFX_WideString& str); - void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); - - private: - LINK_InfoArray m_LinkList; - const CPDF_TextPage* m_pTextPage; - CFX_WideString m_strPageText; - bool m_bIsParsed; -}; - -#endif // CORE_FPDFTEXT_FPDF_TEXT_INT_H_ diff --git a/core/fpdftext/fpdf_text_int_unittest.cpp b/core/fpdftext/fpdf_text_int_unittest.cpp index e62e885d4b..e1dd0f7504 100644 --- a/core/fpdftext/fpdf_text_int_unittest.cpp +++ b/core/fpdftext/fpdf_text_int_unittest.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "core/fpdftext/fpdf_text_int.h" +#include "core/fpdftext/include/cpdf_linkextract.h" #include "testing/gtest/include/gtest/gtest.h" diff --git a/core/fpdftext/include/cpdf_linkextract.h b/core/fpdftext/include/cpdf_linkextract.h new file mode 100644 index 0000000000..263768ee5d --- /dev/null +++ b/core/fpdftext/include/cpdf_linkextract.h @@ -0,0 +1,54 @@ +// Copyright 2014 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_ +#define CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_ + +#include "core/fxcrt/include/fx_basic.h" +#include "core/fxcrt/include/fx_coordinates.h" +#include "core/fxcrt/include/fx_string.h" +#include "core/fxcrt/include/fx_system.h" + +class CPDF_TextPage; + +class CPDF_LinkExt { + public: + CPDF_LinkExt() {} + ~CPDF_LinkExt() {} + + int m_Start; + int m_Count; + CFX_WideString m_strUrl; +}; + +class CPDF_LinkExtract { + public: + CPDF_LinkExtract(); + ~CPDF_LinkExtract(); + + FX_BOOL ExtractLinks(const CPDF_TextPage* pTextPage); + int CountLinks() const; + CFX_WideString GetURL(int index) const; + void GetBoundedSegment(int index, int& start, int& count) const; + void GetRects(int index, CFX_RectArray& rects) const; + + FX_BOOL IsExtract() const { return m_bIsParsed; } + + protected: + void ParseLink(); + void DeleteLinkList(); + FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); + bool CheckMailLink(CFX_WideString& str); + void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); + + private: + CFX_ArrayTemplate<CPDF_LinkExt*> m_LinkList; + const CPDF_TextPage* m_pTextPage; + CFX_WideString m_strPageText; + bool m_bIsParsed; +}; + +#endif // CORE_FPDFTEXT_INCLUDE_CPDF_LINKEXTRACT_H_ diff --git a/core/fpdftext/include/cpdf_textpage.h b/core/fpdftext/include/cpdf_textpage.h new file mode 100644 index 0000000000..19e8791b5a --- /dev/null +++ b/core/fpdftext/include/cpdf_textpage.h @@ -0,0 +1,157 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_ +#define CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_ + +#include <deque> +#include <vector> + +#include "core/fpdfapi/fpdf_page/cpdf_pageobjectlist.h" +#include "core/fxcrt/include/fx_basic.h" +#include "core/fxcrt/include/fx_coordinates.h" +#include "core/fxcrt/include/fx_string.h" + +class CFX_BidiChar; +class CPDF_Font; +class CPDF_FormObject; +class CPDF_Page; +class CPDF_TextObject; + +struct FPDF_CHAR_INFO { + FX_WCHAR m_Unicode; + FX_WCHAR m_Charcode; + int32_t m_Flag; + FX_FLOAT m_FontSize; + FX_FLOAT m_OriginX; + FX_FLOAT m_OriginY; + CFX_FloatRect m_CharBox; + CPDF_TextObject* m_pTextObj; + CFX_Matrix m_Matrix; +}; + +struct FPDF_SEGMENT { + int m_Start; + int m_nCount; +}; + +struct PAGECHAR_INFO { + int m_CharCode; + FX_WCHAR m_Unicode; + FX_FLOAT m_OriginX; + FX_FLOAT m_OriginY; + int32_t m_Flag; + CFX_FloatRect m_CharBox; + CPDF_TextObject* m_pTextObj; + CFX_Matrix m_Matrix; + int m_Index; +}; + +struct PDFTEXT_Obj { + CPDF_TextObject* m_pTextObj; + CFX_Matrix m_formMatrix; +}; + +class CPDF_TextPage { + public: + CPDF_TextPage(const CPDF_Page* pPage, int flags); + ~CPDF_TextPage() {} + + // IPDF_TextPage: + void ParseTextPage(); + bool IsParsed() const { return m_bIsParsed; } + int CharIndexFromTextIndex(int TextIndex) const; + int TextIndexFromCharIndex(int CharIndex) const; + int CountChars() const; + void GetCharInfo(int index, FPDF_CHAR_INFO* info) const; + void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const; + int GetIndexAtPos(CFX_FloatPoint point, + FX_FLOAT xTolerance, + FX_FLOAT yTolerance) const; + int GetIndexAtPos(FX_FLOAT x, + FX_FLOAT y, + FX_FLOAT xTolerance, + FX_FLOAT yTolerance) const; + CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const; + void GetRectsArrayByRect(const CFX_FloatRect& rect, + CFX_RectArray& resRectArray) const; + CFX_WideString GetPageText(int start = 0, int nCount = -1) const; + int CountRects(int start, int nCount); + void GetRect(int rectIndex, + FX_FLOAT& left, + FX_FLOAT& top, + FX_FLOAT& right, + FX_FLOAT& bottom) const; + FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); + FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate); + int CountBoundedSegments(FX_FLOAT left, + FX_FLOAT top, + FX_FLOAT right, + FX_FLOAT bottom, + FX_BOOL bContains = FALSE); + void GetBoundedSegment(int index, int& start, int& count) const; + int GetWordBreak(int index, int direction) const; + + static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, + const CFX_FloatRect& rect2); + static FX_BOOL IsLetter(FX_WCHAR unicode); + + private: + FX_BOOL IsHyphen(FX_WCHAR curChar); + bool IsControlChar(const PAGECHAR_INFO& charInfo); + FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); + void ProcessObject(); + void ProcessFormObject(CPDF_FormObject* pFormObj, + const CFX_Matrix& formMatrix); + void ProcessTextObject(PDFTEXT_Obj pObj); + void ProcessTextObject(CPDF_TextObject* pTextObj, + const CFX_Matrix& formMatrix, + const CPDF_PageObjectList* pObjList, + CPDF_PageObjectList::const_iterator ObjPos); + int ProcessInsertObject(const CPDF_TextObject* pObj, + const CFX_Matrix& formMatrix); + FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); + FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, + const CPDF_PageObjectList* pObjList, + CPDF_PageObjectList::const_iterator ObjPos); + FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, + CPDF_TextObject* pTextObj2); + int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const; + void CloseTempLine(); + void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str); + int32_t PreMarkedContent(PDFTEXT_Obj pObj); + void ProcessMarkedContent(PDFTEXT_Obj pObj); + void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; + void FindPreviousTextObject(void); + void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info); + void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info); + int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); + int32_t FindTextlineFlowDirection(); + + void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); + FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, + const CPDF_Font* pFont, + int nItems) const; + + const CPDF_Page* const m_pPage; + std::vector<uint16_t> m_CharIndex; + std::deque<PAGECHAR_INFO> m_CharList; + std::deque<PAGECHAR_INFO> m_TempCharList; + CFX_WideTextBuf m_TextBuf; + CFX_WideTextBuf m_TempTextBuf; + const int m_parserflag; + CPDF_TextObject* m_pPreTextObj; + CFX_Matrix m_perMatrix; + bool m_bIsParsed; + CFX_Matrix m_DisplayMatrix; + CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments; + CFX_RectArray m_SelRects; + CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj; + int32_t m_TextlineDir; + CFX_FloatRect m_CurlineRect; +}; + +#endif // CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGE_H_ diff --git a/core/fpdftext/include/cpdf_textpagefind.h b/core/fpdftext/include/cpdf_textpagefind.h new file mode 100644 index 0000000000..ec739e4896 --- /dev/null +++ b/core/fpdftext/include/cpdf_textpagefind.h @@ -0,0 +1,65 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_ +#define CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_ + +#include <vector> + +#include "core/fxcrt/include/fx_coordinates.h" +#include "core/fxcrt/include/fx_string.h" +#include "core/fxcrt/include/fx_system.h" + +class CPDF_TextPage; + +class CPDF_TextPageFind { + public: + explicit CPDF_TextPageFind(const CPDF_TextPage* pTextPage); + ~CPDF_TextPageFind() {} + + FX_BOOL FindFirst(const CFX_WideString& findwhat, + int flags, + int startPos = 0); + FX_BOOL FindNext(); + FX_BOOL FindPrev(); + void GetRectArray(CFX_RectArray& rects) const; + int GetCurOrder() const; + int GetMatchedCount() const; + + protected: + void ExtractFindWhat(const CFX_WideString& findwhat); + FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, + int startPos, + int endPos); + FX_BOOL ExtractSubString(CFX_WideString& rString, + const FX_WCHAR* lpszFullString, + int iSubString, + FX_WCHAR chSep); + CFX_WideString MakeReverse(const CFX_WideString& str); + int ReverseFind(const CFX_WideString& csPageText, + const CFX_WideString& csWord, + int nStartPos, + int& WordLength); + int GetCharIndex(int index) const; + + private: + std::vector<uint16_t> m_CharIndex; + const CPDF_TextPage* m_pTextPage; + CFX_WideString m_strText; + CFX_WideString m_findWhat; + int m_flags; + std::vector<CFX_WideString> m_csFindWhatArray; + int m_findNextStart; + int m_findPreStart; + FX_BOOL m_bMatchCase; + FX_BOOL m_bMatchWholeWord; + int m_resStart; + int m_resEnd; + CFX_RectArray m_resArray; + FX_BOOL m_IsFind; +}; + +#endif // CORE_FPDFTEXT_INCLUDE_CPDF_TEXTPAGEFIND_H_ diff --git a/core/fpdftext/include/ipdf_linkextract.h b/core/fpdftext/include/ipdf_linkextract.h deleted file mode 100644 index c1a5f2f04e..0000000000 --- a/core/fpdftext/include/ipdf_linkextract.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2016 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_ -#define CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_ - -#include "core/fpdftext/include/ipdf_textpage.h" -#include "core/fxcrt/include/fx_coordinates.h" -#include "core/fxcrt/include/fx_system.h" - -class IPDF_LinkExtract { - public: - static IPDF_LinkExtract* CreateLinkExtract(); - virtual ~IPDF_LinkExtract() {} - - virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; - virtual int CountLinks() const = 0; - virtual CFX_WideString GetURL(int index) const = 0; - virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; - virtual void GetRects(int index, CFX_RectArray& rects) const = 0; -}; - -#endif // CORE_FPDFTEXT_INCLUDE_IPDF_LINKEXTRACT_H_ diff --git a/core/fpdftext/include/ipdf_textpage.h b/core/fpdftext/include/ipdf_textpage.h deleted file mode 100644 index 3849cd4004..0000000000 --- a/core/fpdftext/include/ipdf_textpage.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2016 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_ -#define CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_ - -#include "core/fxcrt/include/fx_coordinates.h" -#include "core/fxcrt/include/fx_system.h" - -class CPDF_TextObject; -class CPDF_Page; - -struct FPDF_CHAR_INFO { - FX_WCHAR m_Unicode; - FX_WCHAR m_Charcode; - int32_t m_Flag; - FX_FLOAT m_FontSize; - FX_FLOAT m_OriginX; - FX_FLOAT m_OriginY; - CFX_FloatRect m_CharBox; - CPDF_TextObject* m_pTextObj; - CFX_Matrix m_Matrix; -}; - -class IPDF_TextPage { - public: - static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); - virtual ~IPDF_TextPage() {} - - virtual void ParseTextPage() = 0; - virtual bool IsParsed() const = 0; - virtual int CharIndexFromTextIndex(int TextIndex) const = 0; - virtual int TextIndexFromCharIndex(int CharIndex) const = 0; - virtual int CountChars() const = 0; - virtual void GetCharInfo(int index, FPDF_CHAR_INFO* info) const = 0; - virtual void GetRectArray(int start, - int nCount, - CFX_RectArray& rectArray) const = 0; - virtual int GetIndexAtPos(CFX_FloatPoint point, - FX_FLOAT xTolerance, - FX_FLOAT yTolerance) const = 0; - virtual int GetIndexAtPos(FX_FLOAT x, - FX_FLOAT y, - FX_FLOAT xTolerance, - FX_FLOAT yTolerance) const = 0; - virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0; - virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, - CFX_RectArray& resRectArray) const = 0; - virtual int CountRects(int start, int nCount) = 0; - virtual void GetRect(int rectIndex, - FX_FLOAT& left, - FX_FLOAT& top, - FX_FLOAT& right, - FX_FLOAT& bottom) const = 0; - virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; - virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0; - virtual int CountBoundedSegments(FX_FLOAT left, - FX_FLOAT top, - FX_FLOAT right, - FX_FLOAT bottom, - FX_BOOL bContains = FALSE) = 0; - virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; - virtual int GetWordBreak(int index, int direction) const = 0; - virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0; -}; - -#endif // CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGE_H_ diff --git a/core/fpdftext/include/ipdf_textpagefind.h b/core/fpdftext/include/ipdf_textpagefind.h deleted file mode 100644 index b13432b59b..0000000000 --- a/core/fpdftext/include/ipdf_textpagefind.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2016 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_ -#define CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_ - -#include "core/fpdftext/include/ipdf_textpage.h" -#include "core/fxcrt/include/fx_coordinates.h" -#include "core/fxcrt/include/fx_string.h" - -class IPDF_TextPageFind { - public: - static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); - virtual ~IPDF_TextPageFind() {} - - virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, - int flags, - int startPos = 0) = 0; - virtual FX_BOOL FindNext() = 0; - virtual FX_BOOL FindPrev() = 0; - virtual void GetRectArray(CFX_RectArray& rects) const = 0; - virtual int GetCurOrder() const = 0; - virtual int GetMatchedCount() const = 0; -}; - -#endif // CORE_FPDFTEXT_INCLUDE_IPDF_TEXTPAGEFIND_H_ diff --git a/core/fpdftext/unicodenormalization.cpp b/core/fpdftext/unicodenormalization.cpp deleted file mode 100644 index 67ab57cb5d..0000000000 --- a/core/fpdftext/unicodenormalization.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#include "core/fpdftext/unicodenormalization.h" - -#include "core/fpdftext/unicodenormalizationdata.h" -#include "core/fxcrt/include/fx_string.h" - -namespace { - -const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { - nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, - g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; - -} // namespace - -FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) { - wch = wch & 0xFFFF; - FX_WCHAR wFind = g_UnicodeData_Normalization[wch]; - if (!wFind) { - if (pDst) { - *pDst = wch; - } - return 1; - } - if (wFind >= 0x8000) { - wch = wFind - 0x8000; - wFind = 1; - } else { - wch = wFind & 0x0FFF; - wFind >>= 12; - } - const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind]; - if (pMap == g_UnicodeData_Normalization_Map4) { - pMap = g_UnicodeData_Normalization_Map4 + wch; - wFind = (FX_WCHAR)(*pMap++); - } else { - pMap += wch; - } - if (pDst) { - FX_WCHAR n = wFind; - while (n--) { - *pDst++ = *pMap++; - } - } - return (FX_STRSIZE)wFind; -} diff --git a/core/fpdftext/unicodenormalization.h b/core/fpdftext/unicodenormalization.h deleted file mode 100644 index ee3c8b2024..0000000000 --- a/core/fpdftext/unicodenormalization.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2014 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FPDFTEXT_UNICODENORMALIZATION_H_ -#define CORE_FPDFTEXT_UNICODENORMALIZATION_H_ - -#include "core/fxcrt/include/fx_system.h" - -FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); - -#endif // CORE_FPDFTEXT_UNICODENORMALIZATION_H_ |