// Copyright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_ #define CORE_SRC_FPDFTEXT_TEXT_INT_H_ #include <deque> #include <vector> #include "core/include/fpdftext/fpdf_text.h" #include "core/include/fxcrt/fx_basic.h" class CFX_BidiChar; class CPDF_DocProgressiveSearch; class CPDF_FormObject; class CPDF_LinkExtract; class CPDF_TextPageFind; #define FPDFTEXT_CHAR_ERROR -1 #define FPDFTEXT_CHAR_NORMAL 0 #define FPDFTEXT_CHAR_GENERATED 1 #define FPDFTEXT_CHAR_UNUNICODE 2 #define FPDFTEXT_CHAR_HYPHEN 3 #define FPDFTEXT_CHAR_PIECE 4 #define FPDFTEXT_MC_PASS 0 #define FPDFTEXT_MC_DONE 1 #define FPDFTEXT_MC_DELAY 2 struct PAGECHAR_INFO { int m_CharCode; FX_WCHAR m_Unicode; FX_FLOAT m_OriginX; FX_FLOAT m_OriginY; int32_t m_Flag; CFX_FloatRect m_CharBox; CPDF_TextObject* m_pTextObj; CFX_Matrix m_Matrix; int m_Index; }; struct FPDF_SEGMENT { int m_Start; int m_nCount; }; struct PDFTEXT_Obj { CPDF_TextObject* m_pTextObj; CFX_Matrix m_formMatrix; }; class CPDF_TextPage : public IPDF_TextPage { public: CPDF_TextPage(const CPDF_Page* pPage, int flags); ~CPDF_TextPage() override {} // IPDF_TextPage: FX_BOOL ParseTextPage() override; bool IsParsed() const override { return m_bIsParsed; } int CharIndexFromTextIndex(int TextIndex) const override; int TextIndexFromCharIndex(int CharIndex) const override; int CountChars() const override; void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override; void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const override; int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTolerance, FX_FLOAT yTolerance) const override; int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTolerance, FX_FLOAT yTolerance) const override; CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override; void GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const override; CFX_WideString GetPageText(int start = 0, int nCount = -1) const override; int CountRects(int start, int nCount) override; void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT& bottom) const override; FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override; FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override; int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) override; void GetBoundedSegment(int index, int& start, int& count) const override; int GetWordBreak(int index, int direction) const override; static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2); static FX_BOOL IsLetter(FX_WCHAR unicode); private: FX_BOOL IsHyphen(FX_WCHAR curChar); bool IsControlChar(const PAGECHAR_INFO& charInfo); FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); void ProcessObject(); void ProcessFormObject(CPDF_FormObject* pFormObj, const CFX_Matrix& formMatrix); void ProcessTextObject(PDFTEXT_Obj pObj); void ProcessTextObject(CPDF_TextObject* pTextObj, const CFX_Matrix& formMatrix, FX_POSITION ObjPos); int ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_Matrix& formMatrix); FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; void CloseTempLine(); void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str); int32_t PreMarkedContent(PDFTEXT_Obj pObj); void ProcessMarkedContent(PDFTEXT_Obj pObj); void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; void FindPreviousTextObject(void); void AddCharInfoByLRDirection(CFX_WideString& str, int i); void AddCharInfoByRLDirection(CFX_WideString& str, int i); int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); int32_t FindTextlineFlowDirection(); void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, const CPDF_Font* pFont, int nItems) const; std::vector<FX_WORD> m_CharIndex; const CPDF_PageObjectList* const m_pPage; std::deque<PAGECHAR_INFO> m_CharList; std::deque<PAGECHAR_INFO> m_TempCharList; CFX_WideTextBuf m_TextBuf; CFX_WideTextBuf m_TempTextBuf; const int m_parserflag; CPDF_TextObject* m_pPreTextObj; CFX_Matrix m_perMatrix; bool m_bIsParsed; CFX_Matrix m_DisplayMatrix; CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments; CFX_RectArray m_SelRects; CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj; int32_t m_TextlineDir; CFX_FloatRect m_CurlineRect; }; class CPDF_TextPageFind : public IPDF_TextPageFind { public: explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage); ~CPDF_TextPageFind() override {} // IPDF_TextPageFind FX_BOOL FindFirst(const CFX_WideString& findwhat, int flags, int startPos = 0) override; FX_BOOL FindNext() override; FX_BOOL FindPrev() override; void GetRectArray(CFX_RectArray& rects) const override; int GetCurOrder() const override; int GetMatchedCount() const override; protected: void ExtractFindWhat(const CFX_WideString& findwhat); FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos); FX_BOOL ExtractSubString(CFX_WideString& rString, const FX_WCHAR* lpszFullString, int iSubString, FX_WCHAR chSep); CFX_WideString MakeReverse(const CFX_WideString& str); int ReverseFind(const CFX_WideString& csPageText, const CFX_WideString& csWord, int nStartPos, int& WordLength); int GetCharIndex(int index) const; private: std::vector<FX_WORD> m_CharIndex; const IPDF_TextPage* m_pTextPage; CFX_WideString m_strText; CFX_WideString m_findWhat; int m_flags; CFX_WideStringArray m_csFindWhatArray; int m_findNextStart; int m_findPreStart; FX_BOOL m_bMatchCase; FX_BOOL m_bMatchWholeWord; int m_resStart; int m_resEnd; CFX_RectArray m_resArray; FX_BOOL m_IsFind; }; class CPDF_LinkExt { public: CPDF_LinkExt() {} int m_Start; int m_Count; CFX_WideString m_strUrl; virtual ~CPDF_LinkExt() {} }; typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; class CPDF_LinkExtract : public IPDF_LinkExtract { public: CPDF_LinkExtract(); ~CPDF_LinkExtract() override; // IPDF_LinkExtract FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override; int CountLinks() const override; CFX_WideString GetURL(int index) const override; void GetBoundedSegment(int index, int& start, int& count) const override; void GetRects(int index, CFX_RectArray& rects) const override; FX_BOOL IsExtract() const { return m_bIsParsed; } protected: void ParseLink(); void DeleteLinkList(); FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); bool CheckMailLink(CFX_WideString& str); void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); private: LINK_InfoArray m_LinkList; const CPDF_TextPage* m_pTextPage; CFX_WideString m_strPageText; bool m_bIsParsed; }; FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); void NormalizeString(CFX_WideString& str); void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); void GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjectList* pPage, FX_BOOL bUseLF); #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_