// Copyright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_ #define CORE_SRC_FPDFTEXT_TEXT_INT_H_ class CPDF_TextParseOptions { public: CPDF_TextParseOptions(); FX_BOOL m_bCheckObjectOrder; FX_BOOL m_bCheckDirection; int m_nCheckSameObject; }; class CPDF_TextPage; class CPDF_LinkExtract; class CPDF_TextPageFind; class CPDF_DocProgressiveSearch; #define FPDFTEXT_CHAR_ERROR -1 #define FPDFTEXT_CHAR_NORMAL 0 #define FPDFTEXT_CHAR_GENERATED 1 #define FPDFTEXT_CHAR_UNUNICODE 2 #define FPDFTEXT_CHAR_HYPHEN 3 #define FPDFTEXT_CHAR_PIECE 4 #define FPDFTEXT_MC_PASS 0 #define FPDFTEXT_MC_DONE 1 #define FPDFTEXT_MC_DELAY 2 typedef struct _PAGECHAR_INFO { int m_CharCode; FX_WCHAR m_Unicode; FX_FLOAT m_OriginX; FX_FLOAT m_OriginY; int32_t m_Flag; CFX_FloatRect m_CharBox; CPDF_TextObject* m_pTextObj; CFX_AffineMatrix m_Matrix; int m_Index; } PAGECHAR_INFO; typedef CFX_SegmentedArray PAGECHAR_InfoArray; typedef struct { int m_Start; int m_nCount; } FPDF_SEGMENT; typedef CFX_ArrayTemplate SEGMENT_Array; typedef struct { CPDF_TextObject* m_pTextObj; CFX_AffineMatrix m_formMatrix; } PDFTEXT_Obj; typedef CFX_ArrayTemplate LINEOBJ; class CPDF_TextPage: public IPDF_TextPage { public: CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); virtual FX_BOOL ParseTextPage(); virtual void NormalizeObjects(FX_BOOL bNormalize); virtual FX_BOOL IsParsered() const { return m_IsParsered; } virtual ~CPDF_TextPage() {}; public: virtual int CharIndexFromTextIndex(int TextIndex)const ; virtual int TextIndexFromCharIndex(int CharIndex)const; virtual int CountChars() const; virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const; virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const; virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const; virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const; virtual int GetOrderByDirection(int order, int direction) const; virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const; virtual int CountRects(int start, int nCount); virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top , FX_FLOAT& right, FX_FLOAT &bottom) const; virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate); virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE); virtual void GetBoundedSegment(int index, int& start, int& count) const; virtual int GetWordBreak(int index, int direction) const; public: const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; } static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2); static FX_BOOL IsLetter(FX_WCHAR unicode); private: FX_BOOL IsHyphen(FX_WCHAR curChar); FX_BOOL IsControlChar(PAGECHAR_INFO* pCharInfo); FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); void ProcessObject(); void ProcessFormObject(CPDF_FormObject* pFormObj, const CFX_AffineMatrix& formMatrix); void ProcessTextObject(PDFTEXT_Obj pObj); void ProcessTextObject(CPDF_TextObject* pTextObj, const CFX_AffineMatrix& formMatrix, FX_POSITION ObjPos); int ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_AffineMatrix& formMatrix); FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; void CloseTempLine(); void OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str); int32_t PreMarkedContent(PDFTEXT_Obj pObj); void ProcessMarkedContent(PDFTEXT_Obj pObj); void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; void FindPreviousTextObject(void); void AddCharInfoByLRDirection(CFX_WideString& str, int i); void AddCharInfoByRLDirection(CFX_WideString& str, int i); int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); int32_t FindTextlineFlowDirection(); void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, const CPDF_Font* pFont, int nItems) const; protected: CPDFText_ParseOptions m_ParseOptions; CFX_WordArray m_CharIndex; const CPDF_PageObjects* m_pPage; PAGECHAR_InfoArray m_charList; CFX_WideTextBuf m_TextBuf; PAGECHAR_InfoArray m_TempCharList; CFX_WideTextBuf m_TempTextBuf; int m_parserflag; CPDF_TextObject* m_pPreTextObj; CFX_AffineMatrix m_perMatrix; FX_BOOL m_IsParsered; CFX_AffineMatrix m_DisplayMatrix; SEGMENT_Array m_Segment; CFX_RectArray m_SelRects; LINEOBJ m_LineObj; FX_BOOL m_TextlineDir; CFX_FloatRect m_CurlineRect; }; class CPDF_TextPageFind: public IPDF_TextPageFind { public: CPDF_TextPageFind(const IPDF_TextPage* pTextPage); virtual ~CPDF_TextPageFind() {}; public: virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, int flags, int startPos = 0); virtual FX_BOOL FindNext(); virtual FX_BOOL FindPrev(); virtual void GetRectArray(CFX_RectArray& rects) const; virtual int GetCurOrder() const; virtual int GetMatchedCount()const; protected: void ExtractFindWhat(const CFX_WideString& findwhat); FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos); FX_BOOL ExtractSubString(CFX_WideString& rString, const FX_WCHAR* lpszFullString, int iSubString, FX_WCHAR chSep); CFX_WideString MakeReverse(const CFX_WideString& str); int ReverseFind(const CFX_WideString& csPageText, const CFX_WideString& csWord, int nStartPos, int& WordLength); int GetCharIndex(int index) const; private: CFX_WordArray m_CharIndex; const IPDF_TextPage* m_pTextPage; CFX_WideString m_strText; CFX_WideString m_findWhat; int m_flags; CFX_WideStringArray m_csFindWhatArray; int m_findNextStart; int m_findPreStart; FX_BOOL m_bMatchCase; FX_BOOL m_bMatchWholeWord; int m_resStart; int m_resEnd; CFX_RectArray m_resArray; FX_BOOL m_IsFind; }; class CPDF_LinkExt { public: CPDF_LinkExt() {}; int m_Start; int m_Count; CFX_WideString m_strUrl; virtual ~CPDF_LinkExt() {}; }; typedef CFX_ArrayTemplate LINK_InfoArray; class CPDF_LinkExtract: public IPDF_LinkExtract { public: CPDF_LinkExtract(); virtual ~CPDF_LinkExtract(); virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage); virtual FX_BOOL IsExtract() const { return m_IsParserd; } public: virtual int CountLinks() const; virtual CFX_WideString GetURL(int index) const; virtual void GetBoundedSegment(int index, int& start, int& count) const; virtual void GetRects(int index, CFX_RectArray& rects)const; protected: void parserLink(); void DeleteLinkList(); FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); FX_BOOL CheckMailLink(CFX_WideString& str); FX_BOOL AppendToLinkList(int start, int count, const CFX_WideString& strUrl); private: LINK_InfoArray m_LinkList; const CPDF_TextPage* m_pTextPage; CFX_WideString m_strPageText; FX_BOOL m_IsParserd; }; FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); void NormalizeString(CFX_WideString& str); void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_