// Copyright 2014 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #ifndef _PDF_TEXT_INT_H_ #define _PDF_TEXT_INT_H_ class CPDF_TextParseOptions : public CFX_Object { public: CPDF_TextParseOptions(); FX_BOOL m_bCheckObjectOrder; FX_BOOL m_bCheckDirection; int m_nCheckSameObject; }; class CPDF_TextPage; class CPDF_LinkExtract; class CPDF_TextPageFind; class CPDF_DocProgressiveSearch; #define FPDFTEXT_CHAR_ERROR -1 #define FPDFTEXT_CHAR_NORMAL 0 #define FPDFTEXT_CHAR_GENERATED 1 #define FPDFTEXT_CHAR_UNUNICODE 2 #define FPDFTEXT_CHAR_HYPHEN 3 #define FPDFTEXT_CHAR_PIECE 4 #define FPDFTEXT_MC_PASS 0 #define FPDFTEXT_MC_DONE 1 #define FPDFTEXT_MC_DELAY 2 typedef struct _PAGECHAR_INFO: public CFX_Object { int m_CharCode; FX_WCHAR m_Unicode; FX_FLOAT m_OriginX; FX_FLOAT m_OriginY; FX_INT32 m_Flag; CFX_FloatRect m_CharBox; CPDF_TextObject* m_pTextObj; CFX_AffineMatrix m_Matrix; int m_Index; } PAGECHAR_INFO; typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; typedef struct { int m_Start; int m_nCount; } FPDF_SEGMENT; typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; typedef struct { CPDF_TextObject* m_pTextObj; CFX_AffineMatrix m_formMatrix; } PDFTEXT_Obj; typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; class CPDF_TextPage: public IPDF_TextPage { public: CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); virtual FX_BOOL ParseTextPage(); virtual void NormalizeObjects(FX_BOOL bNormalize); virtual FX_BOOL IsParsered() const { return m_IsParsered; } virtual ~CPDF_TextPage() {}; public: virtual int CharIndexFromTextIndex(int TextIndex)const ; virtual int TextIndexFromCharIndex(int CharIndex)const; virtual int CountChars() const; virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const; virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const; virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; virtual CFX_WideString GetTextByRect(CFX_FloatRect rect) const; virtual void GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const; virtual int GetOrderByDirection(int order, int direction) const; virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const; virtual int CountRects(int start, int nCount); virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top , FX_FLOAT& right, FX_FLOAT &bottom) const; virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); virtual FX_BOOL GetBaselineRotate(CFX_FloatRect rect, int& Rotate); virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE); virtual void GetBoundedSegment(int index, int& start, int& count) const; virtual int GetWordBreak(int index, int direction) const; public: const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; } static FX_BOOL IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2); static FX_BOOL IsLetter(FX_WCHAR unicode); private: FX_BOOL IsHyphen(FX_WCHAR curChar); FX_BOOL IsControlChar(PAGECHAR_INFO* pCharInfo); FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); void ProcessObject(); void ProcessFormObject(CPDF_FormObject* pFormObj, CFX_AffineMatrix formMatrix); void ProcessTextObject(PDFTEXT_Obj pObj); void ProcessTextObject(CPDF_TextObject* pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos); int ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix); FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; void CloseTempLine(); void OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str); FX_INT32 PreMarkedContent(PDFTEXT_Obj pObj); void ProcessMarkedContent(PDFTEXT_Obj pObj); void CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const; void FindPreviousTextObject(void); void AddCharInfoByLRDirection(CFX_WideString& str, int i); void AddCharInfoByRLDirection(CFX_WideString& str, int i); FX_INT32 GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); FX_INT32 FindTextlineFlowDirection(); protected: CPDFText_ParseOptions m_ParseOptions; CFX_WordArray m_CharIndex; const CPDF_PageObjects* m_pPage; PAGECHAR_InfoArray m_charList; CFX_WideTextBuf m_TextBuf; PAGECHAR_InfoArray m_TempCharList; CFX_WideTextBuf m_TempTextBuf; int m_parserflag; CPDF_TextObject* m_pPreTextObj; CFX_AffineMatrix m_perMatrix; FX_BOOL m_IsParsered; CFX_AffineMatrix m_DisplayMatrix; SEGMENT_Array m_Segment; CFX_RectArray m_SelRects; LINEOBJ m_LineObj; FX_BOOL m_TextlineDir; CFX_FloatRect m_CurlineRect; }; class CPDF_TextPageFind: public IPDF_TextPageFind { public: CPDF_TextPageFind(const IPDF_TextPage* pTextPage); virtual ~CPDF_TextPageFind() {}; public: virtual FX_BOOL FindFirst(CFX_WideString findwhat, int flags, int startPos = 0); virtual FX_BOOL FindNext(); virtual FX_BOOL FindPrev(); virtual void GetRectArray(CFX_RectArray& rects) const; virtual int GetCurOrder() const; virtual int GetMatchedCount()const; protected: void ExtractFindWhat(CFX_WideString findwhat); FX_BOOL IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos); FX_BOOL ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString, int iSubString, FX_WCHAR chSep); CFX_WideString MakeReverse(const CFX_WideString str); int ReverseFind(CFX_WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength); int GetCharIndex(int index) const; private: CFX_WordArray m_CharIndex; const IPDF_TextPage* m_pTextPage; CFX_WideString m_strText; CFX_WideString m_findWhat; int m_flags; CFX_WideStringArray m_csFindWhatArray; int m_findNextStart; int m_findPreStart; FX_BOOL m_bMatchCase; FX_BOOL m_bMatchWholeWord; int m_resStart; int m_resEnd; CFX_RectArray m_resArray; FX_BOOL m_IsFind; }; class CPDF_LinkExt: public CFX_Object { public: CPDF_LinkExt() {}; int m_Start; int m_Count; CFX_WideString m_strUrl; virtual ~CPDF_LinkExt() {}; }; typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; class CPDF_LinkExtract: public IPDF_LinkExtract { public: CPDF_LinkExtract(); virtual ~CPDF_LinkExtract(); virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage); virtual FX_BOOL IsExtract() const { return m_IsParserd; } public: virtual int CountLinks() const; virtual CFX_WideString GetURL(int index) const; virtual void GetBoundedSegment(int index, int& start, int& count) const; virtual void GetRects(int index, CFX_RectArray& rects)const; protected: void parserLink(); void DeleteLinkList(); FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); FX_BOOL CheckMailLink(CFX_WideString& str); FX_BOOL AppendToLinkList(int start, int count, CFX_WideString strUrl); private: LINK_InfoArray m_LinkList; const CPDF_TextPage* m_pTextPage; CFX_WideString m_strPageText; FX_BOOL m_IsParserd; }; FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst); void NormalizeString(CFX_WideString& str); void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); #endif