diff options
Diffstat (limited to 'core/fpdftext/fpdf_text_int.cpp')
-rw-r--r-- | core/fpdftext/fpdf_text_int.cpp | 103 |
1 files changed, 67 insertions, 36 deletions
diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/fpdf_text_int.cpp index 8e8686c4a1..741331fb77 100644 --- a/core/fpdftext/fpdf_text_int.cpp +++ b/core/fpdftext/fpdf_text_int.cpp @@ -4,8 +4,6 @@ // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com -#include "core/fpdftext/fpdf_text_int.h" - #include <algorithm> #include <cctype> #include <cwctype> @@ -14,15 +12,17 @@ #include <vector> #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" +#include "core/fpdfapi/fpdf_page/include/cpdf_form.h" #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" +#include "core/fpdfapi/fpdf_page/include/cpdf_page.h" #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" -#include "core/fpdftext/include/ipdf_linkextract.h" -#include "core/fpdftext/include/ipdf_textpage.h" -#include "core/fpdftext/include/ipdf_textpagefind.h" -#include "core/fpdftext/unicodenormalization.h" +#include "core/fpdftext/include/cpdf_linkextract.h" +#include "core/fpdftext/include/cpdf_textpage.h" +#include "core/fpdftext/include/cpdf_textpagefind.h" +#include "core/fpdftext/unicodenormalizationdata.h" #include "core/fxcrt/fx_bidi.h" #include "core/fxcrt/include/fx_ext.h" #include "core/fxcrt/include/fx_ucd.h" @@ -36,9 +36,24 @@ #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 #define FPDFTEXT_CONSECUTIVE 0x00000004 +#define FPDFTEXT_CHAR_ERROR -1 +#define FPDFTEXT_CHAR_NORMAL 0 +#define FPDFTEXT_CHAR_GENERATED 1 +#define FPDFTEXT_CHAR_UNUNICODE 2 +#define FPDFTEXT_CHAR_HYPHEN 3 +#define FPDFTEXT_CHAR_PIECE 4 +#define FPDFTEXT_MC_PASS 0 +#define FPDFTEXT_MC_DONE 1 +#define FPDFTEXT_MC_DELAY 2 + namespace { -FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { +const FX_FLOAT kDefaultFontSize = 1.0f; +const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { + nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, + g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; + +FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { if (curChar < 255) { return FALSE; } @@ -55,7 +70,7 @@ FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { return TRUE; } -FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { +FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { if (threshold < 300) { return threshold / 2.0f; } @@ -68,8 +83,8 @@ FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { return threshold / 6.0f; } -FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, - const CFX_Matrix& matrix) { +FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj, + const CFX_Matrix& matrix) { FX_FLOAT baseSpace = 0.0; const int nItems = pTextObj->CountItems(); if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { @@ -94,23 +109,39 @@ FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, return baseSpace; } -const FX_FLOAT kDefaultFontSize = 1.0f; - -} // namespace - -IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, - int flags) { - return new CPDF_TextPage(pPage, flags); -} - -IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind( - const IPDF_TextPage* pTextPage) { - return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr; +FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) { + wch = wch & 0xFFFF; + FX_WCHAR wFind = g_UnicodeData_Normalization[wch]; + if (!wFind) { + if (pDst) { + *pDst = wch; + } + return 1; + } + if (wFind >= 0x8000) { + wch = wFind - 0x8000; + wFind = 1; + } else { + wch = wFind & 0x0FFF; + wFind >>= 12; + } + const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind]; + if (pMap == g_UnicodeData_Normalization_Map4) { + pMap = g_UnicodeData_Normalization_Map4 + wch; + wFind = (FX_WCHAR)(*pMap++); + } else { + pMap += wch; + } + if (pDst) { + FX_WCHAR n = wFind; + while (n--) { + *pDst++ = *pMap++; + } + } + return (FX_STRSIZE)wFind; } -IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() { - return new CPDF_LinkExtract(); -} +} // namespace #define TEXT_BLANK_CHAR L' ' #define TEXT_LINEFEED_CHAR L'\n' @@ -932,10 +963,10 @@ void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar, info.m_Index = m_TextBuf.GetLength(); if (wChar >= 0xFB00 && wChar <= 0xFB06) { FX_WCHAR* pDst = NULL; - FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); + FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); if (nCount >= 1) { pDst = FX_Alloc(FX_WCHAR, nCount); - FX_Unicode_GetNormalization(wChar, pDst); + Unicode_GetNormalization(wChar, pDst); for (int nIndex = 0; nIndex < nCount; nIndex++) { PAGECHAR_INFO info2 = info; info2.m_Unicode = pDst[nIndex]; @@ -960,10 +991,10 @@ void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar, info.m_Index = m_TextBuf.GetLength(); wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); FX_WCHAR* pDst = NULL; - FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); + FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); if (nCount >= 1) { pDst = FX_Alloc(FX_WCHAR, nCount); - FX_Unicode_GetNormalization(wChar, pDst); + Unicode_GetNormalization(wChar, pDst); for (int nIndex = 0; nIndex < nCount; nIndex++) { PAGECHAR_INFO info2 = info; info2.m_Unicode = pDst[nIndex]; @@ -1377,7 +1408,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { m_pPreTextObj = pTextObj; m_perMatrix.Copy(formMatrix); int nItems = pTextObj->CountItems(); - FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); + FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix); const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); const FX_BOOL bIsBidiAndMirrorInverse = @@ -1430,7 +1461,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; - threshold = _NormalizeThreshold(threshold); + threshold = NormalizeThreshold(threshold); threshold = fontsize_h * threshold / 1000; } if (threshold && (spacing && spacing >= threshold)) { @@ -1898,7 +1929,7 @@ FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) { return TRUE; } -CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) +CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage), m_flags(0), m_findNextStart(-1), @@ -2054,8 +2085,8 @@ FX_BOOL CPDF_TextPageFind::FindNext() { CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); if (nStartPos == nResultPos && - !(_IsIgnoreSpaceCharacter(lastChar) || - _IsIgnoreSpaceCharacter(curChar))) { + !(IsIgnoreSpaceCharacter(lastChar) || + IsIgnoreSpaceCharacter(curChar))) { bMatch = FALSE; } for (int d = PreResEndPos; d < nResultPos; d++) { @@ -2174,7 +2205,7 @@ void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { while (pos < csWord.GetLength()) { CFX_WideString curStr = csWord.Mid(pos, 1); FX_WCHAR curChar = csWord.GetAt(pos); - if (_IsIgnoreSpaceCharacter(curChar)) { + if (IsIgnoreSpaceCharacter(curChar)) { if (pos > 0 && curChar == 0x2019) { pos++; continue; @@ -2306,7 +2337,7 @@ CPDF_LinkExtract::~CPDF_LinkExtract() { DeleteLinkList(); } -FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) { +FX_BOOL CPDF_LinkExtract::ExtractLinks(const CPDF_TextPage* pTextPage) { if (!pTextPage || !pTextPage->IsParsed()) return FALSE; |