summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BUILD.gn4
-rw-r--r--core/fpdftext/cpdf_linkextract.cpp173
-rw-r--r--core/fpdftext/cpdf_textpage.cpp (renamed from core/fpdftext/fpdf_text_int.cpp)604
-rw-r--r--core/fpdftext/cpdf_textpagefind.cpp407
-rw-r--r--core/fpdftext/include/cpdf_textpage.h20
5 files changed, 605 insertions, 603 deletions
diff --git a/BUILD.gn b/BUILD.gn
index ca2338f6fd..f603099d0f 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -538,7 +538,9 @@ static_library("fpdfapi") {
static_library("fpdftext") {
sources = [
- "core/fpdftext/fpdf_text_int.cpp",
+ "core/fpdftext/cpdf_linkextract.cpp",
+ "core/fpdftext/cpdf_textpage.cpp",
+ "core/fpdftext/cpdf_textpagefind.cpp",
"core/fpdftext/include/cpdf_linkextract.h",
"core/fpdftext/include/cpdf_textpage.h",
"core/fpdftext/include/cpdf_textpagefind.h",
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
new file mode 100644
index 0000000000..1677b67b55
--- /dev/null
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -0,0 +1,173 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fpdftext/include/cpdf_linkextract.h"
+
+#include <vector>
+
+#include "core/fpdftext/include/cpdf_textpage.h"
+#include "core/fxcrt/include/fx_ext.h"
+#include "core/fxcrt/include/fx_string.h"
+#include "core/fxcrt/include/fx_system.h"
+
+// Binds the extractor to |pTextPage|. The pointer is stored as-is (no copy is
+// made here) and is dereferenced later by ExtractLinks(), ParseLink() and
+// GetRects().
+CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
+ : m_pTextPage(pTextPage) {}
+
+// Nothing to release explicitly; members clean up after themselves.
+CPDF_LinkExtract::~CPDF_LinkExtract() = default;
+
+// Rebuilds the link list from scratch. Any previously extracted links are
+// dropped first; scanning only happens when the text page reports that it has
+// been parsed and its full text is non-empty.
+void CPDF_LinkExtract::ExtractLinks() {
+  m_LinkArray.clear();
+  if (m_pTextPage->IsParsed()) {
+    m_strPageText = m_pTextPage->GetPageText(0, -1);
+    if (!m_strPageText.IsEmpty())
+      ParseLink();
+  }
+}
+
+// Walks every character of the page, cutting the text into segments at
+// generated characters, spaces (0x20) and the final character. Each segment
+// longer than five characters has trailing ')', ',', '>' and '.' stripped and
+// is then offered to CheckWebLink() / CheckMailLink(); accepted segments are
+// appended to m_LinkArray together with their start index and length.
+void CPDF_LinkExtract::ParseLink() {
+  const int nTotal = m_pTextPage->CountChars();
+  int nSegStart = 0;
+  for (int nPos = 0; nPos < nTotal; ++nPos) {
+    FPDF_CHAR_INFO charInfo;
+    m_pTextPage->GetCharInfo(nPos, &charInfo);
+
+    const bool bLastChar = nPos == nTotal - 1;
+    const bool bBoundary = charInfo.m_Flag == FPDFTEXT_CHAR_GENERATED ||
+                           charInfo.m_Unicode == 0x20 || bLastChar;
+    if (!bBoundary)
+      continue;
+
+    // The last character of the page belongs to the segment; a separator
+    // character does not.
+    int nSegLen = nPos - nSegStart;
+    if (bLastChar)
+      nSegLen++;
+
+    CFX_WideString candidate = m_pTextPage->GetPageText(nSegStart, nSegLen);
+    if (candidate.GetLength() > 5) {
+      // Strip trailing punctuation that is unlikely to be part of the link.
+      while (candidate.GetLength() > 0) {
+        const FX_WCHAR chLast = candidate.GetAt(candidate.GetLength() - 1);
+        if (chLast != L')' && chLast != L',' && chLast != L'>' &&
+            chLast != L'.') {
+          break;
+        }
+        candidate = candidate.Mid(0, candidate.GetLength() - 1);
+        nSegLen--;
+      }
+      if (nSegLen > 5 &&
+          (CheckWebLink(candidate) || CheckMailLink(candidate))) {
+        m_LinkArray.push_back({nSegStart, nSegLen, candidate});
+      }
+    }
+    nSegStart = nPos + 1;
+  }
+}
+
+// Looks for a web address inside |strBeCheck| (case-insensitively). On success
+// |strBeCheck| is cut down to start at the matched prefix and true is
+// returned. A bare "www." match additionally gets "http://" prepended.
+// Prefixes are tried in a fixed order and the first one found wins.
+bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
+  static const FX_WCHAR* const kSchemePrefixes[] = {
+      L"http://www.", L"http://", L"https://www.", L"https://"};
+
+  CFX_WideString lowered = strBeCheck;
+  lowered.MakeLower();
+  const int nLen = lowered.GetLength();
+
+  for (const FX_WCHAR* prefix : kSchemePrefixes) {
+    const int nAt = lowered.Find(prefix);
+    if (nAt != -1) {
+      strBeCheck = strBeCheck.Right(nLen - nAt);
+      return true;
+    }
+  }
+
+  const int nWwwAt = lowered.Find(L"www.");
+  if (nWwwAt == -1)
+    return false;
+
+  strBeCheck = strBeCheck.Right(nLen - nWwwAt);
+  strBeCheck = L"http://" + strBeCheck;
+  return true;
+}
+
+// Decides whether |str| contains something shaped like an e-mail address.
+//
+// On success |str| is rewritten in place: leading characters that cannot
+// belong to the local part and trailing characters that cannot belong to the
+// domain are trimmed, and "mailto:" is prepended unless |str| already contains
+// it. Returns false (possibly after |str| has been partially trimmed) when no
+// acceptable address is found.
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
+  int aPos = str.Find(L'@');
+  // Invalid when no '@', or when '@' is the very first character.
+  if (aPos < 1)
+    return false;
+
+  // Check the local part, scanning backwards from the character before '@'.
+  int pPos = aPos;  // Used to track the position of '@' or '.'.
+  for (int i = aPos - 1; i >= 0; i--) {
+    FX_WCHAR ch = str.GetAt(i);
+    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
+      continue;
+
+    if (ch != L'.' || i == pPos - 1 || i == 0) {
+      if (i == aPos - 1) {
+        // There is '.' or invalid char before '@'.
+        return false;
+      }
+      // End extracting for other invalid chars, '.' at the beginning, or
+      // consecutive '.'.
+      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
+      str = str.Right(str.GetLength() - removed_len);
+      break;
+    }
+    // Found a valid '.'.
+    pPos = i;
+  }
+
+  // Check the domain name part. |str| may have been trimmed above, so the
+  // position of '@' has to be looked up again.
+  aPos = str.Find(L'@');
+  if (aPos < 1)
+    return false;
+
+  str.TrimRight(L'.');
+  // At least one '.' in domain name, but not at the beginning.
+  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
+  // Check whether we should remove this check.
+  int ePos = str.Find(L'.', aPos + 1);
+  if (ePos == -1 || ePos == aPos + 1)
+    return false;
+
+  // Validate all other chars in domain name.
+  int nLen = str.GetLength();
+  pPos = 0;  // Used to track the position of '.'.
+  for (int i = aPos + 1; i < nLen; i++) {
+    FX_WCHAR wch = str.GetAt(i);
+    if (wch == L'-' || FXSYS_iswalnum(wch))
+      continue;
+
+    if (wch != L'.' || i == pPos + 1) {
+      // Domain name should end before invalid char.
+      int host_end = i == pPos + 1 ? i - 2 : i - 1;
+      if (pPos > 0 && host_end - aPos >= 3) {
+        // Trim the ending invalid chars if there is at least one '.' and name.
+        str = str.Left(host_end + 1);
+        break;
+      }
+      return false;
+    }
+    pPos = i;
+  }
+
+  if (str.Find(L"mailto:") == -1)
+    str = L"mailto:" + str;
+
+  return true;
+}
+
+// Returns the URL text of the link at |index|, or an empty string when
+// |index| is out of range.
+CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
+  if (index >= m_LinkArray.size())
+    return L"";
+
+  return m_LinkArray[index].m_strUrl;
+}
+
+// Returns the rectangles the text page reports for the character span of the
+// link at |index|, or an empty vector when |index| is out of range.
+std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
+  if (index < m_LinkArray.size()) {
+    const auto& link = m_LinkArray[index];
+    return m_pTextPage->GetRectArray(link.m_Start, link.m_Count);
+  }
+  return std::vector<CFX_FloatRect>();
+}
diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/cpdf_textpage.cpp
index fbd9c9c8c1..3981cfee40 100644
--- a/core/fpdftext/fpdf_text_int.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -4,10 +4,9 @@
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+#include "core/fpdftext/include/cpdf_textpage.h"
+
#include <algorithm>
-#include <cctype>
-#include <cwctype>
-#include <memory>
#include <utility>
#include <vector>
@@ -19,35 +18,12 @@
#include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"
#include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"
#include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"
-#include "core/fpdftext/include/cpdf_linkextract.h"
-#include "core/fpdftext/include/cpdf_textpage.h"
-#include "core/fpdftext/include/cpdf_textpagefind.h"
#include "core/fpdftext/unicodenormalizationdata.h"
#include "core/fxcrt/fx_bidi.h"
#include "core/fxcrt/include/fx_ext.h"
#include "core/fxcrt/include/fx_ucd.h"
#include "third_party/base/stl_util.h"
-#define FPDFTEXT_MATCHCASE 0x00000001
-#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
-#define FPDFTEXT_CONSECUTIVE 0x00000004
-
-#define FPDFTEXT_CHAR_ERROR -1
-#define FPDFTEXT_CHAR_NORMAL 0
-#define FPDFTEXT_CHAR_GENERATED 1
-#define FPDFTEXT_CHAR_UNUNICODE 2
-#define FPDFTEXT_CHAR_HYPHEN 3
-#define FPDFTEXT_CHAR_PIECE 4
-
-#define TEXT_SPACE_CHAR L' '
-#define TEXT_LINEFEED_CHAR L'\n'
-#define TEXT_RETURN_CHAR L'\r'
-#define TEXT_EMPTY L""
-#define TEXT_SPACE L" "
-#define TEXT_RETURN_LINEFEED L"\r\n"
-#define TEXT_LINEFEED L"\n"
-#define TEXT_CHARRATIO_GAPDELTA 0.070
-
namespace {
const FX_FLOAT kDefaultFontSize = 1.0f;
@@ -55,22 +31,6 @@ const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
-FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
- if (curChar < 255)
- return FALSE;
- if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
- (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
- (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
- (curChar >= 0x0400 && curChar <= 0x04FF) ||
- (curChar >= 0x0500 && curChar <= 0x052F) ||
- (curChar >= 0xA640 && curChar <= 0xA69F) ||
- (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
- (curChar >= 0x2000 && curChar <= 0x206F)) {
- return FALSE;
- }
- return TRUE;
-}
-
FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
if (threshold < 300)
return threshold / 2.0f;
@@ -1587,563 +1547,3 @@ FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
rect.Intersect(rect2);
return !rect.IsEmpty();
}
-
-CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
- : m_pTextPage(pTextPage),
- m_flags(0),
- m_findNextStart(-1),
- m_findPreStart(-1),
- m_bMatchCase(FALSE),
- m_bMatchWholeWord(FALSE),
- m_resStart(0),
- m_resEnd(-1),
- m_IsFind(FALSE) {
- m_strText = m_pTextPage->GetPageText();
- int nCount = pTextPage->CountChars();
- if (nCount) {
- m_CharIndex.push_back(0);
- }
- for (int i = 0; i < nCount; i++) {
- FPDF_CHAR_INFO info;
- pTextPage->GetCharInfo(i, &info);
- int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
- if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
- info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
- if (indexSize % 2) {
- m_CharIndex.push_back(1);
- } else {
- if (indexSize <= 0) {
- continue;
- }
- m_CharIndex[indexSize - 1] += 1;
- }
- } else {
- if (indexSize % 2) {
- if (indexSize <= 0) {
- continue;
- }
- m_CharIndex[indexSize - 1] = i + 1;
- } else {
- m_CharIndex.push_back(i + 1);
- }
- }
- }
- int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
- if (indexSize % 2) {
- m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
- }
-}
-
-CPDF_TextPageFind::~CPDF_TextPageFind() {}
-
-int CPDF_TextPageFind::GetCharIndex(int index) const {
- return m_pTextPage->CharIndexFromTextIndex(index);
-}
-
-FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
- int flags,
- int startPos) {
- if (!m_pTextPage) {
- return FALSE;
- }
- if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
- m_strText = m_pTextPage->GetPageText();
- }
- CFX_WideString findwhatStr = findwhat;
- m_findWhat = findwhatStr;
- m_flags = flags;
- m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
- if (m_strText.IsEmpty()) {
- m_IsFind = FALSE;
- return TRUE;
- }
- FX_STRSIZE len = findwhatStr.GetLength();
- if (!m_bMatchCase) {
- findwhatStr.MakeLower();
- m_strText.MakeLower();
- }
- m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
- m_findNextStart = startPos;
- if (startPos == -1) {
- m_findPreStart = m_strText.GetLength() - 1;
- } else {
- m_findPreStart = startPos;
- }
- m_csFindWhatArray.clear();
- int i = 0;
- while (i < len) {
- if (findwhatStr.GetAt(i) != ' ') {
- break;
- }
- i++;
- }
- if (i < len) {
- ExtractFindWhat(findwhatStr);
- } else {
- m_csFindWhatArray.push_back(findwhatStr);
- }
- if (m_csFindWhatArray.empty()) {
- return FALSE;
- }
- m_IsFind = TRUE;
- m_resStart = 0;
- m_resEnd = -1;
- return TRUE;
-}
-
-FX_BOOL CPDF_TextPageFind::FindNext() {
- if (!m_pTextPage) {
- return FALSE;
- }
- m_resArray.clear();
- if (m_findNextStart == -1) {
- return FALSE;
- }
- if (m_strText.IsEmpty()) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- int strLen = m_strText.GetLength();
- if (m_findNextStart > strLen - 1) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
- int nResultPos = 0;
- int nStartPos = 0;
- nStartPos = m_findNextStart;
- FX_BOOL bSpaceStart = FALSE;
- for (int iWord = 0; iWord < nCount; iWord++) {
- CFX_WideString csWord = m_csFindWhatArray[iWord];
- if (csWord.IsEmpty()) {
- if (iWord == nCount - 1) {
- FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
- if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
- strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
- nResultPos = nStartPos + 1;
- break;
- }
- iWord = -1;
- } else if (iWord == 0) {
- bSpaceStart = TRUE;
- }
- continue;
- }
- int endIndex;
- nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
- if (nResultPos == -1) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- endIndex = nResultPos + csWord.GetLength() - 1;
- if (iWord == 0) {
- m_resStart = nResultPos;
- }
- FX_BOOL bMatch = TRUE;
- if (iWord != 0 && !bSpaceStart) {
- int PreResEndPos = nStartPos;
- int curChar = csWord.GetAt(0);
- CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
- int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
- if (nStartPos == nResultPos &&
- !(IsIgnoreSpaceCharacter(lastChar) ||
- IsIgnoreSpaceCharacter(curChar))) {
- bMatch = FALSE;
- }
- for (int d = PreResEndPos; d < nResultPos; d++) {
- FX_WCHAR strInsert = m_strText.GetAt(d);
- if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
- strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
- bMatch = FALSE;
- break;
- }
- }
- } else if (bSpaceStart) {
- if (nResultPos > 0) {
- FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
- if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
- strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
- bMatch = FALSE;
- m_resStart = nResultPos;
- } else {
- m_resStart = nResultPos - 1;
- }
- }
- }
- if (m_bMatchWholeWord && bMatch) {
- bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
- }
- nStartPos = endIndex + 1;
- if (!bMatch) {
- iWord = -1;
- if (bSpaceStart) {
- nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
- } else {
- nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
- }
- }
- }
- m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
- m_IsFind = TRUE;
- int resStart = GetCharIndex(m_resStart);
- int resEnd = GetCharIndex(m_resEnd);
- m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
- if (m_flags & FPDFTEXT_CONSECUTIVE) {
- m_findNextStart = m_resStart + 1;
- m_findPreStart = m_resEnd - 1;
- } else {
- m_findNextStart = m_resEnd + 1;
- m_findPreStart = m_resStart - 1;
- }
- return m_IsFind;
-}
-
-FX_BOOL CPDF_TextPageFind::FindPrev() {
- if (!m_pTextPage) {
- return FALSE;
- }
- m_resArray.clear();
- if (m_strText.IsEmpty() || m_findPreStart < 0) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- CPDF_TextPageFind findEngine(m_pTextPage);
- FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
- if (!ret) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- int order = -1, MatchedCount = 0;
- while (ret) {
- ret = findEngine.FindNext();
- if (ret) {
- int order1 = findEngine.GetCurOrder();
- int MatchedCount1 = findEngine.GetMatchedCount();
- if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
- break;
- }
- order = order1;
- MatchedCount = MatchedCount1;
- }
- }
- if (order == -1) {
- m_IsFind = FALSE;
- return m_IsFind;
- }
- m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
- m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
- m_IsFind = TRUE;
- m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
- if (m_flags & FPDFTEXT_CONSECUTIVE) {
- m_findNextStart = m_resStart + 1;
- m_findPreStart = m_resEnd - 1;
- } else {
- m_findNextStart = m_resEnd + 1;
- m_findPreStart = m_resStart - 1;
- }
- return m_IsFind;
-}
-
-void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
- if (findwhat.IsEmpty()) {
- return;
- }
- int index = 0;
- while (1) {
- CFX_WideString csWord = TEXT_EMPTY;
- int ret =
- ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
- if (csWord.IsEmpty()) {
- if (ret) {
- m_csFindWhatArray.push_back(L"");
- index++;
- continue;
- } else {
- break;
- }
- }
- int pos = 0;
- while (pos < csWord.GetLength()) {
- CFX_WideString curStr = csWord.Mid(pos, 1);
- FX_WCHAR curChar = csWord.GetAt(pos);
- if (IsIgnoreSpaceCharacter(curChar)) {
- if (pos > 0 && curChar == 0x2019) {
- pos++;
- continue;
- }
- if (pos > 0) {
- m_csFindWhatArray.push_back(csWord.Mid(0, pos));
- }
- m_csFindWhatArray.push_back(curStr);
- if (pos == csWord.GetLength() - 1) {
- csWord.clear();
- break;
- }
- csWord = csWord.Right(csWord.GetLength() - pos - 1);
- pos = 0;
- continue;
- }
- pos++;
- }
- if (!csWord.IsEmpty()) {
- m_csFindWhatArray.push_back(csWord);
- }
- index++;
- }
-}
-
-FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
- int startPos,
- int endPos) {
- FX_WCHAR char_left = 0;
- FX_WCHAR char_right = 0;
- int char_count = endPos - startPos + 1;
- if (char_count < 1) {
- return FALSE;
- }
- if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
- return TRUE;
- }
- if (startPos - 1 >= 0) {
- char_left = csPageText.GetAt(startPos - 1);
- }
- if (startPos + char_count < csPageText.GetLength()) {
- char_right = csPageText.GetAt(startPos + char_count);
- }
- if ((char_left > 'A' && char_left < 'a') ||
- (char_left > 'a' && char_left < 'z') ||
- (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
- (char_right > 'A' && char_right < 'a') ||
- (char_right > 'a' && char_right < 'z') ||
- (char_right > 0xfb00 && char_right < 0xfb06) ||
- std::iswdigit(char_right)) {
- return FALSE;
- }
- if (!(('A' > char_left || char_left > 'Z') &&
- ('a' > char_left || char_left > 'z') &&
- ('A' > char_right || char_right > 'Z') &&
- ('a' > char_right || char_right > 'z'))) {
- return FALSE;
- }
- if (char_count > 0) {
- if (csPageText.GetAt(startPos) >= L'0' &&
- csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
- char_left <= L'9') {
- return FALSE;
- }
- if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
- char_right >= L'0' && char_right <= L'9') {
- return FALSE;
- }
- }
- return TRUE;
-}
-
-FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
- const FX_WCHAR* lpszFullString,
- int iSubString,
- FX_WCHAR chSep) {
- if (!lpszFullString) {
- return FALSE;
- }
- while (iSubString--) {
- lpszFullString = wcschr(lpszFullString, chSep);
- if (!lpszFullString) {
- rString.clear();
- return FALSE;
- }
- lpszFullString++;
- while (*lpszFullString == chSep) {
- lpszFullString++;
- }
- }
- const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep);
- int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
- : (int)FXSYS_wcslen(lpszFullString);
- ASSERT(nLen >= 0);
- FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
- nLen * sizeof(FX_WCHAR));
- rString.ReleaseBuffer();
- return TRUE;
-}
-
-CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
- CFX_WideString str2;
- str2.clear();
- int nlen = str.GetLength();
- for (int i = nlen - 1; i >= 0; i--) {
- str2 += str.GetAt(i);
- }
- return str2;
-}
-
-int CPDF_TextPageFind::GetCurOrder() const {
- return GetCharIndex(m_resStart);
-}
-
-int CPDF_TextPageFind::GetMatchedCount() const {
- int resStart = GetCharIndex(m_resStart);
- int resEnd = GetCharIndex(m_resEnd);
- return resEnd - resStart + 1;
-}
-
-CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
- : m_pTextPage(pTextPage) {}
-
-CPDF_LinkExtract::~CPDF_LinkExtract() {
-}
-
-void CPDF_LinkExtract::ExtractLinks() {
- m_LinkArray.clear();
- if (!m_pTextPage->IsParsed())
- return;
-
- m_strPageText = m_pTextPage->GetPageText(0, -1);
- if (m_strPageText.IsEmpty())
- return;
-
- ParseLink();
-}
-
-void CPDF_LinkExtract::ParseLink() {
- int start = 0, pos = 0;
- int TotalChar = m_pTextPage->CountChars();
- while (pos < TotalChar) {
- FPDF_CHAR_INFO pageChar;
- m_pTextPage->GetCharInfo(pos, &pageChar);
- if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
- pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
- int nCount = pos - start;
- if (pos == TotalChar - 1) {
- nCount++;
- }
- CFX_WideString strBeCheck;
- strBeCheck = m_pTextPage->GetPageText(start, nCount);
- if (strBeCheck.GetLength() > 5) {
- while (strBeCheck.GetLength() > 0) {
- FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
- if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
- strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
- nCount--;
- } else {
- break;
- }
- }
- if (nCount > 5 &&
- (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
- m_LinkArray.push_back({start, nCount, strBeCheck});
- }
- }
- start = ++pos;
- } else {
- pos++;
- }
- }
-}
-
-bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
- CFX_WideString str = strBeCheck;
- str.MakeLower();
- if (str.Find(L"http://www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
- return true;
- }
- if (str.Find(L"http://") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
- return true;
- }
- if (str.Find(L"https://www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
- return true;
- }
- if (str.Find(L"https://") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
- return true;
- }
- if (str.Find(L"www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
- strBeCheck = L"http://" + strBeCheck;
- return true;
- }
- return false;
-}
-
-bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
- int aPos = str.Find(L'@');
- // Invalid when no '@'.
- if (aPos < 1)
- return false;
-
- // Check the local part.
- int pPos = aPos; // Used to track the position of '@' or '.'.
- for (int i = aPos - 1; i >= 0; i--) {
- FX_WCHAR ch = str.GetAt(i);
- if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
- continue;
-
- if (ch != L'.' || i == pPos - 1 || i == 0) {
- if (i == aPos - 1) {
- // There is '.' or invalid char before '@'.
- return FALSE;
- }
- // End extracting for other invalid chars, '.' at the beginning, or
- // consecutive '.'.
- int removed_len = i == pPos - 1 ? i + 2 : i + 1;
- str = str.Right(str.GetLength() - removed_len);
- break;
- }
- // Found a valid '.'.
- pPos = i;
- }
-
- // Check the domain name part.
- aPos = str.Find(L'@');
- if (aPos < 1)
- return false;
-
- str.TrimRight(L'.');
- // At least one '.' in domain name, but not at the beginning.
- // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
- // Check whether we should remove this check.
- int ePos = str.Find(L'.', aPos + 1);
- if (ePos == -1 || ePos == aPos + 1)
- return false;
-
- // Validate all other chars in domain name.
- int nLen = str.GetLength();
- pPos = 0; // Used to track the position of '.'.
- for (int i = aPos + 1; i < nLen; i++) {
- FX_WCHAR wch = str.GetAt(i);
- if (wch == L'-' || FXSYS_iswalnum(wch))
- continue;
-
- if (wch != L'.' || i == pPos + 1) {
- // Domain name should end before invalid char.
- int host_end = i == pPos + 1 ? i - 2 : i - 1;
- if (pPos > 0 && host_end - aPos >= 3) {
- // Trim the ending invalid chars if there is at least one '.' and name.
- str = str.Left(host_end + 1);
- break;
- }
- return false;
- }
- pPos = i;
- }
-
- if (str.Find(L"mailto:") == -1)
- str = L"mailto:" + str;
-
- return true;
-}
-
-CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
- return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
-}
-
-std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
- if (index >= m_LinkArray.size())
- return std::vector<CFX_FloatRect>();
-
- return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
- m_LinkArray[index].m_Count);
-}
diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp
new file mode 100644
index 0000000000..a67bdf15d5
--- /dev/null
+++ b/core/fpdftext/cpdf_textpagefind.cpp
@@ -0,0 +1,407 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fpdftext/include/cpdf_textpagefind.h"
+
+#include <cwchar>
+#include <cwctype>
+#include <vector>
+
+#include "core/fpdftext/include/cpdf_textpage.h"
+#include "core/fxcrt/include/fx_string.h"
+#include "core/fxcrt/include/fx_system.h"
+#include "third_party/base/stl_util.h"
+
+namespace {
+
+// Returns TRUE for characters that ExtractFindWhat() splits a search word
+// around and that FindNext() allows to sit directly next to the neighbouring
+// word without separating whitespace. Code points below 255, U+2113 (8467)
+// and the ranges listed below are exempt and yield FALSE.
+FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
+  if (curChar < 255 || curChar == 8467)
+    return FALSE;
+
+  static const struct {
+    int first;
+    int last;
+  } kExemptRanges[] = {
+      {0x0600, 0x06FF}, {0xFE70, 0xFEFF}, {0xFB50, 0xFDFF}, {0x0400, 0x04FF},
+      {0x0500, 0x052F}, {0xA640, 0xA69F}, {0x2DE0, 0x2DFF}, {0x2000, 0x206F},
+  };
+  for (const auto& range : kExemptRanges) {
+    if (curChar >= range.first && curChar <= range.last)
+      return FALSE;
+  }
+  return TRUE;
+}
+
+} // namespace
+
+// Caches the page text in m_strText and fills m_CharIndex with
+// (start, count) pairs, one pair per run of consecutive characters whose flag
+// is FPDFTEXT_CHAR_NORMAL or FPDFTEXT_CHAR_GENERATED.
+//
+// While the vector is being built, an odd size means the last element is a
+// run start still waiting for its count; an even size means the last element
+// is the count of the run currently being extended. A dangling start left at
+// the end is removed.
+CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
+    : m_pTextPage(pTextPage),
+      m_flags(0),
+      m_findNextStart(-1),
+      m_findPreStart(-1),
+      m_bMatchCase(FALSE),
+      m_bMatchWholeWord(FALSE),
+      m_resStart(0),
+      m_resEnd(-1),
+      m_IsFind(FALSE) {
+  m_strText = m_pTextPage->GetPageText();
+  int nCount = pTextPage->CountChars();
+  // Seed the first candidate run start. Because of this seed the vector is
+  // never empty inside the loop below, so no emptiness guard is needed there.
+  if (nCount)
+    m_CharIndex.push_back(0);
+  for (int i = 0; i < nCount; i++) {
+    FPDF_CHAR_INFO info;
+    pTextPage->GetCharInfo(i, &info);
+    int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
+    if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
+        info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
+      if (indexSize % 2) {
+        // First character of a run: record a count of one.
+        m_CharIndex.push_back(1);
+      } else {
+        // Run continues: bump its count.
+        m_CharIndex[indexSize - 1] += 1;
+      }
+    } else {
+      if (indexSize % 2) {
+        // No run has started yet; move the pending start past this character.
+        m_CharIndex[indexSize - 1] = i + 1;
+      } else {
+        // The current run ends here; the next one can start at i + 1.
+        m_CharIndex.push_back(i + 1);
+      }
+    }
+  }
+  // Drop a trailing start that never received a count.
+  int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
+  if (indexSize % 2)
+    m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+CPDF_TextPageFind::~CPDF_TextPageFind() = default;
+
+// Translates an index into the page text string to the corresponding
+// character index, as reported by the text page.
+int CPDF_TextPageFind::GetCharIndex(int index) const {
+  const int nCharIndex = m_pTextPage->CharIndexFromTextIndex(index);
+  return nCharIndex;
+}
+
+// Prepares a search for |findwhat| using the FPDFTEXT_* option bits in
+// |flags|, starting at text index |startPos| (-1 positions the backwards
+// search at the end of the text and disables FindNext()).
+//
+// Returns FALSE when there is no text page or the query yields no search
+// words. Returns TRUE otherwise -- including when the page text is empty, in
+// which case m_IsFind is cleared instead.
+FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
+                                     int flags,
+                                     int startPos) {
+  if (!m_pTextPage)
+    return FALSE;
+
+  // Reload the text if it is missing or if the case mode changed, since a
+  // previous case-insensitive search lower-cased the cached copy.
+  const int nCaseFlag = flags & FPDFTEXT_MATCHCASE;
+  if (m_strText.IsEmpty() || m_bMatchCase != nCaseFlag)
+    m_strText = m_pTextPage->GetPageText();
+
+  CFX_WideString query = findwhat;
+  m_findWhat = query;
+  m_flags = flags;
+  m_bMatchCase = nCaseFlag;
+  if (m_strText.IsEmpty()) {
+    m_IsFind = FALSE;
+    return TRUE;
+  }
+
+  const FX_STRSIZE nQueryLen = query.GetLength();
+  if (!m_bMatchCase) {
+    query.MakeLower();
+    m_strText.MakeLower();
+  }
+  m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
+  m_findNextStart = startPos;
+  m_findPreStart = startPos == -1 ? m_strText.GetLength() - 1 : startPos;
+
+  // A query made only of spaces (or an empty one) is kept as a single word;
+  // anything else is split into words.
+  m_csFindWhatArray.clear();
+  int nFirstNonSpace = 0;
+  while (nFirstNonSpace < nQueryLen && query.GetAt(nFirstNonSpace) == ' ')
+    ++nFirstNonSpace;
+  if (nFirstNonSpace < nQueryLen)
+    ExtractFindWhat(query);
+  else
+    m_csFindWhatArray.push_back(query);
+
+  if (m_csFindWhatArray.empty())
+    return FALSE;
+
+  m_IsFind = TRUE;
+  m_resStart = 0;
+  m_resEnd = -1;
+  return TRUE;
+}
+
+// Searches m_strText forward from m_findNextStart for the word sequence in
+// m_csFindWhatArray. On success it records the match in m_resStart /
+// m_resEnd (text indices), stores the match rectangles in m_resArray, and
+// advances m_findNextStart / m_findPreStart for the following call. Returns
+// FALSE when there is no text page, no forward start position, no text, or no
+// further match; otherwise returns m_IsFind (TRUE).
+FX_BOOL CPDF_TextPageFind::FindNext() {
+ if (!m_pTextPage)
+ return FALSE;
+ m_resArray.clear();
+ if (m_findNextStart == -1)
+ return FALSE;
+ if (m_strText.IsEmpty()) {
+ m_IsFind = FALSE;
+ return m_IsFind;
+ }
+ int strLen = m_strText.GetLength();
+ if (m_findNextStart > strLen - 1) {
+ m_IsFind = FALSE;
+ return m_IsFind;
+ }
+ int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
+ int nResultPos = 0;
+ int nStartPos = 0;
+ nStartPos = m_findNextStart;
+ // Set when the first entry of the word list is empty, i.e. the query began
+ // with a separator.
+ bool bSpaceStart = false;
+ // Match the words one after another. Setting iWord to -1 restarts the
+ // matching at the first word (the loop increment brings it back to 0).
+ for (int iWord = 0; iWord < nCount; iWord++) {
+ CFX_WideString csWord = m_csFindWhatArray[iWord];
+ // Empty entries are pushed by ExtractFindWhat() for empty substrings
+ // between separators.
+ if (csWord.IsEmpty()) {
+ if (iWord == nCount - 1) {
+ // A trailing empty word must be matched by one whitespace-like
+ // character (line feed, space, carriage return or 160, the no-break
+ // space) right after the previous word.
+ FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
+ if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
+ strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
+ nResultPos = nStartPos + 1;
+ break;
+ }
+ iWord = -1;
+ } else if (iWord == 0) {
+ bSpaceStart = true;
+ }
+ continue;
+ }
+ int endIndex;
+ nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
+ if (nResultPos == -1) {
+ m_IsFind = FALSE;
+ return m_IsFind;
+ }
+ endIndex = nResultPos + csWord.GetLength() - 1;
+ if (iWord == 0)
+ m_resStart = nResultPos;
+ FX_BOOL bMatch = TRUE;
+ if (iWord != 0 && !bSpaceStart) {
+ int PreResEndPos = nStartPos;
+ int curChar = csWord.GetAt(0);
+ CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
+ int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
+ // Two words may touch without any gap only if one of the touching
+ // characters is an "ignore space" character.
+ if (nStartPos == nResultPos &&
+ !(IsIgnoreSpaceCharacter(lastChar) ||
+ IsIgnoreSpaceCharacter(curChar))) {
+ bMatch = FALSE;
+ }
+ // Everything between the previous word and this one must be
+ // whitespace-like.
+ for (int d = PreResEndPos; d < nResultPos; d++) {
+ FX_WCHAR strInsert = m_strText.GetAt(d);
+ if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
+ strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
+ bMatch = FALSE;
+ break;
+ }
+ }
+ } else if (bSpaceStart) {
+ // The query began with a separator: the character before the word must
+ // be whitespace-like, and if so it becomes part of the result.
+ if (nResultPos > 0) {
+ FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
+ if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
+ strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
+ bMatch = FALSE;
+ m_resStart = nResultPos;
+ } else {
+ m_resStart = nResultPos - 1;
+ }
+ }
+ }
+ if (m_bMatchWholeWord && bMatch) {
+ bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
+ }
+ nStartPos = endIndex + 1;
+ if (!bMatch) {
+ // Restart from the first word, resuming the scan just past the first
+ // real word of the rejected candidate.
+ iWord = -1;
+ if (bSpaceStart)
+ nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
+ else
+ nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
+ }
+ }
+ m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
+ m_IsFind = TRUE;
+ // Convert the text-index span to character indices for the rect lookup.
+ int resStart = GetCharIndex(m_resStart);
+ int resEnd = GetCharIndex(m_resEnd);
+ m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
+ // With FPDFTEXT_CONSECUTIVE the next search resumes one position after the
+ // match start; otherwise it resumes after the match end.
+ if (m_flags & FPDFTEXT_CONSECUTIVE) {
+ m_findNextStart = m_resStart + 1;
+ m_findPreStart = m_resEnd - 1;
+ } else {
+ m_findNextStart = m_resEnd + 1;
+ m_findPreStart = m_resStart - 1;
+ }
+ return m_IsFind;
+}
+
+// Finds the match preceding the current position. A temporary finder is run
+// forward over the page with the same query and flags, and the last match
+// whose end does not lie beyond m_findPreStart is taken. On success the
+// result members and the next/previous start positions are updated the same
+// way FindNext() updates them.
+FX_BOOL CPDF_TextPageFind::FindPrev() {
+  if (!m_pTextPage)
+    return FALSE;
+
+  m_resArray.clear();
+  if (m_strText.IsEmpty() || m_findPreStart < 0) {
+    m_IsFind = FALSE;
+    return FALSE;
+  }
+
+  CPDF_TextPageFind forwardFinder(m_pTextPage);
+  if (!forwardFinder.FindFirst(m_findWhat, m_flags)) {
+    m_IsFind = FALSE;
+    return FALSE;
+  }
+
+  // Remember the most recent match that still ends at or before
+  // m_findPreStart.
+  int nBestOrder = -1;
+  int nBestCount = 0;
+  while (forwardFinder.FindNext()) {
+    const int nOrder = forwardFinder.GetCurOrder();
+    const int nMatched = forwardFinder.GetMatchedCount();
+    if (((nOrder + nMatched) - 1) > m_findPreStart)
+      break;
+    nBestOrder = nOrder;
+    nBestCount = nMatched;
+  }
+  if (nBestOrder == -1) {
+    m_IsFind = FALSE;
+    return FALSE;
+  }
+
+  m_resStart = m_pTextPage->TextIndexFromCharIndex(nBestOrder);
+  m_resEnd = m_pTextPage->TextIndexFromCharIndex(nBestOrder + nBestCount - 1);
+  m_IsFind = TRUE;
+  m_resArray = m_pTextPage->GetRectArray(nBestOrder, nBestCount);
+
+  const bool bConsecutive = (m_flags & FPDFTEXT_CONSECUTIVE) != 0;
+  if (bConsecutive) {
+    m_findNextStart = m_resStart + 1;
+    m_findPreStart = m_resEnd - 1;
+  } else {
+    m_findNextStart = m_resEnd + 1;
+    m_findPreStart = m_resStart - 1;
+  }
+  return m_IsFind;
+}
+
+// Splits |findwhat| into search words and appends them to m_csFindWhatArray.
+// The query is first cut at spaces; an empty piece is stored as an empty
+// string. Each non-empty piece is then cut further so that every character
+// for which IsIgnoreSpaceCharacter() is true becomes a word of its own --
+// except 0x2019 when it is not the first character of the piece, which stays
+// inside the surrounding word.
+void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
+  if (findwhat.IsEmpty())
+    return;
+
+  for (int nPiece = 0;; ++nPiece) {
+    CFX_WideString word = TEXT_EMPTY;
+    const FX_BOOL bExtracted =
+        ExtractSubString(word, findwhat.c_str(), nPiece, TEXT_SPACE_CHAR);
+    if (word.IsEmpty()) {
+      if (!bExtracted)
+        break;
+      m_csFindWhatArray.push_back(L"");
+      continue;
+    }
+
+    int nPos = 0;
+    while (nPos < word.GetLength()) {
+      const FX_WCHAR ch = word.GetAt(nPos);
+      const bool bStaysInWord =
+          !IsIgnoreSpaceCharacter(ch) || (nPos > 0 && ch == 0x2019);
+      if (bStaysInWord) {
+        ++nPos;
+        continue;
+      }
+
+      // Emit what precedes the splitting character, then the character
+      // itself, and carry on with the remainder of the piece.
+      if (nPos > 0)
+        m_csFindWhatArray.push_back(word.Mid(0, nPos));
+      m_csFindWhatArray.push_back(word.Mid(nPos, 1));
+      if (nPos == word.GetLength() - 1) {
+        word.clear();
+        break;
+      }
+      word = word.Right(word.GetLength() - nPos - 1);
+      nPos = 0;
+    }
+    if (!word.IsEmpty())
+      m_csFindWhatArray.push_back(word);
+  }
+}
+
+// Decides whether the span [startPos, endPos] of |csPageText| stands on its
+// own as a word, judged by the characters immediately before and after it
+// (a missing neighbour is treated as character 0).
+//
+// An empty span is rejected; a single character above 255 is always
+// accepted. Otherwise the span is rejected if either neighbour is a digit,
+// an ASCII letter, one of the characters between 'A' and 'a', or in the
+// range 0xfb01..0xfb05.
+FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
+                                            int startPos,
+                                            int endPos) {
+  const int nSpanLen = endPos - startPos + 1;
+  if (nSpanLen < 1)
+    return FALSE;
+  if (nSpanLen == 1 && csPageText.GetAt(startPos) > 255)
+    return TRUE;
+
+  FX_WCHAR chBefore = 0;
+  FX_WCHAR chAfter = 0;
+  if (startPos - 1 >= 0)
+    chBefore = csPageText.GetAt(startPos - 1);
+  if (startPos + nSpanLen < csPageText.GetLength())
+    chAfter = csPageText.GetAt(startPos + nSpanLen);
+
+  const auto IsWordLike = [](FX_WCHAR ch) {
+    return (ch > 'A' && ch < 'a') || (ch > 'a' && ch < 'z') ||
+           (ch > 0xfb00 && ch < 0xfb06) || std::iswdigit(ch);
+  };
+  if (IsWordLike(chBefore) || IsWordLike(chAfter))
+    return FALSE;
+
+  const auto IsAsciiLetter = [](FX_WCHAR ch) {
+    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
+  };
+  if (IsAsciiLetter(chBefore) || IsAsciiLetter(chAfter))
+    return FALSE;
+
+  // A span that starts or ends with a digit must not be glued to another
+  // digit on that side.
+  const auto IsAsciiDigit = [](FX_WCHAR ch) {
+    return ch >= L'0' && ch <= L'9';
+  };
+  if (IsAsciiDigit(csPageText.GetAt(startPos)) && IsAsciiDigit(chBefore))
+    return FALSE;
+  if (IsAsciiDigit(csPageText.GetAt(endPos)) && IsAsciiDigit(chAfter))
+    return FALSE;
+
+  return TRUE;
+}
+
+// Copies the |iSubString|-th (zero-based) field of |lpszFullString| into
+// |rString|, where fields are separated by |chSep|. Runs of separators that
+// follow a skipped field are skipped together with it. Returns FALSE when
+// |lpszFullString| is null (|rString| untouched) or when there are fewer
+// fields than requested (|rString| cleared); TRUE otherwise.
+FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
+                                            const FX_WCHAR* lpszFullString,
+                                            int iSubString,
+                                            FX_WCHAR chSep) {
+  if (!lpszFullString)
+    return FALSE;
+
+  // Step over the fields in front of the requested one.
+  for (; iSubString != 0; --iSubString) {
+    const FX_WCHAR* pSep = std::wcschr(lpszFullString, chSep);
+    if (!pSep) {
+      rString.clear();
+      return FALSE;
+    }
+    lpszFullString = pSep + 1;
+    while (*lpszFullString == chSep)
+      ++lpszFullString;
+  }
+
+  // The field runs up to the next separator, or to the end of the string.
+  const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
+  int nLen;
+  if (lpchEnd)
+    nLen = static_cast<int>(lpchEnd - lpszFullString);
+  else
+    nLen = static_cast<int>(FXSYS_wcslen(lpszFullString));
+  ASSERT(nLen >= 0);
+
+  FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
+               nLen * sizeof(FX_WCHAR));
+  rString.ReleaseBuffer();
+  return TRUE;
+}
+
+// Returns a copy of |str| with its characters in reverse order.
+CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
+  CFX_WideString reversed;
+  for (int nRemaining = str.GetLength(); nRemaining > 0; --nRemaining)
+    reversed += str.GetAt(nRemaining - 1);
+  return reversed;
+}
+
+// Returns the character index at which the current match starts.
+int CPDF_TextPageFind::GetCurOrder() const {
+  const int nMatchStart = GetCharIndex(m_resStart);
+  return nMatchStart;
+}
+
+// Returns how many characters the current match spans, measured in character
+// indices (both ends inclusive).
+int CPDF_TextPageFind::GetMatchedCount() const {
+  const int nFirstChar = GetCharIndex(m_resStart);
+  const int nLastChar = GetCharIndex(m_resEnd);
+  return nLastChar - nFirstChar + 1;
+}
diff --git a/core/fpdftext/include/cpdf_textpage.h b/core/fpdftext/include/cpdf_textpage.h
index 7f25fd7cf5..64b1613e7e 100644
--- a/core/fpdftext/include/cpdf_textpage.h
+++ b/core/fpdftext/include/cpdf_textpage.h
@@ -20,6 +20,26 @@ class CPDF_FormObject;
class CPDF_Page;
class CPDF_TextObject;
+// Option bits for the |flags| argument of CPDF_TextPageFind::FindFirst().
+// With FPDFTEXT_CONSECUTIVE the next search resumes one position after the
+// start of the previous match instead of after its end.
+#define FPDFTEXT_MATCHCASE 0x00000001
+#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
+#define FPDFTEXT_CONSECUTIVE 0x00000004
+
+// Values compared against FPDF_CHAR_INFO::m_Flag by CPDF_TextPageFind and
+// CPDF_LinkExtract.
+#define FPDFTEXT_CHAR_ERROR -1
+#define FPDFTEXT_CHAR_NORMAL 0
+#define FPDFTEXT_CHAR_GENERATED 1
+#define FPDFTEXT_CHAR_UNUNICODE 2
+#define FPDFTEXT_CHAR_HYPHEN 3
+#define FPDFTEXT_CHAR_PIECE 4
+
+// Character and string constants used when splitting and matching text.
+// NOTE(review): TEXT_CHARRATIO_GAPDELTA has no user in the find/link code;
+// presumably it is consumed by cpdf_textpage.cpp -- confirm.
+#define TEXT_SPACE_CHAR L' '
+#define TEXT_LINEFEED_CHAR L'\n'
+#define TEXT_RETURN_CHAR L'\r'
+#define TEXT_EMPTY L""
+#define TEXT_SPACE L" "
+#define TEXT_RETURN_LINEFEED L"\r\n"
+#define TEXT_LINEFEED L"\n"
+#define TEXT_CHARRATIO_GAPDELTA 0.070
+
enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
enum class FPDFText_Direction { Left = -1, Right = 1 };