diff options
Diffstat (limited to 'core/fpdftext/cpdf_linkextract.cpp')
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp new file mode 100644 index 0000000000..1677b67b55 --- /dev/null +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -0,0 +1,173 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fpdftext/include/cpdf_linkextract.h" + +#include <vector> + +#include "core/fpdftext/include/cpdf_textpage.h" +#include "core/fxcrt/include/fx_ext.h" +#include "core/fxcrt/include/fx_string.h" +#include "core/fxcrt/include/fx_system.h" + +CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) + : m_pTextPage(pTextPage) {} + +CPDF_LinkExtract::~CPDF_LinkExtract() {} + +void CPDF_LinkExtract::ExtractLinks() { + m_LinkArray.clear(); + if (!m_pTextPage->IsParsed()) + return; + + m_strPageText = m_pTextPage->GetPageText(0, -1); + if (m_strPageText.IsEmpty()) + return; + + ParseLink(); +} + +void CPDF_LinkExtract::ParseLink() { + int start = 0, pos = 0; + int TotalChar = m_pTextPage->CountChars(); + while (pos < TotalChar) { + FPDF_CHAR_INFO pageChar; + m_pTextPage->GetCharInfo(pos, &pageChar); + if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || + pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { + int nCount = pos - start; + if (pos == TotalChar - 1) + nCount++; + CFX_WideString strBeCheck; + strBeCheck = m_pTextPage->GetPageText(start, nCount); + if (strBeCheck.GetLength() > 5) { + while (strBeCheck.GetLength() > 0) { + FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); + if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { + strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); + nCount--; + } else { + break; + } + } + if (nCount > 5 && + (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { + m_LinkArray.push_back({start, nCount, strBeCheck}); + } + } + start = ++pos; + } else { + pos++; + } + } +} + +bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { + CFX_WideString str = strBeCheck; + str.MakeLower(); + if (str.Find(L"http://www.") != -1) { + strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); + return true; + } + if (str.Find(L"http://") != -1) { + strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); + return true; + } + if (str.Find(L"https://www.") != -1) { + strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); + return true; + } + if (str.Find(L"https://") != -1) { + strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); + return true; + } + if (str.Find(L"www.") != -1) { + strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); + strBeCheck = L"http://" + strBeCheck; + return true; + } + return false; +} + +bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { + int aPos = str.Find(L'@'); + // Invalid when no '@'. + if (aPos < 1) + return false; + + // Check the local part. + int pPos = aPos; // Used to track the position of '@' or '.'. + for (int i = aPos - 1; i >= 0; i--) { + FX_WCHAR ch = str.GetAt(i); + if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) + continue; + + if (ch != L'.' || i == pPos - 1 || i == 0) { + if (i == aPos - 1) { + // There is '.' or invalid char before '@'. + return FALSE; + } + // End extracting for other invalid chars, '.' at the beginning, or + // consecutive '.'. + int removed_len = i == pPos - 1 ? i + 2 : i + 1; + str = str.Right(str.GetLength() - removed_len); + break; + } + // Found a valid '.'. + pPos = i; + } + + // Check the domain name part. + aPos = str.Find(L'@'); + if (aPos < 1) + return false; + + str.TrimRight(L'.'); + // At least one '.' in domain name, but not at the beginning. + // TODO(weili): RFC5322 allows domain names to be a local name without '.'. + // Check whether we should remove this check. + int ePos = str.Find(L'.', aPos + 1); + if (ePos == -1 || ePos == aPos + 1) + return false; + + // Validate all other chars in domain name. + int nLen = str.GetLength(); + pPos = 0; // Used to track the position of '.'. + for (int i = aPos + 1; i < nLen; i++) { + FX_WCHAR wch = str.GetAt(i); + if (wch == L'-' || FXSYS_iswalnum(wch)) + continue; + + if (wch != L'.' || i == pPos + 1) { + // Domain name should end before invalid char. + int host_end = i == pPos + 1 ? i - 2 : i - 1; + if (pPos > 0 && host_end - aPos >= 3) { + // Trim the ending invalid chars if there is at least one '.' and name. + str = str.Left(host_end + 1); + break; + } + return false; + } + pPos = i; + } + + if (str.Find(L"mailto:") == -1) + str = L"mailto:" + str; + + return true; +} + +CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { + return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; +} + +std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { + if (index >= m_LinkArray.size()) + return std::vector<CFX_FloatRect>(); + + return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, + m_LinkArray[index].m_Count); +} |