// Copyright 2016 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include "core/fpdftext/cpdf_linkextract.h" #include #include "core/fpdftext/cpdf_textpage.h" #include "core/fxcrt/fx_extension.h" #include "core/fxcrt/fx_string.h" #include "core/fxcrt/fx_system.h" CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage) {} CPDF_LinkExtract::~CPDF_LinkExtract() {} void CPDF_LinkExtract::ExtractLinks() { m_LinkArray.clear(); if (!m_pTextPage->IsParsed()) return; m_strPageText = m_pTextPage->GetPageText(0, -1); if (m_strPageText.IsEmpty()) return; ParseLink(); } void CPDF_LinkExtract::ParseLink() { int start = 0; int pos = 0; int nTotalChar = m_pTextPage->CountChars(); bool bAfterHyphen = false; bool bLineBreak = false; while (pos < nTotalChar) { FPDF_CHAR_INFO pageChar; m_pTextPage->GetCharInfo(pos, &pageChar); if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) { int nCount = pos - start; if (pos == nTotalChar - 1) { nCount++; } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR || pageChar.m_Unicode == TEXT_RETURN_CHAR)) { // Handle text breaks with a hyphen to the next line. bLineBreak = true; pos++; continue; } CFX_WideString strBeCheck; strBeCheck = m_pTextPage->GetPageText(start, nCount); if (bLineBreak) { strBeCheck.Remove(TEXT_LINEFEED_CHAR); strBeCheck.Remove(TEXT_RETURN_CHAR); bLineBreak = false; } // Replace the generated code with the hyphen char. strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN); if (strBeCheck.GetLength() > 5) { while (strBeCheck.GetLength() > 0) { wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); nCount--; } else { break; } } if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { m_LinkArray.push_back({start, nCount, strBeCheck}); } } start = ++pos; } else { bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN || (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL && pageChar.m_Unicode == TEXT_HYPHEN_CHAR)); pos++; } } } bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { CFX_WideString str = strBeCheck; str.MakeLower(); if (str.Find(L"http://www.") != -1) { strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); return true; } if (str.Find(L"http://") != -1) { strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); return true; } if (str.Find(L"https://www.") != -1) { strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); return true; } if (str.Find(L"https://") != -1) { strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); return true; } if (str.Find(L"www.") != -1) { strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); strBeCheck = L"http://" + strBeCheck; return true; } return false; } bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { int aPos = str.Find(L'@'); // Invalid when no '@'. if (aPos < 1) return false; // Check the local part. int pPos = aPos; // Used to track the position of '@' or '.'. for (int i = aPos - 1; i >= 0; i--) { wchar_t ch = str.GetAt(i); if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) continue; if (ch != L'.' || i == pPos - 1 || i == 0) { if (i == aPos - 1) { // There is '.' or invalid char before '@'. return false; } // End extracting for other invalid chars, '.' at the beginning, or // consecutive '.'. int removed_len = i == pPos - 1 ? i + 2 : i + 1; str = str.Right(str.GetLength() - removed_len); break; } // Found a valid '.'. pPos = i; } // Check the domain name part. aPos = str.Find(L'@'); if (aPos < 1) return false; str.TrimRight(L'.'); // At least one '.' in domain name, but not at the beginning. // TODO(weili): RFC5322 allows domain names to be a local name without '.'. // Check whether we should remove this check. int ePos = str.Find(L'.', aPos + 1); if (ePos == -1 || ePos == aPos + 1) return false; // Validate all other chars in domain name. int nLen = str.GetLength(); pPos = 0; // Used to track the position of '.'. for (int i = aPos + 1; i < nLen; i++) { wchar_t wch = str.GetAt(i); if (wch == L'-' || FXSYS_iswalnum(wch)) continue; if (wch != L'.' || i == pPos + 1) { // Domain name should end before invalid char. int host_end = i == pPos + 1 ? i - 2 : i - 1; if (pPos > 0 && host_end - aPos >= 3) { // Trim the ending invalid chars if there is at least one '.' and name. str = str.Left(host_end + 1); break; } return false; } pPos = i; } if (str.Find(L"mailto:") == -1) str = L"mailto:" + str; return true; } CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; } std::vector CPDF_LinkExtract::GetRects(size_t index) const { if (index >= m_LinkArray.size()) return std::vector(); return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, m_LinkArray[index].m_Count); }