summaryrefslogtreecommitdiff
path: root/core/fpdftext/cpdf_linkextract.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'core/fpdftext/cpdf_linkextract.cpp')
-rw-r--r--core/fpdftext/cpdf_linkextract.cpp100
1 files changed, 81 insertions, 19 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 56a3ab4ec7..315cff12a0 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -13,6 +13,54 @@
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/fx_system.h"
+namespace {
+
+// Returns the offset of the last character of the web link in |str| whose host part begins at offset |start|.
+// The purpose of this function is to separate the url from the surrounding
+// context characters; we do not intend to fully validate the url.
+// |str| contains lower case characters only. The result is -1 when |str[start]| is L'[' and no L']' follows it (and no L'/' is present).
+FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) {
+ FX_STRSIZE end = str.GetLength() - 1; // Default: the link runs to the last character of |str|.
+ if (str.Find(L'/', start) != -1) {
+ // When there is a path and query after '/', most ASCII chars are allowed.
+ // We don't sanitize in this case, so everything up to the end of |str| is kept.
+ return end;
+ }
+
+ // When there is no path, it only has IP address or host name.
+ // Port is optional at the end.
+ if (str[start] == L'[') {
+ // IPv6 reference.
+ // Find the end of the reference, i.e. the first L']' after the opening bracket.
+ end = str.Find(L']', start + 1);
+ if (end != -1 && end > start + 1) { // Has content inside brackets.
+ FX_STRSIZE len = str.GetLength();
+ FX_STRSIZE off = end + 1;
+ if (off < len && str[off] == L':') { // Optional ":<port>" right after the bracket.
+ off++;
+ while (off < len && str[off] >= L'0' && str[off] <= L'9')
+ off++;
+ if (off > end + 2 && off <= len) // At least one digit in port number.
+ end = off - 1; // |off| is offset of the first invalid char.
+ }
+ }
+ return end; // May be -1 (no L']' found) or |start| + 1 (empty brackets); anything after the reference/port is cut off.
+ }
+
+ // According to RFC1123, host name only has alphanumeric chars, hyphens,
+ // and periods. Hyphen should not be at the end though.
+ // Non-ASCII chars are not checked: the backward scan below stops as soon as it meets one.
+ while (end > start && str[end] < 0x80) {
+ if ((str[end] >= L'0' && str[end] <= L'9') ||
+ (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.')
+ break; // Digit, lower case letter or period: accepted as the last char of the link.
+ end--; // Drop a trailing ASCII char that cannot end a host name.
+ }
+ return end;
+}
+
+} // namespace
+
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
: m_pTextPage(pTextPage) {}
@@ -71,6 +119,8 @@ void CPDF_LinkExtract::ParseLink() {
break;
}
}
+ // Check for potential web URLs and email addresses.
+ // FTP addresses, file system links, data, blob, etc. are not checked.
if (nCount > 5 &&
(CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
m_LinkArray.push_back({start, nCount, strBeCheck});
@@ -87,28 +137,40 @@ void CPDF_LinkExtract::ParseLink() {
}
bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
+ static const wchar_t kHttpScheme[] = L"http";
+ static const FX_STRSIZE kHttpSchemeLen = FXSYS_len(kHttpScheme);
+ static const wchar_t kWWWAddrStart[] = L"www.";
+ static const FX_STRSIZE kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
+
CFX_WideString str = strBeCheck;
str.MakeLower();
- if (str.Find(L"http://www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
- return true;
- }
- if (str.Find(L"http://") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
- return true;
- }
- if (str.Find(L"https://www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
- return true;
- }
- if (str.Find(L"https://") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
- return true;
+
+ FX_STRSIZE len = str.GetLength();
+ // First, try to find the scheme. Matching is done on the lowered copy |str|; the result is cut from |strBeCheck|.
+ FX_STRSIZE start = str.Find(kHttpScheme); // Only this first "http" occurrence is examined.
+ if (start != -1) {
+ FX_STRSIZE off = start + kHttpSchemeLen; // move after "http".
+ if (len > off + 4) { // At least "://<char>" follows; also keeps str[off + 2] in bounds after the "s" skip below.
+ if (str[off] == L's') // "https" scheme is accepted.
+ off++;
+ if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
+ FX_STRSIZE end = FindWebLinkEnding(str, off + 3); // Host starts right after "://".
+ if (end > off + 3) { // Non-empty host name (as written, at least two chars after "://"); also rejects -1.
+ strBeCheck = strBeCheck.Mid(start, end - start + 1); // |end| is inclusive.
+ return true;
+ }
+ }
+ }
}
- if (str.Find(L"www.") != -1) {
- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
- strBeCheck = L"http://" + strBeCheck;
- return true;
+
+ // When there is no scheme (or the scheme check above failed), try to find url starting with "www.".
+ start = str.Find(kWWWAddrStart);
+ if (start != -1 && len > start + kWWWAddrStartLen) {
+ FX_STRSIZE end = FindWebLinkEnding(str, start); // The host begins at "www." itself.
+ if (end > start + kWWWAddrStartLen) { // At least two chars follow "www.".
+ strBeCheck = L"http://" + strBeCheck.Mid(start, end - start + 1); // Prepend the implied scheme.
+ return true;
+ }
}
return false;
}