diff options
Diffstat (limited to 'core')
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 100 |
-rw-r--r-- | core/fpdftext/fpdf_text_int_unittest.cpp | 91 |
2 files changed, 166 insertions, 25 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 56a3ab4ec7..315cff12a0 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -13,6 +13,54 @@ #include "core/fxcrt/fx_string.h" #include "core/fxcrt/fx_system.h" +namespace { + +// Find the end of a web link starting from offset |start|. +// The purpose of this function is to separate the URL from the surrounding +// context characters; we do not intend to fully validate the URL. +// |str| contains lower case characters only. +FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) { + FX_STRSIZE end = str.GetLength() - 1; + if (str.Find(L'/', start) != -1) { + // When there is a path and query after '/', most ASCII chars are allowed. + // We don't sanitize in this case. + return end; + } + + // When there is no path, it only has IP address or host name. + // Port is optional at the end. + if (str[start] == L'[') { + // IPv6 reference. + // Find the end of the reference. + end = str.Find(L']', start + 1); + if (end != -1 && end > start + 1) { // Has content inside brackets. + FX_STRSIZE len = str.GetLength(); + FX_STRSIZE off = end + 1; + if (off < len && str[off] == L':') { + off++; + while (off < len && str[off] >= L'0' && str[off] <= L'9') + off++; + if (off > end + 2 && off <= len) // At least one digit in port number. + end = off - 1; // |off| is offset of the first invalid char. + } + } + return end; + } + + // According to RFC1123, a host name only has alphanumeric chars, hyphens, + // and periods. A hyphen should not be at the end, though. + // Non-ASCII chars are ignored during checking.
+ while (end > start && str[end] < 0x80) { + if ((str[end] >= L'0' && str[end] <= L'9') || + (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') + break; + end--; + } + return end; +} + +} // namespace + CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage) {} @@ -71,6 +119,8 @@ void CPDF_LinkExtract::ParseLink() { break; } } + // Check for potential web URLs and email addresses. + // Ftp address, file system links, data, blob etc. are not checked. if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { m_LinkArray.push_back({start, nCount, strBeCheck}); @@ -87,28 +137,40 @@ void CPDF_LinkExtract::ParseLink() { } bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { + static const wchar_t kHttpScheme[] = L"http"; + static const FX_STRSIZE kHttpSchemeLen = FXSYS_len(kHttpScheme); + static const wchar_t kWWWAddrStart[] = L"www."; + static const FX_STRSIZE kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); + CFX_WideString str = strBeCheck; str.MakeLower(); - if (str.Find(L"http://www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); - return true; - } - if (str.Find(L"http://") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); - return true; - } - if (str.Find(L"https://www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); - return true; - } - if (str.Find(L"https://") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); - return true; + + FX_STRSIZE len = str.GetLength(); + // First, try to find the scheme. + FX_STRSIZE start = str.Find(kHttpScheme); + if (start != -1) { + FX_STRSIZE off = start + kHttpSchemeLen; // move after "http". + if (len > off + 4) { // At least "://<char>" follows. + if (str[off] == L's') // "https" scheme is accepted. 
+ off++; + if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { + FX_STRSIZE end = FindWebLinkEnding(str, off + 3); + if (end > off + 3) { // Non-empty host name. + strBeCheck = strBeCheck.Mid(start, end - start + 1); + return true; + } + } + } } - if (str.Find(L"www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); - strBeCheck = L"http://" + strBeCheck; - return true; + + // When there is no scheme, try to find url starting with "www.". + start = str.Find(kWWWAddrStart); + if (start != -1 && len > start + kWWWAddrStartLen) { + FX_STRSIZE end = FindWebLinkEnding(str, start); + if (end > start + kWWWAddrStartLen) { + strBeCheck = L"http://" + strBeCheck.Mid(start, end - start + 1); + return true; + } } return false; } diff --git a/core/fpdftext/fpdf_text_int_unittest.cpp b/core/fpdftext/fpdf_text_int_unittest.cpp index d7e48768bc..5730e5cc49 100644 --- a/core/fpdftext/fpdf_text_int_unittest.cpp +++ b/core/fpdftext/fpdf_text_int_unittest.cpp @@ -13,14 +13,15 @@ class CPDF_TestLinkExtract : public CPDF_LinkExtract { private: // Add test cases as friends to access protected member functions. - // Access CheckMailLink. + // Access CheckMailLink and CheckWebLink. FRIEND_TEST(fpdf_text_int, CheckMailLink); + FRIEND_TEST(fpdf_text_int, CheckWebLink); }; TEST(fpdf_text_int, CheckMailLink) { CPDF_TestLinkExtract extractor; // Check cases that fail to extract valid mail link. - const wchar_t* invalid_strs[] = { + const wchar_t* const invalid_strs[] = { L"", L"peter.pan", // '@' is required. L"abc@server", // Domain name needs at least one '.'. @@ -31,12 +32,12 @@ TEST(fpdf_text_int, CheckMailLink) { }; for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) { CFX_WideString text_str(invalid_strs[i]); - EXPECT_FALSE(extractor.CheckMailLink(text_str)); + EXPECT_FALSE(extractor.CheckMailLink(text_str)) << text_str.c_str(); } // Check cases that can extract valid mail link. 
// An array of {input_string, expected_extracted_email_address}. - const wchar_t* valid_strs[][2] = { + const wchar_t* const valid_strs[][2] = { {L"peter@abc.d", L"peter@abc.d"}, {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"}, {L"abc_@gmail.com", L"abc_@gmail.com"}, // '_' is ok before '@'. @@ -53,7 +54,85 @@ CFX_WideString text_str(valid_strs[i][0]); CFX_WideString expected_str(L"mailto:"); expected_str += valid_strs[i][1]; - EXPECT_TRUE(extractor.CheckMailLink(text_str)); - EXPECT_STREQ(text_str.c_str(), expected_str.c_str()); + EXPECT_TRUE(extractor.CheckMailLink(text_str)) << text_str.c_str(); + EXPECT_STREQ(expected_str.c_str(), text_str.c_str()); + } +} + +TEST(fpdf_text_int, CheckWebLink) { + CPDF_TestLinkExtract extractor; + // Check cases that fail to extract valid web link. + // The last few are legit web addresses that we don't handle now. + const wchar_t* const invalid_cases[] = { + L"", L"http", L"www.", L"https-and-www", + L"http:/abc.com", // Missing slash. + L"http://((()),", // Only invalid chars in host name. + L"ftp://example.com", // Ftp scheme is not supported. + L"http:example.com", // Missing slashes. + L"http//[example.com", // Invalid IPv6 address. + L"http//[00:00:00:00:00:00", // Invalid IPv6 address. + L"http//[]", // Empty IPv6 address. + // Web addresses in a correct format that we don't handle. + L"abc.example.com", // URL without scheme. + }; + for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) { + CFX_WideString text_str(invalid_cases[i]); + EXPECT_FALSE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + } + + // Check cases that can extract valid web link. + // An array of {input_string, expected_extracted_web_link}. + const wchar_t* const valid_cases[][2] = { + {L"http://www.example.com", L"http://www.example.com"}, // standard URL. + {L"http://www.example.com:88", + L"http://www.example.com:88"}, // URL with port number.
+ {L"http://test@www.example.com", + L"http://test@www.example.com"}, // URL with username. + {L"http://test:test@example.com", + L"http://test:test@example.com"}, // URL with username and password. + {L"http://example", L"http://example"}, // URL with short domain name. + {L"http////www.server", L"http://www.server"}, // URL starts with "www.". + {L"http:/www.abc.com", L"http://www.abc.com"}, // URL starts with "www.". + {L"www.a.b.c", L"http://www.a.b.c"}, // URL starts with "www.". + {L"https://a.us", L"https://a.us"}, // Secure http URL. + {L"https://www.t.us", L"https://www.t.us"}, // Secure http URL. + {L"www.example-test.com", + L"http://www.example-test.com"}, // '-' in host is ok. + {L"www.example.com,", + L"http://www.example.com"}, // Trim ending invalid chars. + {L"www.example.com;(", + L"http://www.example.com"}, // Trim ending invalid chars. + {L"test:www.abc.com", L"http://www.abc.com"}, // Trim chars before URL. + {L"www.g.com..", L"http://www.g.com.."}, // Leave ending periods. + // Web link can contain IP address too. + {L"http://192.168.0.1", L"http://192.168.0.1"}, // IPv4 address. + {L"http://192.168.0.1:80", + L"http://192.168.0.1:80"}, // IPv4 address with port. + {L"http://[aa::00:bb::00:cc:00]", + L"http://[aa::00:bb::00:cc:00]"}, // IPv6 reference. + {L"http://[aa::00:bb::00:cc:00]:12", + L"http://[aa::00:bb::00:cc:00]:12"}, // IPv6 reference with port. + {L"http://[aa]:12", L"http://[aa]:12"}, // Not validate IP address. + {L"http://[aa]:12abc", L"http://[aa]:12"}, // Trim for IPv6 address. + {L"http://[aa]:", L"http://[aa]"}, // Trim for IPv6 address. + // Path and query parts can be anything. + {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*("}, + {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y"}, + {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6"}, + {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar"}, + // Invalid chars inside host name are ok as we don't validate them. 
+ {L"http://ex[am]ple", L"http://ex[am]ple"}, + {L"http://:example.com", L"http://:example.com"}, + {L"http://((())/path?", L"http://((())/path?"}, + {L"http:////abc.server", L"http:////abc.server"}, + // Non-ASCII chars are not validated either. + {L"www.测试.net", L"http://www.测试.net"}, + {L"www.测试。net。", L"http://www.测试。net。"}, + {L"www.测试.net;", L"http://www.测试.net;"}, + }; + for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) { + CFX_WideString text_str(valid_cases[i][0]); + EXPECT_TRUE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + EXPECT_STREQ(valid_cases[i][1], text_str.c_str()); } }