diff options
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 50 | ||||
-rw-r--r-- | core/fpdftext/cpdf_linkextract.h | 4 | ||||
-rw-r--r-- | core/fpdftext/cpdf_linkextract_unittest.cpp | 129 |
3 files changed, 112 insertions, 71 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 315cff12a0..98bf915519 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -121,9 +121,15 @@ void CPDF_LinkExtract::ParseLink() { } // Check for potential web URLs and email addresses. // Ftp address, file system links, data, blob etc. are not checked. - if (nCount > 5 && - (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { - m_LinkArray.push_back({start, nCount, strBeCheck}); + if (nCount > 5) { + int32_t nStartOffset; + int32_t nCountOverload; + if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) { + m_LinkArray.push_back( + {start + nStartOffset, nCountOverload, strBeCheck}); + } else if (CheckMailLink(&strBeCheck)) { + m_LinkArray.push_back({start, nCount, strBeCheck}); + } } } start = ++pos; @@ -136,13 +142,15 @@ void CPDF_LinkExtract::ParseLink() { } } -bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { +bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck, + int32_t* nStart, + int32_t* nCount) { static const wchar_t kHttpScheme[] = L"http"; static const FX_STRSIZE kHttpSchemeLen = FXSYS_len(kHttpScheme); static const wchar_t kWWWAddrStart[] = L"www."; static const FX_STRSIZE kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); - CFX_WideString str = strBeCheck; + CFX_WideString str = *strBeCheck; str.MakeLower(); FX_STRSIZE len = str.GetLength(); @@ -156,7 +164,9 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { FX_STRSIZE end = FindWebLinkEnding(str, off + 3); if (end > off + 3) { // Non-empty host name. - strBeCheck = strBeCheck.Mid(start, end - start + 1); + *nStart = start; + *nCount = end - start + 1; + *strBeCheck = strBeCheck->Mid(*nStart, *nCount); return true; } } @@ -168,15 +178,17 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { if (start != -1 && len > start + kWWWAddrStartLen) { FX_STRSIZE end = FindWebLinkEnding(str, start); if (end > start + kWWWAddrStartLen) { - strBeCheck = L"http://" + strBeCheck.Mid(start, end - start + 1); + *nStart = start; + *nCount = end - start + 1; + *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount); return true; } } return false; } -bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { - int aPos = str.Find(L'@'); +bool CPDF_LinkExtract::CheckMailLink(CFX_WideString* str) { + int aPos = str->Find(L'@'); // Invalid when no '@'. if (aPos < 1) return false; @@ -184,7 +196,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { // Check the local part. int pPos = aPos; // Used to track the position of '@' or '.'. for (int i = aPos - 1; i >= 0; i--) { - wchar_t ch = str.GetAt(i); + wchar_t ch = str->GetAt(i); if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) continue; @@ -196,7 +208,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { // End extracting for other invalid chars, '.' at the beginning, or // consecutive '.'. int removed_len = i == pPos - 1 ? i + 2 : i + 1; - str = str.Right(str.GetLength() - removed_len); + *str = str->Right(str->GetLength() - removed_len); break; } // Found a valid '.'. @@ -204,23 +216,23 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { } // Check the domain name part. - aPos = str.Find(L'@'); + aPos = str->Find(L'@'); if (aPos < 1) return false; - str.TrimRight(L'.'); + str->TrimRight(L'.'); // At least one '.' in domain name, but not at the beginning. // TODO(weili): RFC5322 allows domain names to be a local name without '.'. // Check whether we should remove this check. - int ePos = str.Find(L'.', aPos + 1); + int ePos = str->Find(L'.', aPos + 1); if (ePos == -1 || ePos == aPos + 1) return false; // Validate all other chars in domain name. - int nLen = str.GetLength(); + int nLen = str->GetLength(); pPos = 0; // Used to track the position of '.'. for (int i = aPos + 1; i < nLen; i++) { - wchar_t wch = str.GetAt(i); + wchar_t wch = str->GetAt(i); if (wch == L'-' || FXSYS_iswalnum(wch)) continue; @@ -229,7 +241,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { int host_end = i == pPos + 1 ? i - 2 : i - 1; if (pPos > 0 && host_end - aPos >= 3) { // Trim the ending invalid chars if there is at least one '.' and name. - str = str.Left(host_end + 1); + *str = str->Left(host_end + 1); break; } return false; @@ -237,8 +249,8 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { pPos = i; } - if (str.Find(L"mailto:") == -1) - str = L"mailto:" + str; + if (str->Find(L"mailto:") == -1) + *str = L"mailto:" + *str; return true; } diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h index 564d552548..31004577b9 100644 --- a/core/fpdftext/cpdf_linkextract.h +++ b/core/fpdftext/cpdf_linkextract.h @@ -28,8 +28,8 @@ class CPDF_LinkExtract { protected: void ParseLink(); - bool CheckWebLink(CFX_WideString& str); - bool CheckMailLink(CFX_WideString& str); + bool CheckWebLink(CFX_WideString* str, int32_t* nStart, int32_t* nCount); + bool CheckMailLink(CFX_WideString* str); private: struct Link { diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp index bd059862fd..de870b0cc1 100644 --- a/core/fpdftext/cpdf_linkextract_unittest.cpp +++ b/core/fpdftext/cpdf_linkextract_unittest.cpp @@ -32,7 +32,7 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) { }; for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) { CFX_WideString text_str(invalid_strs[i]); - EXPECT_FALSE(extractor.CheckMailLink(text_str)) << text_str.c_str(); + EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << text_str.c_str(); } // Check cases that can extract valid mail link. @@ -54,7 +54,7 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) { CFX_WideString text_str(valid_strs[i][0]); CFX_WideString expected_str(L"mailto:"); expected_str += valid_strs[i][1]; - EXPECT_TRUE(extractor.CheckMailLink(text_str)) << text_str.c_str(); + EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << text_str.c_str(); EXPECT_STREQ(expected_str.c_str(), text_str.c_str()); } } @@ -75,64 +75,93 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) { // Web addresses that in correct format that we don't handle. L"abc.example.com", // URL without scheme. }; + const int32_t DEFAULT_VALUE = -42; for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) { CFX_WideString text_str(invalid_cases[i]); - EXPECT_FALSE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + int32_t start_offset = DEFAULT_VALUE; + int32_t count = DEFAULT_VALUE; + EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count)) + << text_str.c_str(); + EXPECT_EQ(DEFAULT_VALUE, start_offset) << text_str.c_str(); + EXPECT_EQ(DEFAULT_VALUE, count) << text_str.c_str(); } // Check cases that can extract valid web link. // An array of {input_string, expected_extracted_web_link}. - const wchar_t* const valid_cases[][2] = { - {L"http://www.example.com", L"http://www.example.com"}, // standard URL. - {L"http://www.example.com:88", - L"http://www.example.com:88"}, // URL with port number. - {L"http://test@www.example.com", - L"http://test@www.example.com"}, // URL with username. - {L"http://test:test@example.com", - L"http://test:test@example.com"}, // URL with username and password. - {L"http://example", L"http://example"}, // URL with short domain name. - {L"http////www.server", L"http://www.server"}, // URL starts with "www.". - {L"http:/www.abc.com", L"http://www.abc.com"}, // URL starts with "www.". - {L"www.a.b.c", L"http://www.a.b.c"}, // URL starts with "www.". - {L"https://a.us", L"https://a.us"}, // Secure http URL. - {L"https://www.t.us", L"https://www.t.us"}, // Secure http URL. - {L"www.example-test.com", - L"http://www.example-test.com"}, // '-' in host is ok. - {L"www.example.com,", - L"http://www.example.com"}, // Trim ending invalid chars. - {L"www.example.com;(", - L"http://www.example.com"}, // Trim ending invalid chars. - {L"test:www.abc.com", L"http://www.abc.com"}, // Trim chars before URL. - {L"www.g.com..", L"http://www.g.com.."}, // Leave ending periods. - // Web link can contain IP address too. - {L"http://192.168.0.1", L"http://192.168.0.1"}, // IPv4 address. - {L"http://192.168.0.1:80", - L"http://192.168.0.1:80"}, // IPv4 address with port. - {L"http://[aa::00:bb::00:cc:00]", - L"http://[aa::00:bb::00:cc:00]"}, // IPv6 reference. - {L"http://[aa::00:bb::00:cc:00]:12", - L"http://[aa::00:bb::00:cc:00]:12"}, // IPv6 reference with port. - {L"http://[aa]:12", L"http://[aa]:12"}, // Not validate IP address. - {L"http://[aa]:12abc", L"http://[aa]:12"}, // Trim for IPv6 address. - {L"http://[aa]:", L"http://[aa]"}, // Trim for IPv6 address. + struct ValidCase { + const wchar_t* const input_string; + const wchar_t* const url_extracted; + const int32_t start_offset; + const int32_t count; + }; + const ValidCase valid_cases[] = { + {L"http://www.example.com", L"http://www.example.com", 0, + 22}, // standard URL. + {L"http://www.example.com:88", L"http://www.example.com:88", 0, + 25}, // URL with port number. + {L"http://test@www.example.com", L"http://test@www.example.com", 0, + 27}, // URL with username. + {L"http://test:test@example.com", L"http://test:test@example.com", 0, + 28}, // URL with username and password. + {L"http://example", L"http://example", 0, + 14}, // URL with short domain name. + {L"http////www.server", L"http://www.server", 8, + 10}, // URL starts with "www.". + {L"http:/www.abc.com", L"http://www.abc.com", 6, + 11}, // URL starts with "www.". + {L"www.a.b.c", L"http://www.a.b.c", 0, 9}, // URL starts with "www.". + {L"https://a.us", L"https://a.us", 0, 12}, // Secure http URL. + {L"https://www.t.us", L"https://www.t.us", 0, 16}, // Secure http URL. + {L"www.example-test.com", L"http://www.example-test.com", 0, + 20}, // '-' in host is ok. + {L"www.example.com,", L"http://www.example.com", 0, + 15}, // Trim ending invalid chars. + {L"www.example.com;(", L"http://www.example.com", 0, + 15}, // Trim ending invalid chars. + {L"test:www.abc.com", L"http://www.abc.com", 5, + 11}, // Trim chars before URL. + {L"www.g.com..", L"http://www.g.com..", 0, 11}, // Leave ending periods. + + // Web links can contain IP addresses too. + {L"http://192.168.0.1", L"http://192.168.0.1", 0, 18}, // IPv4 address. + {L"http://192.168.0.1:80", L"http://192.168.0.1:80", 0, + 21}, // IPv4 address with port. + {L"http://[aa::00:bb::00:cc:00]", L"http://[aa::00:bb::00:cc:00]", 0, + 28}, // IPv6 reference. + {L"http://[aa::00:bb::00:cc:00]:12", L"http://[aa::00:bb::00:cc:00]:12", + 0, 31}, // IPv6 reference with port. + {L"http://[aa]:12", L"http://[aa]:12", 0, + 14}, // Not validate IP address. + {L"http://[aa]:12abc", L"http://[aa]:12", 0, + 14}, // Trim for IPv6 address. + {L"http://[aa]:", L"http://[aa]", 0, 11}, // Trim for IPv6 address. + // Path and query parts can be anything. - {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*("}, - {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y"}, - {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6"}, - {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar"}, + {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*(", 0, 20}, + {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y", 0, 23}, + {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6", 0, 22}, + {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar", 0, + 30}, + // Invalid chars inside host name are ok as we don't validate them. - {L"http://ex[am]ple", L"http://ex[am]ple"}, - {L"http://:example.com", L"http://:example.com"}, - {L"http://((())/path?", L"http://((())/path?"}, - {L"http:////abc.server", L"http:////abc.server"}, + {L"http://ex[am]ple", L"http://ex[am]ple", 0, 16}, + {L"http://:example.com", L"http://:example.com", 0, 19}, + {L"http://((())/path?", L"http://((())/path?", 0, 18}, + {L"http:////abc.server", L"http:////abc.server", 0, 19}, + // Non-ASCII chars are not validated either. - {L"www.测试.net", L"http://www.测试.net"}, - {L"www.测试。net。", L"http://www.测试。net。"}, - {L"www.测试.net;", L"http://www.测试.net;"}, + {L"www.测试.net", L"http://www.测试.net", 0, 10}, + {L"www.测试。net。", L"http://www.测试。net。", 0, 11}, + {L"www.测试.net;", L"http://www.测试.net;", 0, 11}, }; for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) { - CFX_WideString text_str(valid_cases[i][0]); - EXPECT_TRUE(extractor.CheckWebLink(text_str)) << text_str.c_str(); - EXPECT_STREQ(valid_cases[i][1], text_str.c_str()); + CFX_WideString text_str(valid_cases[i].input_string); + int32_t start_offset = DEFAULT_VALUE; + int32_t count = DEFAULT_VALUE; + EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count)) + << text_str.c_str(); + EXPECT_STREQ(valid_cases[i].url_extracted, text_str.c_str()); + EXPECT_EQ(valid_cases[i].start_offset, start_offset) << text_str.c_str(); + EXPECT_EQ(valid_cases[i].count, count) << text_str.c_str(); } } |