diff options
Diffstat (limited to 'core')
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 100 |
-rw-r--r-- | core/fpdftext/fpdf_text_int_unittest.cpp | 91 |
2 files changed, 166 insertions, 25 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 56a3ab4ec7..315cff12a0 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -13,6 +13,54 @@ #include "core/fxcrt/fx_string.h" #include "core/fxcrt/fx_system.h" +namespace { + +// Find the end of a web link starting from offset |start|. +// The purpose of this function is to separate the URL from the surrounding +// context characters; we do not intend to fully validate the URL. +// |str| contains lower case characters only. +FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) { + FX_STRSIZE end = str.GetLength() - 1; + if (str.Find(L'/', start) != -1) { + // When there is a path and query after '/', most ASCII chars are allowed. + // We don't sanitize in this case. + return end; + } + + // When there is no path, it only has IP address or host name. + // Port is optional at the end. + if (str[start] == L'[') { + // IPv6 reference. + // Find the end of the reference. + end = str.Find(L']', start + 1); + if (end != -1 && end > start + 1) { // Has content inside brackets. + FX_STRSIZE len = str.GetLength(); + FX_STRSIZE off = end + 1; + if (off < len && str[off] == L':') { + off++; + while (off < len && str[off] >= L'0' && str[off] <= L'9') + off++; + if (off > end + 2 && off <= len) // At least one digit in port number. + end = off - 1; // |off| is offset of the first invalid char. + } + } + return end; + } + + // According to RFC1123, a host name only has alphanumeric chars, hyphens, + // and periods. A hyphen should not be at the end, though. + // Non-ASCII chars are ignored during checking.
+ while (end > start && str[end] < 0x80) { + if ((str[end] >= L'0' && str[end] <= L'9') || + (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') + break; + end--; + } + return end; +} + +} // namespace + CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) : m_pTextPage(pTextPage) {} @@ -71,6 +119,8 @@ void CPDF_LinkExtract::ParseLink() { break; } } + // Check for potential web URLs and email addresses. + // Ftp address, file system links, data, blob etc. are not checked. if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { m_LinkArray.push_back({start, nCount, strBeCheck}); @@ -87,28 +137,40 @@ void CPDF_LinkExtract::ParseLink() { } bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { + static const wchar_t kHttpScheme[] = L"http"; + static const FX_STRSIZE kHttpSchemeLen = FXSYS_len(kHttpScheme); + static const wchar_t kWWWAddrStart[] = L"www."; + static const FX_STRSIZE kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); + CFX_WideString str = strBeCheck; str.MakeLower(); - if (str.Find(L"http://www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); - return true; - } - if (str.Find(L"http://") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); - return true; - } - if (str.Find(L"https://www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); - return true; - } - if (str.Find(L"https://") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); - return true; + + FX_STRSIZE len = str.GetLength(); + // First, try to find the scheme. + FX_STRSIZE start = str.Find(kHttpScheme); + if (start != -1) { + FX_STRSIZE off = start + kHttpSchemeLen; // move after "http". + if (len > off + 4) { // At least "://<char>" follows. + if (str[off] == L's') // "https" scheme is accepted. 
+ off++; + if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { + FX_STRSIZE end = FindWebLinkEnding(str, off + 3); + if (end > off + 3) { // Non-empty host name. + strBeCheck = strBeCheck.Mid(start, end - start + 1); + return true; + } + } + } } - if (str.Find(L"www.") != -1) { - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); - strBeCheck = L"http://" + strBeCheck; - return true; + + // When there is no scheme, try to find url starting with "www.". + start = str.Find(kWWWAddrStart); + if (start != -1 && len > start + kWWWAddrStartLen) { + FX_STRSIZE end = FindWebLinkEnding(str, start); + if (end > start + kWWWAddrStartLen) { + strBeCheck = L"http://" + strBeCheck.Mid(start, end - start + 1); + return true; + } } return false; } diff --git a/core/fpdftext/fpdf_text_int_unittest.cpp b/core/fpdftext/fpdf_text_int_unittest.cpp index d7e48768bc..5730e5cc49 100644 --- a/core/fpdftext/fpdf_text_int_unittest.cpp +++ b/core/fpdftext/fpdf_text_int_unittest.cpp @@ -13,14 +13,15 @@ class CPDF_TestLinkExtract : public CPDF_LinkExtract { private: // Add test cases as friends to access protected member functions. - // Access CheckMailLink. + // Access CheckMailLink and CheckWebLink. FRIEND_TEST(fpdf_text_int, CheckMailLink); + FRIEND_TEST(fpdf_text_int, CheckWebLink); }; TEST(fpdf_text_int, CheckMailLink) { CPDF_TestLinkExtract extractor; // Check cases that fail to extract valid mail link. - const wchar_t* invalid_strs[] = { + const wchar_t* const invalid_strs[] = { L"", L"peter.pan", // '@' is required. L"abc@server", // Domain name needs at least one '.'. @@ -31,12 +32,12 @@ TEST(fpdf_text_int, CheckMailLink) { }; for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) { CFX_WideString text_str(invalid_strs[i]); - EXPECT_FALSE(extractor.CheckMailLink(text_str)); + EXPECT_FALSE(extractor.CheckMailLink(text_str)) << text_str.c_str(); } // Check cases that can extract valid mail link. 
// An array of {input_string, expected_extracted_email_address}. - const wchar_t* valid_strs[][2] = { + const wchar_t* const valid_strs[][2] = { {L"peter@abc.d", L"peter@abc.d"}, {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"}, {L"abc_@gmail.com", L"abc_@gmail.com"}, // '_' is ok before '@'. @@ -53,7 +54,85 @@ CFX_WideString text_str(valid_strs[i][0]); CFX_WideString expected_str(L"mailto:"); expected_str += valid_strs[i][1]; - EXPECT_TRUE(extractor.CheckMailLink(text_str)); - EXPECT_STREQ(text_str.c_str(), expected_str.c_str()); + EXPECT_TRUE(extractor.CheckMailLink(text_str)) << text_str.c_str(); + EXPECT_STREQ(expected_str.c_str(), text_str.c_str()); + } +} + +TEST(fpdf_text_int, CheckWebLink) { + CPDF_TestLinkExtract extractor; + // Check cases that fail to extract valid web link. + // The last few are legit web addresses that we don't handle now. + const wchar_t* const invalid_cases[] = { + L"", L"http", L"www.", L"https-and-www", + L"http:/abc.com", // Missing slash. + L"http://((()),", // Only invalid chars in host name. + L"ftp://example.com", // Ftp scheme is not supported. + L"http:example.com", // Missing slashes. + L"http//[example.com", // Invalid IPv6 address. + L"http//[00:00:00:00:00:00", // Invalid IPv6 address. + L"http//[]", // Empty IPv6 address. + // Web addresses in a correct format that we don't handle. + L"abc.example.com", // URL without scheme. + }; + for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) { + CFX_WideString text_str(invalid_cases[i]); + EXPECT_FALSE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + } + + // Check cases that can extract valid web link. + // An array of {input_string, expected_extracted_web_link}. + const wchar_t* const valid_cases[][2] = { + {L"http://www.example.com", L"http://www.example.com"}, // standard URL. + {L"http://www.example.com:88", + L"http://www.example.com:88"}, // URL with port number.
+ {L"http://test@www.example.com", + L"http://test@www.example.com"}, // URL with username. + {L"http://test:test@example.com", + L"http://test:test@example.com"}, // URL with username and password. + {L"http://example", L"http://example"}, // URL with short domain name. + {L"http////www.server", L"http://www.server"}, // URL starts with "www.". + {L"http:/www.abc.com", L"http://www.abc.com"}, // URL starts with "www.". + {L"www.a.b.c", L"http://www.a.b.c"}, // URL starts with "www.". + {L"https://a.us", L"https://a.us"}, // Secure http URL. + {L"https://www.t.us", L"https://www.t.us"}, // Secure http URL. + {L"www.example-test.com", + L"http://www.example-test.com"}, // '-' in host is ok. + {L"www.example.com,", + L"http://www.example.com"}, // Trim ending invalid chars. + {L"www.example.com;(", + L"http://www.example.com"}, // Trim ending invalid chars. + {L"test:www.abc.com", L"http://www.abc.com"}, // Trim chars before URL. + {L"www.g.com..", L"http://www.g.com.."}, // Leave ending periods. + // Web link can contain IP address too. + {L"http://192.168.0.1", L"http://192.168.0.1"}, // IPv4 address. + {L"http://192.168.0.1:80", + L"http://192.168.0.1:80"}, // IPv4 address with port. + {L"http://[aa::00:bb::00:cc:00]", + L"http://[aa::00:bb::00:cc:00]"}, // IPv6 reference. + {L"http://[aa::00:bb::00:cc:00]:12", + L"http://[aa::00:bb::00:cc:00]:12"}, // IPv6 reference with port. + {L"http://[aa]:12", L"http://[aa]:12"}, // Not validate IP address. + {L"http://[aa]:12abc", L"http://[aa]:12"}, // Trim for IPv6 address. + {L"http://[aa]:", L"http://[aa]"}, // Trim for IPv6 address. + // Path and query parts can be anything. + {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*("}, + {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y"}, + {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6"}, + {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar"}, + // Invalid chars inside host name are ok as we don't validate them. 
+ {L"http://ex[am]ple", L"http://ex[am]ple"}, + {L"http://:example.com", L"http://:example.com"}, + {L"http://((())/path?", L"http://((())/path?"}, + {L"http:////abc.server", L"http:////abc.server"}, + // Non-ASCII chars are not validated either. + {L"www.测试.net", L"http://www.测试.net"}, + {L"www.测试。net。", L"http://www.测试。net。"}, + {L"www.测试.net;", L"http://www.测试.net;"}, + }; + for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) { + CFX_WideString text_str(valid_cases[i][0]); + EXPECT_TRUE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + EXPECT_STREQ(valid_cases[i][1], text_str.c_str()); } }