From defe54dbf60459972ff1c295af332ccdbbe15a70 Mon Sep 17 00:00:00 2001
From: Henrique Nakashima
Date: Thu, 8 Jun 2017 11:57:57 -0400
Subject: Fixing click area for links with a URL with prepended text.

Bug: pdfium:655
Change-Id: Idd90be487d390f066a76140800096feead6b9e55
Reviewed-on: https://pdfium-review.googlesource.com/6310
Reviewed-by: dsinclair
Commit-Queue: Henrique Nakashima
---
 core/fpdftext/cpdf_linkextract.cpp          |  50 +++++++----
 core/fpdftext/cpdf_linkextract.h            |   4 +-
 core/fpdftext/cpdf_linkextract_unittest.cpp | 129 +++++++++++++++++-----------
 3 files changed, 112 insertions(+), 71 deletions(-)

diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 315cff12a0..98bf915519 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -121,9 +121,15 @@ void CPDF_LinkExtract::ParseLink() {
       }
       // Check for potential web URLs and email addresses.
       // Ftp address, file system links, data, blob etc. are not checked.
-      if (nCount > 5 &&
-          (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
-        m_LinkArray.push_back({start, nCount, strBeCheck});
+      if (nCount > 5) {
+        int32_t nStartOffset;
+        int32_t nCountOverload;
+        if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
+          m_LinkArray.push_back(
+              {start + nStartOffset, nCountOverload, strBeCheck});
+        } else if (CheckMailLink(&strBeCheck)) {
+          m_LinkArray.push_back({start, nCount, strBeCheck});
+        }
       }
     }
     start = ++pos;
@@ -136,13 +142,15 @@
   }
 }
 
-bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
+bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck,
+                                    int32_t* nStart,
+                                    int32_t* nCount) {
   static const wchar_t kHttpScheme[] = L"http";
   static const FX_STRSIZE kHttpSchemeLen = FXSYS_len(kHttpScheme);
   static const wchar_t kWWWAddrStart[] = L"www.";
   static const FX_STRSIZE kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
 
-  CFX_WideString str = strBeCheck;
+  CFX_WideString str = *strBeCheck;
   str.MakeLower();
 
   FX_STRSIZE len = str.GetLength();
@@ -156,7 +164,9 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
     if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
       FX_STRSIZE end = FindWebLinkEnding(str, off + 3);
       if (end > off + 3) {  // Non-empty host name.
-        strBeCheck = strBeCheck.Mid(start, end - start + 1);
+        *nStart = start;
+        *nCount = end - start + 1;
+        *strBeCheck = strBeCheck->Mid(*nStart, *nCount);
         return true;
       }
     }
@@ -168,15 +178,17 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
   if (start != -1 && len > start + kWWWAddrStartLen) {
     FX_STRSIZE end = FindWebLinkEnding(str, start);
     if (end > start + kWWWAddrStartLen) {
-      strBeCheck = L"http://" + strBeCheck.Mid(start, end - start + 1);
+      *nStart = start;
+      *nCount = end - start + 1;
+      *strBeCheck = L"http://" + strBeCheck->Mid(*nStart, *nCount);
       return true;
     }
   }
   return false;
 }
 
-bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
-  int aPos = str.Find(L'@');
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString* str) {
+  int aPos = str->Find(L'@');
   // Invalid when no '@'.
   if (aPos < 1)
     return false;
@@ -184,7 +196,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
   // Check the local part.
   int pPos = aPos;  // Used to track the position of '@' or '.'.
   for (int i = aPos - 1; i >= 0; i--) {
-    wchar_t ch = str.GetAt(i);
+    wchar_t ch = str->GetAt(i);
     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
       continue;
@@ -196,7 +208,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
       // End extracting for other invalid chars, '.' at the beginning, or
       // consecutive '.'.
       int removed_len = i == pPos - 1 ? i + 2 : i + 1;
-      str = str.Right(str.GetLength() - removed_len);
+      *str = str->Right(str->GetLength() - removed_len);
       break;
     }
     // Found a valid '.'.
@@ -204,23 +216,23 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
   }
 
   // Check the domain name part.
-  aPos = str.Find(L'@');
+  aPos = str->Find(L'@');
   if (aPos < 1)
     return false;
 
-  str.TrimRight(L'.');
+  str->TrimRight(L'.');
   // At least one '.' in domain name, but not at the beginning.
   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
   // Check whether we should remove this check.
-  int ePos = str.Find(L'.', aPos + 1);
+  int ePos = str->Find(L'.', aPos + 1);
   if (ePos == -1 || ePos == aPos + 1)
     return false;
 
   // Validate all other chars in domain name.
-  int nLen = str.GetLength();
+  int nLen = str->GetLength();
   pPos = 0;  // Used to track the position of '.'.
   for (int i = aPos + 1; i < nLen; i++) {
-    wchar_t wch = str.GetAt(i);
+    wchar_t wch = str->GetAt(i);
     if (wch == L'-' || FXSYS_iswalnum(wch))
       continue;
 
@@ -229,7 +241,7 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
     int host_end = i == pPos + 1 ? i - 2 : i - 1;
     if (pPos > 0 && host_end - aPos >= 3) {
       // Trim the ending invalid chars if there is at least one '.' and name.
-      str = str.Left(host_end + 1);
+      *str = str->Left(host_end + 1);
       break;
     }
     return false;
@@ -237,8 +249,8 @@ bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
     pPos = i;
   }
 
-  if (str.Find(L"mailto:") == -1)
-    str = L"mailto:" + str;
+  if (str->Find(L"mailto:") == -1)
+    *str = L"mailto:" + *str;
 
   return true;
 }
diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h
index 564d552548..31004577b9 100644
--- a/core/fpdftext/cpdf_linkextract.h
+++ b/core/fpdftext/cpdf_linkextract.h
@@ -28,8 +28,8 @@ class CPDF_LinkExtract {
 
  protected:
   void ParseLink();
-  bool CheckWebLink(CFX_WideString& str);
-  bool CheckMailLink(CFX_WideString& str);
+  bool CheckWebLink(CFX_WideString* str, int32_t* nStart, int32_t* nCount);
+  bool CheckMailLink(CFX_WideString* str);
 
  private:
   struct Link {
diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp
index bd059862fd..de870b0cc1 100644
--- a/core/fpdftext/cpdf_linkextract_unittest.cpp
+++ b/core/fpdftext/cpdf_linkextract_unittest.cpp
@@ -32,7 +32,7 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
   };
   for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) {
     CFX_WideString text_str(invalid_strs[i]);
-    EXPECT_FALSE(extractor.CheckMailLink(text_str)) << text_str.c_str();
+    EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
   }
 
   // Check cases that can extract valid mail link.
@@ -54,7 +54,7 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
     CFX_WideString text_str(valid_strs[i][0]);
     CFX_WideString expected_str(L"mailto:");
     expected_str += valid_strs[i][1];
-    EXPECT_TRUE(extractor.CheckMailLink(text_str)) << text_str.c_str();
+    EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
     EXPECT_STREQ(expected_str.c_str(), text_str.c_str());
   }
 }
@@ -75,64 +75,93 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
       // Web addresses in a correct format that we don't handle.
L"abc.example.com", // URL without scheme. }; + const int32_t DEFAULT_VALUE = -42; for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) { CFX_WideString text_str(invalid_cases[i]); - EXPECT_FALSE(extractor.CheckWebLink(text_str)) << text_str.c_str(); + int32_t start_offset = DEFAULT_VALUE; + int32_t count = DEFAULT_VALUE; + EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count)) + << text_str.c_str(); + EXPECT_EQ(DEFAULT_VALUE, start_offset) << text_str.c_str(); + EXPECT_EQ(DEFAULT_VALUE, count) << text_str.c_str(); } // Check cases that can extract valid web link. // An array of {input_string, expected_extracted_web_link}. - const wchar_t* const valid_cases[][2] = { - {L"http://www.example.com", L"http://www.example.com"}, // standard URL. - {L"http://www.example.com:88", - L"http://www.example.com:88"}, // URL with port number. - {L"http://test@www.example.com", - L"http://test@www.example.com"}, // URL with username. - {L"http://test:test@example.com", - L"http://test:test@example.com"}, // URL with username and password. - {L"http://example", L"http://example"}, // URL with short domain name. - {L"http////www.server", L"http://www.server"}, // URL starts with "www.". - {L"http:/www.abc.com", L"http://www.abc.com"}, // URL starts with "www.". - {L"www.a.b.c", L"http://www.a.b.c"}, // URL starts with "www.". - {L"https://a.us", L"https://a.us"}, // Secure http URL. - {L"https://www.t.us", L"https://www.t.us"}, // Secure http URL. - {L"www.example-test.com", - L"http://www.example-test.com"}, // '-' in host is ok. - {L"www.example.com,", - L"http://www.example.com"}, // Trim ending invalid chars. - {L"www.example.com;(", - L"http://www.example.com"}, // Trim ending invalid chars. - {L"test:www.abc.com", L"http://www.abc.com"}, // Trim chars before URL. - {L"www.g.com..", L"http://www.g.com.."}, // Leave ending periods. - // Web link can contain IP address too. - {L"http://192.168.0.1", L"http://192.168.0.1"}, // IPv4 address. - {L"http://192.168.0.1:80", - L"http://192.168.0.1:80"}, // IPv4 address with port. - {L"http://[aa::00:bb::00:cc:00]", - L"http://[aa::00:bb::00:cc:00]"}, // IPv6 reference. - {L"http://[aa::00:bb::00:cc:00]:12", - L"http://[aa::00:bb::00:cc:00]:12"}, // IPv6 reference with port. - {L"http://[aa]:12", L"http://[aa]:12"}, // Not validate IP address. - {L"http://[aa]:12abc", L"http://[aa]:12"}, // Trim for IPv6 address. - {L"http://[aa]:", L"http://[aa]"}, // Trim for IPv6 address. + struct ValidCase { + const wchar_t* const input_string; + const wchar_t* const url_extracted; + const int32_t start_offset; + const int32_t count; + }; + const ValidCase valid_cases[] = { + {L"http://www.example.com", L"http://www.example.com", 0, + 22}, // standard URL. + {L"http://www.example.com:88", L"http://www.example.com:88", 0, + 25}, // URL with port number. + {L"http://test@www.example.com", L"http://test@www.example.com", 0, + 27}, // URL with username. + {L"http://test:test@example.com", L"http://test:test@example.com", 0, + 28}, // URL with username and password. + {L"http://example", L"http://example", 0, + 14}, // URL with short domain name. + {L"http////www.server", L"http://www.server", 8, + 10}, // URL starts with "www.". + {L"http:/www.abc.com", L"http://www.abc.com", 6, + 11}, // URL starts with "www.". + {L"www.a.b.c", L"http://www.a.b.c", 0, 9}, // URL starts with "www.". + {L"https://a.us", L"https://a.us", 0, 12}, // Secure http URL. + {L"https://www.t.us", L"https://www.t.us", 0, 16}, // Secure http URL. 
+ {L"www.example-test.com", L"http://www.example-test.com", 0, + 20}, // '-' in host is ok. + {L"www.example.com,", L"http://www.example.com", 0, + 15}, // Trim ending invalid chars. + {L"www.example.com;(", L"http://www.example.com", 0, + 15}, // Trim ending invalid chars. + {L"test:www.abc.com", L"http://www.abc.com", 5, + 11}, // Trim chars before URL. + {L"www.g.com..", L"http://www.g.com..", 0, 11}, // Leave ending periods. + + // Web links can contain IP addresses too. + {L"http://192.168.0.1", L"http://192.168.0.1", 0, 18}, // IPv4 address. + {L"http://192.168.0.1:80", L"http://192.168.0.1:80", 0, + 21}, // IPv4 address with port. + {L"http://[aa::00:bb::00:cc:00]", L"http://[aa::00:bb::00:cc:00]", 0, + 28}, // IPv6 reference. + {L"http://[aa::00:bb::00:cc:00]:12", L"http://[aa::00:bb::00:cc:00]:12", + 0, 31}, // IPv6 reference with port. + {L"http://[aa]:12", L"http://[aa]:12", 0, + 14}, // Not validate IP address. + {L"http://[aa]:12abc", L"http://[aa]:12", 0, + 14}, // Trim for IPv6 address. + {L"http://[aa]:", L"http://[aa]", 0, 11}, // Trim for IPv6 address. + // Path and query parts can be anything. - {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*("}, - {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y"}, - {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6"}, - {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar"}, + {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*(", 0, 20}, + {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y", 0, 23}, + {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6", 0, 22}, + {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar", 0, + 30}, + // Invalid chars inside host name are ok as we don't validate them. - {L"http://ex[am]ple", L"http://ex[am]ple"}, - {L"http://:example.com", L"http://:example.com"}, - {L"http://((())/path?", L"http://((())/path?"}, - {L"http:////abc.server", L"http:////abc.server"}, + {L"http://ex[am]ple", L"http://ex[am]ple", 0, 16}, + {L"http://:example.com", L"http://:example.com", 0, 19}, + {L"http://((())/path?", L"http://((())/path?", 0, 18}, + {L"http:////abc.server", L"http:////abc.server", 0, 19}, + // Non-ASCII chars are not validated either. - {L"www.测试.net", L"http://www.测试.net"}, - {L"www.测试。net。", L"http://www.测试。net。"}, - {L"www.测试.net;", L"http://www.测试.net;"}, + {L"www.测试.net", L"http://www.测试.net", 0, 10}, + {L"www.测试。net。", L"http://www.测试。net。", 0, 11}, + {L"www.测试.net;", L"http://www.测试.net;", 0, 11}, }; for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) { - CFX_WideString text_str(valid_cases[i][0]); - EXPECT_TRUE(extractor.CheckWebLink(text_str)) << text_str.c_str(); - EXPECT_STREQ(valid_cases[i][1], text_str.c_str()); + CFX_WideString text_str(valid_cases[i].input_string); + int32_t start_offset = DEFAULT_VALUE; + int32_t count = DEFAULT_VALUE; + EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count)) + << text_str.c_str(); + EXPECT_STREQ(valid_cases[i].url_extracted, text_str.c_str()); + EXPECT_EQ(valid_cases[i].start_offset, start_offset) << text_str.c_str(); + EXPECT_EQ(valid_cases[i].count, count) << text_str.c_str(); } } -- cgit v1.2.3