Merge to XFA: Correctly extracting email addresses

An email address contains a user name part and a host name part. The user name may contain dashes and underscores, but must not have a leading, trailing, or doubled period. The host name does not allow a leading, trailing, or doubled period either. BUG=489107 TBR=thestig@chromium.org Review URL: https://codereview.chromium.org/1530763005 . (cherry picked from commit cc70b7b55c9edcd0ff038f59080699060fbbede1) Review URL: https://codereview.chromium.org/1532303002 .
author: Wei Li <weili@chromium.org> 2015-12-17 18:16:23 -0800
committer: Wei Li <weili@chromium.org> 2015-12-17 18:16:23 -0800
commit: 60eac0f7cebd682e93eb5d8a0950f31856099081 (patch)
tree: f797cf457d0a4b8eb916373ee3afcb7d9ca73abe /core/src
parent: c5d934dff4c2fa254b445dbf06899bb181df1c12 (diff)
download: pdfium-60eac0f7cebd682e93eb5d8a0950f31856099081.tar.xz
3 files changed, 96 insertions, 50 deletions
diff --git a/core/src/fpdftext/fpdf_text_int.cpp b/core/src/fpdftext/fpdf_text_int.cpp
index 1e6d54d133..d7a9c47519 100644
--- a/core/src/fpdftext/fpdf_text_int.cpp
+++ b/core/src/fpdftext/fpdf_text_int.cpp
@@ -14,6 +14,7 @@
 #include "core/include/fpdfapi/fpdf_resource.h"
 #include "core/include/fpdftext/fpdf_text.h"
 #include "core/include/fxcrt/fx_bidi.h"
+#include "core/include/fxcrt/fx_ext.h"
 #include "core/include/fxcrt/fx_ucd.h"
 #include "text_int.h"
 #include "third_party/base/nonstd_unique_ptr.h"
@@ -2607,80 +2608,69 @@ FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
   }
   return FALSE;
 }
-FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
-  str.MakeLower();
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
   int aPos = str.Find(L'@');
+  // Invalid when no '@'.
   if (aPos < 1) {
     return FALSE;
   }
-  if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
-    return FALSE;
-  }
-  int i;
-  for (i = aPos - 1; i >= 0; i--) {
+
+  // Check the local part.
+  int pPos = aPos;  // Used to track the position of '@' or '.'.
+  for (int i = aPos - 1; i >= 0; i--) {
     FX_WCHAR ch = str.GetAt(i);
-    if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') ||
-        (ch >= L'0' && ch <= L'9')) {
+    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
       continue;
-    } else {
+    }
+    if (ch != L'.' || i == pPos - 1 || i == 0) {
       if (i == aPos - 1) {
+        // There is '.' or invalid char before '@'.
         return FALSE;
       }
-      str = str.Right(str.GetLength() - i - 1);
+      // End extracting for other invalid chars, '.' at the beginning, or
+      // consecutive '.'.
+      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
+      str = str.Right(str.GetLength() - removed_len);
       break;
     }
+    // Found a valid '.'.
+    pPos = i;
   }
-  aPos = str.Find(L'@');
-  if (aPos < 1) {
-    return FALSE;
-  }
-  CFX_WideString strtemp = L"";
-  for (i = 0; i < aPos; i++) {
-    FX_WCHAR wch = str.GetAt(i);
-    if (wch >= L'a' && wch <= L'z') {
-      break;
-    } else {
-      strtemp = str.Right(str.GetLength() - i + 1);
-    }
-  }
-  if (strtemp != L"") {
-    str = strtemp;
-  }
+
+  // Check the domain name part.
   aPos = str.Find(L'@');
   if (aPos < 1) {
     return FALSE;
   }
   str.TrimRight(L'.');
-  strtemp = str;
-  int ePos = str.Find(L'.');
-  if (ePos == -1) {
+  // At least one '.' in domain name, but not at the beginning.
+  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
+  // Check whether we should remove this check.
+  int ePos = str.Find(L'.', aPos + 1);
+  if (ePos == -1 || ePos == aPos + 1) {
     return FALSE;
   }
-  while (ePos != -1) {
-    strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
-    ePos = strtemp.Find('.');
-  }
-  ePos = strtemp.GetLength();
-  for (i = 0; i < ePos; i++) {
-    FX_WCHAR wch = str.GetAt(i);
-    if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
-      continue;
-    } else {
-      str = str.Left(str.GetLength() - ePos + i + 1);
-      ePos = ePos - i - 1;
-      break;
-    }
-  }
+  // Validate all other chars in domain name.
   int nLen = str.GetLength();
-  for (i = aPos + 1; i < nLen - ePos; i++) {
+  pPos = 0;  // Used to track the position of '.'.
+  for (int i = aPos + 1; i < nLen; i++) {
     FX_WCHAR wch = str.GetAt(i);
-    if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
-        (wch >= L'0' && wch <= L'9')) {
+    if (wch == L'-' || FXSYS_iswalnum(wch)) {
       continue;
-    } else {
+    }
+    if (wch != L'.' || i == pPos + 1) {
+      // Domain name should end before invalid char.
+      int host_end = i == pPos + 1 ? i - 2 : i - 1;
+      if (pPos > 0 && host_end - aPos >= 3) {
+        // Trim the ending invalid chars if there is at least one '.' and name.
+        str = str.Left(host_end + 1);
+        break;
+      }
       return FALSE;
     }
+    pPos = i;
   }
+
   if (str.Find(L"mailto:") == -1) {
     str = L"mailto:" + str;
   }
diff --git a/core/src/fpdftext/fpdf_text_int_unittest.cpp b/core/src/fpdftext/fpdf_text_int_unittest.cpp
new file mode 100644
index 0000000000..b482f02be4
--- /dev/null
+++ b/core/src/fpdftext/fpdf_text_int_unittest.cpp
@@ -0,0 +1,56 @@
+// Copyright 2015 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+#include "core/src/fpdftext/text_int.h"
+
+// Test-only subclass of CPDF_LinkExtract that befriends the test cases below.
+class CPDF_TestLinkExtract : public CPDF_LinkExtract {
+ private:
+  // Add test cases as friends to access protected member functions.
+  // The CheckMailLink test calls the inherited CheckMailLink() directly.
+  FRIEND_TEST(fpdf_text_int, CheckMailLink);
+};
+
+TEST(fpdf_text_int, CheckMailLink) {
+  CPDF_TestLinkExtract extractor;
+  // Inputs for which CheckMailLink() must return false (one rule per entry).
+  const wchar_t* invalid_strs[] = {
+      L"",                // Empty input.
+      L"peter.pan",       // '@' is required.
+      L"abc@server",      // Domain name needs at least one '.'.
+      L"abc.@gmail.com",  // '.' can not immediately precede '@'.
+      L"abc@xyz&q.org",   // Domain name should not contain '&'.
+      L"abc@.xyz.org",    // Domain name should not start with '.'.
+      L"fan@g..com"       // Domain name should not have consecutive '.'.
+  };
+  for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) {
+    CFX_WideString text_str(invalid_strs[i]);
+    EXPECT_FALSE(extractor.CheckMailLink(text_str));
+  }
+
+  // Check cases that can extract valid mail link.
+  // An array of {input_string, expected_extracted_email_address}.
+  const wchar_t* valid_strs[][2] = {
+      {L"peter@abc.d", L"peter@abc.d"},
+      {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"},
+      {L"abc_@gmail.com", L"abc_@gmail.com"},  // '_' is ok before '@'.
+      {L"dummy-hi@gmail.com",
+       L"dummy-hi@gmail.com"},                  // '-' is ok in user name.
+      {L"a..df@gmail.com", L"df@gmail.com"},    // Stop at consecutive '.'.
+      {L".john@yahoo.com", L"john@yahoo.com"},  // Remove heading '.'.
+      {L"abc@xyz.org?/", L"abc@xyz.org"},       // Trim ending invalid chars.
+      {L"fan{abc@xyz.org", L"abc@xyz.org"},     // Trim beginning invalid chars.
+      {L"fan@g.com..", L"fan@g.com"},           // Trim the ending periods.
+      {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"},  // Keep the original case.
+  };
+  for (size_t i = 0; i < FX_ArraySize(valid_strs); ++i) {
+    CFX_WideString text_str(valid_strs[i][0]);
+    CFX_WideString expected_str(L"mailto:");
+    expected_str += valid_strs[i][1];
+    EXPECT_TRUE(extractor.CheckMailLink(text_str));
+    EXPECT_STREQ(text_str.c_str(), expected_str.c_str());
+  }
+}
diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h
index 4688bbf1ce..17d0b7e0f9 100644
--- a/core/src/fpdftext/text_int.h
+++ b/core/src/fpdftext/text_int.h
@@ -227,7 +227,7 @@ class CPDF_LinkExtract : public IPDF_LinkExtract {
   void ParseLink();
   void DeleteLinkList();
   FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
-  FX_BOOL CheckMailLink(CFX_WideString& str);
+  bool CheckMailLink(CFX_WideString& str);
   void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
 
  private:
author	Wei Li <weili@chromium.org>	2015-12-17 18:16:23 -0800
committer	Wei Li <weili@chromium.org>	2015-12-17 18:16:23 -0800
commit	60eac0f7cebd682e93eb5d8a0950f31856099081 (patch)
tree	f797cf457d0a4b8eb916373ee3afcb7d9ca73abe /core/src
parent	c5d934dff4c2fa254b445dbf06899bb181df1c12 (diff)
download	pdfium-60eac0f7cebd682e93eb5d8a0950f31856099081.tar.xz