summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWei Li <weili@chromium.org>2015-12-17 17:46:58 -0800
committerWei Li <weili@chromium.org>2015-12-17 17:46:58 -0800
commitcc70b7b55c9edcd0ff038f59080699060fbbede1 (patch)
tree15e4137ce96d2a74450a8a84d2c4c3b31741f2c4
parent391783579c5a8aad346b56903b51e6551600550b (diff)
downloadpdfium-cc70b7b55c9edcd0ff038f59080699060fbbede1.tar.xz
Correctly extracting email addresses
An email address consists of a user name part and a host name part. The user name may contain dashes and underscores, but must not begin or end with a period or contain consecutive periods. The host name likewise must not begin or end with a period or contain consecutive periods. BUG=489107 R=jun_fang@foxitsoftware.com, thestig@chromium.org Review URL: https://codereview.chromium.org/1530763005 .
-rw-r--r--BUILD.gn1
-rw-r--r--core/include/fxcrt/fx_ext.h9
-rw-r--r--core/src/fpdftext/fpdf_text_int.cpp88
-rw-r--r--core/src/fpdftext/fpdf_text_int_unittest.cpp56
-rw-r--r--core/src/fpdftext/text_int.h2
-rw-r--r--pdfium.gyp1
6 files changed, 107 insertions, 50 deletions
diff --git a/BUILD.gn b/BUILD.gn
index 3b029b5fb0..fd23beea7b 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -729,6 +729,7 @@ test("pdfium_unittests") {
"core/src/fpdfapi/fpdf_page/fpdf_page_parser_old_unittest.cpp",
"core/src/fpdfapi/fpdf_parser/fpdf_parser_decode_unittest.cpp",
"core/src/fpdfapi/fpdf_parser/fpdf_parser_parser_unittest.cpp",
+ "core/src/fpdftext/fpdf_text_int_unittest.cpp",
"core/src/fxcodec/codec/fx_codec_jpx_unittest.cpp",
"core/src/fxcrt/fx_basic_bstring_unittest.cpp",
"core/src/fxcrt/fx_basic_memmgr_unittest.cpp",
diff --git a/core/include/fxcrt/fx_ext.h b/core/include/fxcrt/fx_ext.h
index e8d8bc7b5f..a4a92d2fb0 100644
--- a/core/include/fxcrt/fx_ext.h
+++ b/core/include/fxcrt/fx_ext.h
@@ -36,6 +36,15 @@ inline int32_t FXSYS_tolower(int32_t ch) {
inline int32_t FXSYS_toupper(int32_t ch) {
return ch < 'a' || ch > 'z' ? ch : (ch - 0x20);
}
+inline FX_BOOL FXSYS_iswalpha(wchar_t wch) {  // True only for 'A'-'Z' and 'a'-'z'.
+ return (wch >= L'A' && wch <= L'Z') || (wch >= L'a' && wch <= L'z');
+}
+inline FX_BOOL FXSYS_iswdigit(wchar_t wch) {  // True only for '0'-'9'.
+ return wch >= L'0' && wch <= L'9';
+}
+inline FX_BOOL FXSYS_iswalnum(wchar_t wch) {  // True for a letter or a digit as defined by the two helpers called below.
+ return FXSYS_iswalpha(wch) || FXSYS_iswdigit(wch);
+}
inline int FXSYS_toHexDigit(const FX_CHAR c) {
if (!std::isxdigit(c))
diff --git a/core/src/fpdftext/fpdf_text_int.cpp b/core/src/fpdftext/fpdf_text_int.cpp
index 1e6d54d133..d7a9c47519 100644
--- a/core/src/fpdftext/fpdf_text_int.cpp
+++ b/core/src/fpdftext/fpdf_text_int.cpp
@@ -14,6 +14,7 @@
#include "core/include/fpdfapi/fpdf_resource.h"
#include "core/include/fpdftext/fpdf_text.h"
#include "core/include/fxcrt/fx_bidi.h"
+#include "core/include/fxcrt/fx_ext.h"
#include "core/include/fxcrt/fx_ucd.h"
#include "text_int.h"
#include "third_party/base/nonstd_unique_ptr.h"
@@ -2607,80 +2608,69 @@ FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
}
return FALSE;
}
-FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
- str.MakeLower();
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
int aPos = str.Find(L'@');
+ // Invalid when no '@'.
if (aPos < 1) {
return FALSE;
}
- if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
- return FALSE;
- }
- int i;
- for (i = aPos - 1; i >= 0; i--) {
+
+ // Check the local part.
+ int pPos = aPos; // Used to track the position of '@' or '.'.
+ for (int i = aPos - 1; i >= 0; i--) {
FX_WCHAR ch = str.GetAt(i);
- if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') ||
- (ch >= L'0' && ch <= L'9')) {
+ if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
continue;
- } else {
+ }
+ if (ch != L'.' || i == pPos - 1 || i == 0) {
if (i == aPos - 1) {
+ // There is '.' or invalid char before '@'.
return FALSE;
}
- str = str.Right(str.GetLength() - i - 1);
+ // End extracting for other invalid chars, '.' at the beginning, or
+ // consecutive '.'.
+ int removed_len = i == pPos - 1 ? i + 2 : i + 1;
+ str = str.Right(str.GetLength() - removed_len);
break;
}
+ // Found a valid '.'.
+ pPos = i;
}
- aPos = str.Find(L'@');
- if (aPos < 1) {
- return FALSE;
- }
- CFX_WideString strtemp = L"";
- for (i = 0; i < aPos; i++) {
- FX_WCHAR wch = str.GetAt(i);
- if (wch >= L'a' && wch <= L'z') {
- break;
- } else {
- strtemp = str.Right(str.GetLength() - i + 1);
- }
- }
- if (strtemp != L"") {
- str = strtemp;
- }
+
+ // Check the domain name part.
aPos = str.Find(L'@');
if (aPos < 1) {
return FALSE;
}
str.TrimRight(L'.');
- strtemp = str;
- int ePos = str.Find(L'.');
- if (ePos == -1) {
+ // At least one '.' in domain name, but not at the beginning.
+ // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
+ // Check whether we should remove this check.
+ int ePos = str.Find(L'.', aPos + 1);
+ if (ePos == -1 || ePos == aPos + 1) {
return FALSE;
}
- while (ePos != -1) {
- strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
- ePos = strtemp.Find('.');
- }
- ePos = strtemp.GetLength();
- for (i = 0; i < ePos; i++) {
- FX_WCHAR wch = str.GetAt(i);
- if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
- continue;
- } else {
- str = str.Left(str.GetLength() - ePos + i + 1);
- ePos = ePos - i - 1;
- break;
- }
- }
+ // Validate all other chars in domain name.
int nLen = str.GetLength();
- for (i = aPos + 1; i < nLen - ePos; i++) {
+ pPos = 0; // Used to track the position of '.'.
+ for (int i = aPos + 1; i < nLen; i++) {
FX_WCHAR wch = str.GetAt(i);
- if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
- (wch >= L'0' && wch <= L'9')) {
+ if (wch == L'-' || FXSYS_iswalnum(wch)) {
continue;
- } else {
+ }
+ if (wch != L'.' || i == pPos + 1) {
+ // Domain name should end before invalid char.
+ int host_end = i == pPos + 1 ? i - 2 : i - 1;
+ if (pPos > 0 && host_end - aPos >= 3) {
+ // Trim the ending invalid chars if there is at least one '.' and name.
+ str = str.Left(host_end + 1);
+ break;
+ }
return FALSE;
}
+ pPos = i;
}
+
if (str.Find(L"mailto:") == -1) {
str = L"mailto:" + str;
}
diff --git a/core/src/fpdftext/fpdf_text_int_unittest.cpp b/core/src/fpdftext/fpdf_text_int_unittest.cpp
new file mode 100644
index 0000000000..b482f02be4
--- /dev/null
+++ b/core/src/fpdftext/fpdf_text_int_unittest.cpp
@@ -0,0 +1,56 @@
+// Copyright 2015 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+#include "core/src/fpdftext/text_int.h"
+
+// Test-only subclass of CPDF_LinkExtract; it declares nothing but friends.
+class CPDF_TestLinkExtract : public CPDF_LinkExtract {
+ private:
+ // Add test cases as friends so they can access protected member functions.
+ // The CheckMailLink test calls CheckMailLink() on an instance of this class.
+ FRIEND_TEST(fpdf_text_int, CheckMailLink);
+};
+
+TEST(fpdf_text_int, CheckMailLink) {
+ CPDF_TestLinkExtract extractor;
+ // Inputs from which no mail link may be extracted: CheckMailLink is false.
+ const wchar_t* invalid_strs[] = {
+ L"",
+ L"peter.pan", // '@' is required.
+ L"abc@server", // Domain name needs at least one '.'.
+ L"abc.@gmail.com", // '.' can not immediately precede '@'.
+ L"abc@xyz&q.org", // Domain name should not contain '&'.
+ L"abc@.xyz.org", // Domain name should not start with '.'.
+ L"fan@g..com" // Domain name should not have consecutive '.'.
+ };
+ for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) {
+ CFX_WideString text_str(invalid_strs[i]);
+ EXPECT_FALSE(extractor.CheckMailLink(text_str));
+ }
+
+ // Inputs from which a mail link is extracted; CheckMailLink rewrites the
+ // string in place. Each row is {input_string, expected_address_sans_mailto}.
+ const wchar_t* valid_strs[][2] = {
+ {L"peter@abc.d", L"peter@abc.d"},
+ {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"},
+ {L"abc_@gmail.com", L"abc_@gmail.com"}, // '_' is ok before '@'.
+ {L"dummy-hi@gmail.com",
+ L"dummy-hi@gmail.com"}, // '-' is ok in user name.
+ {L"a..df@gmail.com", L"df@gmail.com"}, // Stop at consecutive '.'.
+ {L".john@yahoo.com", L"john@yahoo.com"}, // Remove heading '.'.
+ {L"abc@xyz.org?/", L"abc@xyz.org"}, // Trim ending invalid chars.
+ {L"fan{abc@xyz.org", L"abc@xyz.org"}, // Trim beginning invalid chars.
+ {L"fan@g.com..", L"fan@g.com"}, // Trim the ending periods.
+ {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"}, // Keep the original case.
+ };
+ for (size_t i = 0; i < FX_ArraySize(valid_strs); ++i) {
+ CFX_WideString text_str(valid_strs[i][0]);
+ CFX_WideString expected_str(L"mailto:");  // Prefix added on success.
+ expected_str += valid_strs[i][1];
+ EXPECT_TRUE(extractor.CheckMailLink(text_str));
+ EXPECT_STREQ(text_str.c_str(), expected_str.c_str());
+ }
+}
diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h
index a8f4f37e98..0fe43f2c40 100644
--- a/core/src/fpdftext/text_int.h
+++ b/core/src/fpdftext/text_int.h
@@ -226,7 +226,7 @@ class CPDF_LinkExtract : public IPDF_LinkExtract {
void ParseLink();
void DeleteLinkList();
FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
- FX_BOOL CheckMailLink(CFX_WideString& str);
+ bool CheckMailLink(CFX_WideString& str);
void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
private:
diff --git a/pdfium.gyp b/pdfium.gyp
index 58d6b8ffcd..117cdb7913 100644
--- a/pdfium.gyp
+++ b/pdfium.gyp
@@ -722,6 +722,7 @@
'core/src/fpdfapi/fpdf_page/fpdf_page_parser_old_unittest.cpp',
'core/src/fpdfapi/fpdf_parser/fpdf_parser_decode_unittest.cpp',
'core/src/fpdfapi/fpdf_parser/fpdf_parser_parser_unittest.cpp',
+ 'core/src/fpdftext/fpdf_text_int_unittest.cpp',
'core/src/fxcodec/codec/fx_codec_jpx_unittest.cpp',
'core/src/fxcrt/fx_basic_bstring_unittest.cpp',
'core/src/fxcrt/fx_basic_memmgr_unittest.cpp',