From cc70b7b55c9edcd0ff038f59080699060fbbede1 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 17 Dec 2015 17:46:58 -0800 Subject: Correctly extracting email addresses An email address contains user name part and host name part. User name allows dash or underscore, but not leading/ending/double period. Host name doesn't allow leading/ending/double period either. BUG=489107 R=jun_fang@foxitsoftware.com, thestig@chromium.org Review URL: https://codereview.chromium.org/1530763005 . --- BUILD.gn | 1 + core/include/fxcrt/fx_ext.h | 9 +++ core/src/fpdftext/fpdf_text_int.cpp | 88 ++++++++++++---------------- core/src/fpdftext/fpdf_text_int_unittest.cpp | 56 ++++++++++++++++++ core/src/fpdftext/text_int.h | 2 +- pdfium.gyp | 1 + 6 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 core/src/fpdftext/fpdf_text_int_unittest.cpp diff --git a/BUILD.gn b/BUILD.gn index 3b029b5fb0..fd23beea7b 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -729,6 +729,7 @@ test("pdfium_unittests") { "core/src/fpdfapi/fpdf_page/fpdf_page_parser_old_unittest.cpp", "core/src/fpdfapi/fpdf_parser/fpdf_parser_decode_unittest.cpp", "core/src/fpdfapi/fpdf_parser/fpdf_parser_parser_unittest.cpp", + "core/src/fpdftext/fpdf_text_int_unittest.cpp", "core/src/fxcodec/codec/fx_codec_jpx_unittest.cpp", "core/src/fxcrt/fx_basic_bstring_unittest.cpp", "core/src/fxcrt/fx_basic_memmgr_unittest.cpp", diff --git a/core/include/fxcrt/fx_ext.h b/core/include/fxcrt/fx_ext.h index e8d8bc7b5f..a4a92d2fb0 100644 --- a/core/include/fxcrt/fx_ext.h +++ b/core/include/fxcrt/fx_ext.h @@ -36,6 +36,15 @@ inline int32_t FXSYS_tolower(int32_t ch) { inline int32_t FXSYS_toupper(int32_t ch) { return ch < 'a' || ch > 'z' ? 
ch : (ch - 0x20); } +inline FX_BOOL FXSYS_iswalpha(wchar_t wch) { + return (wch >= L'A' && wch <= L'Z') || (wch >= L'a' && wch <= L'z'); +} +inline FX_BOOL FXSYS_iswdigit(wchar_t wch) { + return wch >= L'0' && wch <= L'9'; +} +inline FX_BOOL FXSYS_iswalnum(wchar_t wch) { + return FXSYS_iswalpha(wch) || FXSYS_iswdigit(wch); +} inline int FXSYS_toHexDigit(const FX_CHAR c) { if (!std::isxdigit(c)) diff --git a/core/src/fpdftext/fpdf_text_int.cpp b/core/src/fpdftext/fpdf_text_int.cpp index 1e6d54d133..d7a9c47519 100644 --- a/core/src/fpdftext/fpdf_text_int.cpp +++ b/core/src/fpdftext/fpdf_text_int.cpp @@ -14,6 +14,7 @@ #include "core/include/fpdfapi/fpdf_resource.h" #include "core/include/fpdftext/fpdf_text.h" #include "core/include/fxcrt/fx_bidi.h" +#include "core/include/fxcrt/fx_ext.h" #include "core/include/fxcrt/fx_ucd.h" #include "text_int.h" #include "third_party/base/nonstd_unique_ptr.h" @@ -2607,80 +2608,69 @@ FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { } return FALSE; } -FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { - str.MakeLower(); +bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { int aPos = str.Find(L'@'); + // Invalid when no '@'. if (aPos < 1) { return FALSE; } - if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { - return FALSE; - } - int i; - for (i = aPos - 1; i >= 0; i--) { + + // Check the local part. + int pPos = aPos; // Used to track the position of '@' or '.'. + for (int i = aPos - 1; i >= 0; i--) { FX_WCHAR ch = str.GetAt(i); - if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || - (ch >= L'0' && ch <= L'9')) { + if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) { continue; - } else { + } + if (ch != L'.' || i == pPos - 1 || i == 0) { if (i == aPos - 1) { + // There is '.' or invalid char before '@'. return FALSE; } - str = str.Right(str.GetLength() - i - 1); + // End extracting for other invalid chars, '.' at the beginning, or + // consecutive '.'. 
+ int removed_len = i == pPos - 1 ? i + 2 : i + 1; + str = str.Right(str.GetLength() - removed_len); break; } + // Found a valid '.'. + pPos = i; } - aPos = str.Find(L'@'); - if (aPos < 1) { - return FALSE; - } - CFX_WideString strtemp = L""; - for (i = 0; i < aPos; i++) { - FX_WCHAR wch = str.GetAt(i); - if (wch >= L'a' && wch <= L'z') { - break; - } else { - strtemp = str.Right(str.GetLength() - i + 1); - } - } - if (strtemp != L"") { - str = strtemp; - } + + // Check the domain name part. aPos = str.Find(L'@'); if (aPos < 1) { return FALSE; } str.TrimRight(L'.'); - strtemp = str; - int ePos = str.Find(L'.'); - if (ePos == -1) { + // At least one '.' in domain name, but not at the beginning. + // TODO(weili): RFC5322 allows domain names to be a local name without '.'. + // Check whether we should remove this check. + int ePos = str.Find(L'.', aPos + 1); + if (ePos == -1 || ePos == aPos + 1) { return FALSE; } - while (ePos != -1) { - strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); - ePos = strtemp.Find('.'); - } - ePos = strtemp.GetLength(); - for (i = 0; i < ePos; i++) { - FX_WCHAR wch = str.GetAt(i); - if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { - continue; - } else { - str = str.Left(str.GetLength() - ePos + i + 1); - ePos = ePos - i - 1; - break; - } - } + // Validate all other chars in domain name. int nLen = str.GetLength(); - for (i = aPos + 1; i < nLen - ePos; i++) { + pPos = 0; // Used to track the position of '.'. + for (int i = aPos + 1; i < nLen; i++) { FX_WCHAR wch = str.GetAt(i); - if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || - (wch >= L'0' && wch <= L'9')) { + if (wch == L'-' || FXSYS_iswalnum(wch)) { continue; - } else { + } + if (wch != L'.' || i == pPos + 1) { + // Domain name should end before invalid char. + int host_end = i == pPos + 1 ? i - 2 : i - 1; + if (pPos > 0 && host_end - aPos >= 3) { + // Trim the ending invalid chars if there is at least one '.' and name. 
+ str = str.Left(host_end + 1); + break; + } return FALSE; } + pPos = i; } + if (str.Find(L"mailto:") == -1) { str = L"mailto:" + str; } diff --git a/core/src/fpdftext/fpdf_text_int_unittest.cpp b/core/src/fpdftext/fpdf_text_int_unittest.cpp new file mode 100644 index 0000000000..b482f02be4 --- /dev/null +++ b/core/src/fpdftext/fpdf_text_int_unittest.cpp @@ -0,0 +1,56 @@ +// Copyright 2015 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "testing/gtest/include/gtest/gtest.h" + +#include "core/src/fpdftext/text_int.h" + +// Class to help test functions in CPDF_LinkExtract class. +class CPDF_TestLinkExtract : public CPDF_LinkExtract { + private: + // Add test cases as friends to access protected member functions. + // Access CheckMailLink. + FRIEND_TEST(fpdf_text_int, CheckMailLink); +}; + +TEST(fpdf_text_int, CheckMailLink) { + CPDF_TestLinkExtract extractor; + // Check cases that fail to extract valid mail link. + const wchar_t* invalid_strs[] = { + L"", + L"peter.pan", // '@' is required. + L"abc@server", // Domain name needs at least one '.'. + L"abc.@gmail.com", // '.' can not immediately precede '@'. + L"abc@xyz&q.org", // Domain name should not contain '&'. + L"abc@.xyz.org", // Domain name should not start with '.'. + L"fan@g..com" // Domain name should not have consecutive '.'. + }; + for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) { + CFX_WideString text_str(invalid_strs[i]); + EXPECT_FALSE(extractor.CheckMailLink(text_str)); + } + + // Check cases that can extract valid mail link. + // An array of {input_string, expected_extracted_email_address}. + const wchar_t* valid_strs[][2] = { + {L"peter@abc.d", L"peter@abc.d"}, + {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"}, + {L"abc_@gmail.com", L"abc_@gmail.com"}, // '_' is ok before '@'. + {L"dummy-hi@gmail.com", + L"dummy-hi@gmail.com"}, // '-' is ok in user name. 
+ {L"a..df@gmail.com", L"df@gmail.com"}, // Stop at consecutive '.'. + {L".john@yahoo.com", L"john@yahoo.com"}, // Remove leading '.'. + {L"abc@xyz.org?/", L"abc@xyz.org"}, // Trim ending invalid chars. + {L"fan{abc@xyz.org", L"abc@xyz.org"}, // Trim beginning invalid chars. + {L"fan@g.com..", L"fan@g.com"}, // Trim the ending periods. + {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"}, // Keep the original case. + }; + for (size_t i = 0; i < FX_ArraySize(valid_strs); ++i) { + CFX_WideString text_str(valid_strs[i][0]); + CFX_WideString expected_str(L"mailto:"); + expected_str += valid_strs[i][1]; + EXPECT_TRUE(extractor.CheckMailLink(text_str)); + EXPECT_STREQ(text_str.c_str(), expected_str.c_str()); + } +} diff --git a/core/src/fpdftext/text_int.h b/core/src/fpdftext/text_int.h index a8f4f37e98..0fe43f2c40 100644 --- a/core/src/fpdftext/text_int.h +++ b/core/src/fpdftext/text_int.h @@ -226,7 +226,7 @@ class CPDF_LinkExtract : public IPDF_LinkExtract { void ParseLink(); void DeleteLinkList(); FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); - FX_BOOL CheckMailLink(CFX_WideString& str); + bool CheckMailLink(CFX_WideString& str); void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); private: diff --git a/pdfium.gyp b/pdfium.gyp index 58d6b8ffcd..117cdb7913 100644 --- a/pdfium.gyp +++ b/pdfium.gyp @@ -722,6 +722,7 @@ 'core/src/fpdfapi/fpdf_page/fpdf_page_parser_old_unittest.cpp', 'core/src/fpdfapi/fpdf_parser/fpdf_parser_decode_unittest.cpp', 'core/src/fpdfapi/fpdf_parser/fpdf_parser_parser_unittest.cpp', + 'core/src/fpdftext/fpdf_text_int_unittest.cpp', 'core/src/fxcodec/codec/fx_codec_jpx_unittest.cpp', 'core/src/fxcrt/fx_basic_bstring_unittest.cpp', 'core/src/fxcrt/fx_basic_memmgr_unittest.cpp', -- cgit v1.2.3