From c6fd58d69bd8895137364c4a1b0bcd08a2ce8c46 Mon Sep 17 00:00:00 2001
From: Henrique Nakashima
Date: Fri, 9 Jun 2017 13:10:03 -0400
Subject: Trimming brackets and quotes outside URLs.

Bug: pdfium:655
Change-Id: Ibf4217b35b613d21d3e8e060608b502ef79acd9e
Reviewed-on: https://pdfium-review.googlesource.com/6392
Commit-Queue: Henrique Nakashima
Reviewed-by: Lei Zhang
---
 core/fpdftext/cpdf_linkextract.cpp          | 64 +++++++++++++++++++++++++----
 core/fpdftext/cpdf_linkextract_unittest.cpp | 46 +++++++++++++++------
 2 files changed, 88 insertions(+), 22 deletions(-)

diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 98bf915519..5179f732c9 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -15,12 +15,13 @@ namespace {
-// Find the end of a web link starting from offset |start|.
-// The purpose of this function is to separate url from the surrounding context
-// characters, we do not intend to fully validate the url.
-// |str| contains lower case characters only.
-FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) {
-  FX_STRSIZE end = str.GetLength() - 1;
+// Find the end of a web link starting from offset |start| and ending at offset
+// |end|. The purpose of this function is to separate url from the surrounding
+// context characters, we do not intend to fully validate the url. |str|
+// contains lower case characters only.
+FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str,
+                             FX_STRSIZE start,
+                             FX_STRSIZE end) {
   if (str.Find(L'/', start) != -1) {
     // When there is a path and query after '/', most ASCII chars are allowed.
     // We don't sanitize in this case.
@@ -59,6 +60,46 @@ FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) {
   return end;
 }
 
+// Remove characters from the end of |str|, delimited by |start| and |end|, up
+// to and including |charToFind|. No-op if |charToFind| is not present. Updates
+// |end| if characters were removed.
+void TrimBackwardsToChar(const CFX_WideString& str,
+                         wchar_t charToFind,
+                         FX_STRSIZE start,
+                         FX_STRSIZE* end) {
+  for (FX_STRSIZE pos = *end; pos >= start; pos--) {
+    if (str[pos] == charToFind) {
+      *end = pos - 1;
+      break;
+    }
+  }
+}
+
+// Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
+// |start| and |end| in |str|. Matches a closing bracket or quote for each
+// opening character and, if present, removes everything afterwards. Returns
+// the new end position for the string.
+FX_STRSIZE TrimExternalBracketsFromWebLink(const CFX_WideString& str,
+                                           FX_STRSIZE start,
+                                           FX_STRSIZE end) {
+  for (FX_STRSIZE pos = 0; pos < start; pos++) {
+    if (str[pos] == '(') {
+      TrimBackwardsToChar(str, ')', start, &end);
+    } else if (str[pos] == '[') {
+      TrimBackwardsToChar(str, ']', start, &end);
+    } else if (str[pos] == '{') {
+      TrimBackwardsToChar(str, '}', start, &end);
+    } else if (str[pos] == '<') {
+      TrimBackwardsToChar(str, '>', start, &end);
+    } else if (str[pos] == '"') {
+      TrimBackwardsToChar(str, '"', start, &end);
+    } else if (str[pos] == '\'') {
+      TrimBackwardsToChar(str, '\'', start, &end);
+    }
+  }
+  return end;
+}
+
 }  // namespace
 
 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
@@ -162,8 +203,11 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck,
       if (str[off] == L's')  // "https" scheme is accepted.
         off++;
       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
-        FX_STRSIZE end = FindWebLinkEnding(str, off + 3);
-        if (end > off + 3) {  // Non-empty host name.
+        off += 3;
+        FX_STRSIZE end =
+            TrimExternalBracketsFromWebLink(str, start, str.GetLength() - 1);
+        end = FindWebLinkEnding(str, off, end);
+        if (end > off) {  // Non-empty host name.
           *nStart = start;
           *nCount = end - start + 1;
           *strBeCheck = strBeCheck->Mid(*nStart, *nCount);
@@ -176,7 +220,9 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck,
   // When there is no scheme, try to find url starting with "www.".
   start = str.Find(kWWWAddrStart);
   if (start != -1 && len > start + kWWWAddrStartLen) {
-    FX_STRSIZE end = FindWebLinkEnding(str, start);
+    FX_STRSIZE end =
+        TrimExternalBracketsFromWebLink(str, start, str.GetLength() - 1);
+    end = FindWebLinkEnding(str, start, end);
     if (end > start + kWWWAddrStartLen) {
       *nStart = start;
       *nCount = end - start + 1;
diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp
index de870b0cc1..efeb53ebe2 100644
--- a/core/fpdftext/cpdf_linkextract_unittest.cpp
+++ b/core/fpdftext/cpdf_linkextract_unittest.cpp
@@ -31,8 +31,9 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
       L"fan@g..com"  // Domain name should not have consecutive '.'
   };
   for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) {
-    CFX_WideString text_str(invalid_strs[i]);
-    EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
+    const wchar_t* const input = invalid_strs[i];
+    CFX_WideString text_str(input);
+    EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << input;
   }
 
   // Check cases that can extract valid mail link.
@@ -51,10 +52,11 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
       {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"},  // Keep the original case.
   };
   for (size_t i = 0; i < FX_ArraySize(valid_strs); ++i) {
-    CFX_WideString text_str(valid_strs[i][0]);
+    const wchar_t* const input = valid_strs[i][0];
+    CFX_WideString text_str(input);
     CFX_WideString expected_str(L"mailto:");
     expected_str += valid_strs[i][1];
-    EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
+    EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << input;
     EXPECT_STREQ(expected_str.c_str(), text_str.c_str());
   }
 }
@@ -77,13 +79,14 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
   };
   const int32_t DEFAULT_VALUE = -42;
   for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) {
-    CFX_WideString text_str(invalid_cases[i]);
+    const wchar_t* const input = invalid_cases[i];
+    CFX_WideString text_str(input);
     int32_t start_offset = DEFAULT_VALUE;
     int32_t count = DEFAULT_VALUE;
     EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count))
-        << text_str.c_str();
-    EXPECT_EQ(DEFAULT_VALUE, start_offset) << text_str.c_str();
-    EXPECT_EQ(DEFAULT_VALUE, count) << text_str.c_str();
+        << input;
+    EXPECT_EQ(DEFAULT_VALUE, start_offset) << input;
+    EXPECT_EQ(DEFAULT_VALUE, count) << input;
   }
 
   // Check cases that can extract valid web link.
@@ -119,7 +122,23 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
       {L"www.example.com;(", L"http://www.example.com", 0,
        15},  // Trim ending invalid chars.
       {L"test:www.abc.com", L"http://www.abc.com", 5,
-       11},  // Trim chars before URL.
+       11},  // Trim chars before URL.
+      {L"(http://www.abc.com)", L"http://www.abc.com", 1,
+       18},  // Trim external brackets.
+      {L"0(http://www.abc.com)0", L"http://www.abc.com", 2,
+       18},  // Trim chars outside brackets as well.
+      {L"0(www.abc.com)0", L"http://www.abc.com", 2,
+       11},  // Links without http should also have brackets trimmed.
+      {L"http://www.abc.com)0", L"http://www.abc.com)0", 0,
+       20},  // Do not trim brackets that were not opened.
+      {L"{(<http://www.abc.com>)}", L"http://www.abc.com", 3,
+       18},  // Trim chars with multiple levels of brackets.
+      {L"[http://www.abc.com/z(1)]", L"http://www.abc.com/z(1)", 1,
+       23},  // Brackets opened inside the URL should not be trimmed.
+      {L"(http://www.abc.com/z(1))", L"http://www.abc.com/z(1)", 1,
+       23},  // Brackets opened inside the URL should not be trimmed.
+      {L"\"http://www.abc.com\"", L"http://www.abc.com", 1,
+       18},  // External quotes should also be trimmed.
       {L"www.g.com..", L"http://www.g.com..", 0,
        11},  // Leave ending periods.
       // Web links can contain IP addresses too.
@@ -155,13 +174,14 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
       {L"www.测试.net;", L"http://www.测试.net;", 0, 11},
   };
   for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) {
-    CFX_WideString text_str(valid_cases[i].input_string);
+    const wchar_t* const input = valid_cases[i].input_string;
+    CFX_WideString text_str(input);
     int32_t start_offset = DEFAULT_VALUE;
     int32_t count = DEFAULT_VALUE;
     EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count))
-        << text_str.c_str();
+        << input;
     EXPECT_STREQ(valid_cases[i].url_extracted, text_str.c_str());
-    EXPECT_EQ(valid_cases[i].start_offset, start_offset) << text_str.c_str();
-    EXPECT_EQ(valid_cases[i].count, count) << text_str.c_str();
+    EXPECT_EQ(valid_cases[i].start_offset, start_offset) << input;
+    EXPECT_EQ(valid_cases[i].count, count) << input;
   }
 }
--
cgit v1.2.3
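A note for readers who want to try the trimming logic outside of PDFium: the sketch
below is a minimal standalone reimplementation of the two helpers added by this patch,
using std::wstring and size_t in place of CFX_WideString and FX_STRSIZE so that it
builds with any C++11 compiler. It illustrates the approach only and is not the code
that ships in core/fpdftext; in particular, the backwards loop is written defensively
for unsigned indices, which the in-tree code does not need.

#include <iostream>
#include <string>

// Standalone sketch of TrimBackwardsToChar(): scan backwards from |*end|
// toward |start|; if |charToFind| is found, move |*end| to just before it.
// No-op when |charToFind| is not present.
void TrimBackwardsToChar(const std::wstring& str,
                         wchar_t charToFind,
                         size_t start,
                         size_t* end) {
  // |pos + 1 > start| is equivalent to |pos >= start| but stays well-behaved
  // when |start| is 0 and the unsigned |pos| wraps around after 0.
  for (size_t pos = *end; pos + 1 > start; pos--) {
    if (str[pos] == charToFind) {
      *end = pos - 1;
      break;
    }
  }
}

// Standalone sketch of TrimExternalBracketsFromWebLink(): for every opening
// bracket or quote that appears before the URL (i.e. before |start|), drop a
// matching closing character and everything after it from the end of the URL.
size_t TrimExternalBracketsFromWebLink(const std::wstring& str,
                                       size_t start,
                                       size_t end) {
  for (size_t pos = 0; pos < start; pos++) {
    if (str[pos] == L'(')
      TrimBackwardsToChar(str, L')', start, &end);
    else if (str[pos] == L'[')
      TrimBackwardsToChar(str, L']', start, &end);
    else if (str[pos] == L'{')
      TrimBackwardsToChar(str, L'}', start, &end);
    else if (str[pos] == L'<')
      TrimBackwardsToChar(str, L'>', start, &end);
    else if (str[pos] == L'"')
      TrimBackwardsToChar(str, L'"', start, &end);
    else if (str[pos] == L'\'')
      TrimBackwardsToChar(str, L'\'', start, &end);
  }
  return end;
}

int main() {
  // Mirrors the {L"(http://www.abc.com)", L"http://www.abc.com", 1, 18} unit
  // test case: the URL starts at offset 1, and the trailing ')' is dropped
  // because a '(' was seen before the URL.
  const std::wstring text = L"(http://www.abc.com)";
  const size_t start = 1;
  size_t end = TrimExternalBracketsFromWebLink(text, start, text.size() - 1);
  std::wcout << text.substr(start, end - start + 1) << std::endl;
  return 0;
}

Built with, say, clang++ -std=c++11 trim_sketch.cpp (the file name is only an example),
this prints http://www.abc.com. In the actual patch the returned |end| is not used
directly; it is handed to FindWebLinkEnding(), which then trims any remaining invalid
trailing characters.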