summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author    Henrique Nakashima <hnakashima@chromium.org>  2017-06-09 13:10:03 -0400
committer Chromium commit bot <commit-bot@chromium.org>  2017-06-12 13:53:05 +0000
commit    c6fd58d69bd8895137364c4a1b0bcd08a2ce8c46 (patch)
tree      a7fa59e0959c2db143149afe99fbabad69f527de
parent    b83d870b007f25b18e6b7a4ce2a417420d4dcb89 (diff)
download  pdfium-c6fd58d69bd8895137364c4a1b0bcd08a2ce8c46.tar.xz
Trimming brackets and quotes outside URLs. (tag: chromium/3129)
Bug: pdfium:655
Change-Id: Ibf4217b35b613d21d3e8e060608b502ef79acd9e
Reviewed-on: https://pdfium-review.googlesource.com/6392
Commit-Queue: Henrique Nakashima <hnakashima@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
-rw-r--r--  core/fpdftext/cpdf_linkextract.cpp           | 64
-rw-r--r--  core/fpdftext/cpdf_linkextract_unittest.cpp  | 46
2 files changed, 88 insertions(+), 22 deletions(-)
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 98bf915519..5179f732c9 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -15,12 +15,13 @@
namespace {
-// Find the end of a web link starting from offset |start|.
-// The purpose of this function is to separate url from the surrounding context
-// characters, we do not intend to fully validate the url.
-// |str| contains lower case characters only.
-FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) {
- FX_STRSIZE end = str.GetLength() - 1;
+// Find the end of a web link starting from offset |start| and ending at offset
+// |end|. The purpose of this function is to separate url from the surrounding
+// context characters, we do not intend to fully validate the url. |str|
+// contains lower case characters only.
+FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str,
+ FX_STRSIZE start,
+ FX_STRSIZE end) {
if (str.Find(L'/', start) != -1) {
// When there is a path and query after '/', most ASCII chars are allowed.
// We don't sanitize in this case.
@@ -59,6 +60,46 @@ FX_STRSIZE FindWebLinkEnding(const CFX_WideString& str, FX_STRSIZE start) {
return end;
}
+// Remove characters from the end of |str|, delimited by |start| and |end|, up
+// to and including |charToFind|. No-op if |charToFind| is not present. Updates
+// |end| if characters were removed.
+void TrimBackwardsToChar(const CFX_WideString& str,
+ wchar_t charToFind,
+ FX_STRSIZE start,
+ FX_STRSIZE* end) {
+ for (FX_STRSIZE pos = *end; pos >= start; pos--) {
+ if (str[pos] == charToFind) {
+ *end = pos - 1;
+ break;
+ }
+ }
+}
+
+// Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
+// |start| and |end| in |str|. Matches a closing bracket or quote for each
+// opening character and, if present, removes everything afterwards. Returns the
+// new end position for the string.
+FX_STRSIZE TrimExternalBracketsFromWebLink(const CFX_WideString& str,
+ FX_STRSIZE start,
+ FX_STRSIZE end) {
+ for (FX_STRSIZE pos = 0; pos < start; pos++) {
+ if (str[pos] == '(') {
+ TrimBackwardsToChar(str, ')', start, &end);
+ } else if (str[pos] == '[') {
+ TrimBackwardsToChar(str, ']', start, &end);
+ } else if (str[pos] == '{') {
+ TrimBackwardsToChar(str, '}', start, &end);
+ } else if (str[pos] == '<') {
+ TrimBackwardsToChar(str, '>', start, &end);
+ } else if (str[pos] == '"') {
+ TrimBackwardsToChar(str, '"', start, &end);
+ } else if (str[pos] == '\'') {
+ TrimBackwardsToChar(str, '\'', start, &end);
+ }
+ }
+ return end;
+}
+
} // namespace
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
@@ -162,8 +203,11 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck,
if (str[off] == L's') // "https" scheme is accepted.
off++;
if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
- FX_STRSIZE end = FindWebLinkEnding(str, off + 3);
- if (end > off + 3) { // Non-empty host name.
+ off += 3;
+ FX_STRSIZE end =
+ TrimExternalBracketsFromWebLink(str, start, str.GetLength() - 1);
+ end = FindWebLinkEnding(str, off, end);
+ if (end > off) { // Non-empty host name.
*nStart = start;
*nCount = end - start + 1;
*strBeCheck = strBeCheck->Mid(*nStart, *nCount);
@@ -176,7 +220,9 @@ bool CPDF_LinkExtract::CheckWebLink(CFX_WideString* strBeCheck,
// When there is no scheme, try to find url starting with "www.".
start = str.Find(kWWWAddrStart);
if (start != -1 && len > start + kWWWAddrStartLen) {
- FX_STRSIZE end = FindWebLinkEnding(str, start);
+ FX_STRSIZE end =
+ TrimExternalBracketsFromWebLink(str, start, str.GetLength() - 1);
+ end = FindWebLinkEnding(str, start, end);
if (end > start + kWWWAddrStartLen) {
*nStart = start;
*nCount = end - start + 1;
diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp
index de870b0cc1..efeb53ebe2 100644
--- a/core/fpdftext/cpdf_linkextract_unittest.cpp
+++ b/core/fpdftext/cpdf_linkextract_unittest.cpp
@@ -31,8 +31,9 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
L"fan@g..com" // Domain name should not have consecutive '.'
};
for (size_t i = 0; i < FX_ArraySize(invalid_strs); ++i) {
- CFX_WideString text_str(invalid_strs[i]);
- EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
+ const wchar_t* const input = invalid_strs[i];
+ CFX_WideString text_str(input);
+ EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << input;
}
// Check cases that can extract valid mail link.
@@ -51,10 +52,11 @@ TEST(CPDF_LinkExtractTest, CheckMailLink) {
{L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"}, // Keep the original case.
};
for (size_t i = 0; i < FX_ArraySize(valid_strs); ++i) {
- CFX_WideString text_str(valid_strs[i][0]);
+ const wchar_t* const input = valid_strs[i][0];
+ CFX_WideString text_str(input);
CFX_WideString expected_str(L"mailto:");
expected_str += valid_strs[i][1];
- EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << text_str.c_str();
+ EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << input;
EXPECT_STREQ(expected_str.c_str(), text_str.c_str());
}
}
@@ -77,13 +79,14 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
};
const int32_t DEFAULT_VALUE = -42;
for (size_t i = 0; i < FX_ArraySize(invalid_cases); ++i) {
- CFX_WideString text_str(invalid_cases[i]);
+ const wchar_t* const input = invalid_cases[i];
+ CFX_WideString text_str(input);
int32_t start_offset = DEFAULT_VALUE;
int32_t count = DEFAULT_VALUE;
EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count))
- << text_str.c_str();
- EXPECT_EQ(DEFAULT_VALUE, start_offset) << text_str.c_str();
- EXPECT_EQ(DEFAULT_VALUE, count) << text_str.c_str();
+ << input;
+ EXPECT_EQ(DEFAULT_VALUE, start_offset) << input;
+ EXPECT_EQ(DEFAULT_VALUE, count) << input;
}
// Check cases that can extract valid web link.
@@ -119,7 +122,23 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
{L"www.example.com;(", L"http://www.example.com", 0,
15}, // Trim ending invalid chars.
{L"test:www.abc.com", L"http://www.abc.com", 5,
- 11}, // Trim chars before URL.
+ 11}, // Trim chars before URL.
+ {L"(http://www.abc.com)", L"http://www.abc.com", 1,
+ 18}, // Trim external brackets.
+ {L"0(http://www.abc.com)0", L"http://www.abc.com", 2,
+ 18}, // Trim chars outside brackets as well.
+ {L"0(www.abc.com)0", L"http://www.abc.com", 2,
+ 11}, // Links without http should also have brackets trimmed.
+ {L"http://www.abc.com)0", L"http://www.abc.com)0", 0,
+ 20}, // Do not trim brackets that were not opened.
+ {L"{(<http://www.abc.com>)}", L"http://www.abc.com", 3,
+ 18}, // Trim chars with multiple levels of brackets.
+ {L"[http://www.abc.com/z(1)]", L"http://www.abc.com/z(1)", 1,
+ 23}, // Brackets opened inside the URL should not be trimmed.
+ {L"(http://www.abc.com/z(1))", L"http://www.abc.com/z(1)", 1,
+ 23}, // Brackets opened inside the URL should not be trimmed.
+ {L"\"http://www.abc.com\"", L"http://www.abc.com", 1,
+ 18}, // External quotes can also be escaped
{L"www.g.com..", L"http://www.g.com..", 0, 11}, // Leave ending periods.
// Web links can contain IP addresses too.
@@ -155,13 +174,14 @@ TEST(CPDF_LinkExtractTest, CheckWebLink) {
{L"www.测试.net;", L"http://www.测试.net;", 0, 11},
};
for (size_t i = 0; i < FX_ArraySize(valid_cases); ++i) {
- CFX_WideString text_str(valid_cases[i].input_string);
+ const wchar_t* const input = valid_cases[i].input_string;
+ CFX_WideString text_str(input);
int32_t start_offset = DEFAULT_VALUE;
int32_t count = DEFAULT_VALUE;
EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count))
- << text_str.c_str();
+ << input;
EXPECT_STREQ(valid_cases[i].url_extracted, text_str.c_str());
- EXPECT_EQ(valid_cases[i].start_offset, start_offset) << text_str.c_str();
- EXPECT_EQ(valid_cases[i].count, count) << text_str.c_str();
+ EXPECT_EQ(valid_cases[i].start_offset, start_offset) << input;
+ EXPECT_EQ(valid_cases[i].count, count) << input;
}
}