From 6c8ed646d1fcb8cce5a01c843c5149d989e6d5f0 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Fri, 19 May 2017 22:17:38 -0700 Subject: Better identify web links by trimming irrelevant chars Sometimes, web links are written with other text such as punctuation, which makes the extracted web links invalid. We improve this by trimming invalid chars at the end of host-name-only URLs. For example, host names never end with ';' or ','. BUG=chromium:720578 Change-Id: Id619025b2153531376d268a69a3a89c3d49fce08 Reviewed-on: https://pdfium-review.googlesource.com/5692 Commit-Queue: Wei Li Reviewed-by: Lei Zhang --- fpdfsdk/fpdftext_embeddertest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fpdfsdk/fpdftext_embeddertest.cpp') diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 3d496bc06f..65f5734122 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -382,7 +382,7 @@ TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) { EXPECT_TRUE(pagelink); static const char* const kExpectedUrls[] = { - "http://example.com?", // from "http://www.example.com?\r\nfoo" + "http://example.com", // from "http://www.example.com?\r\nfoo" "http://example.com/", // from "http://www.example.com/\r\nfoo" "http://example.com/test-foo", // from "http://example.com/test-\r\nfoo" "http://abc.com/test-foo", // from "http://abc.com/test-\r\n\r\nfoo" -- cgit v1.2.3