From 7630907c7ecbb700e4de287550dbed06f36fbe9e Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 16 Mar 2017 17:31:03 -0700 Subject: Handle web links across lines When a web link has a hyphen at the end of line, we consider it to be continued to the next line. For example, "http://www.abc.com/my-\r\ntest" should be extracted as "http://www.abc.com/my-test". BUG=pdfium:650 Change-Id: I64a93d9c66faf2be0abdaf8cfe8ee496c435d0ca Reviewed-on: https://pdfium-review.googlesource.com/3092 Commit-Queue: Wei Li Reviewed-by: Lei Zhang --- core/fpdftext/cpdf_linkextract.cpp | 31 ++++++++-- core/fpdftext/cpdf_textpage.h | 2 + fpdfsdk/fpdftext_embeddertest.cpp | 68 ++++++++++++++++++++++ testing/resources/bug_650.pdf | Bin 0 -> 85296 bytes testing/resources/weblinks_across_lines.in | 74 ++++++++++++++++++++++++ testing/resources/weblinks_across_lines.pdf | 84 ++++++++++++++++++++++++++++ 6 files changed, 254 insertions(+), 5 deletions(-) create mode 100644 testing/resources/bug_650.pdf create mode 100644 testing/resources/weblinks_across_lines.in create mode 100644 testing/resources/weblinks_across_lines.pdf diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 686b6a23b8..47d0754bd2 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -31,18 +31,36 @@ void CPDF_LinkExtract::ExtractLinks() { } void CPDF_LinkExtract::ParseLink() { - int start = 0, pos = 0; - int TotalChar = m_pTextPage->CountChars(); - while (pos < TotalChar) { + int start = 0; + int pos = 0; + int nTotalChar = m_pTextPage->CountChars(); + bool bAfterHyphen = false; + bool bLineBreak = false; + while (pos < nTotalChar) { FPDF_CHAR_INFO pageChar; m_pTextPage->GetCharInfo(pos, &pageChar); if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || - pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { + pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) { int nCount = pos - start; - if (pos == TotalChar - 1) + if (pos == nTotalChar - 1) { 
nCount++; + } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR || + pageChar.m_Unicode == TEXT_RETURN_CHAR)) { + // Handle text breaks with a hyphen to the next line. + bLineBreak = true; + pos++; + continue; + } CFX_WideString strBeCheck; strBeCheck = m_pTextPage->GetPageText(start, nCount); + if (bLineBreak) { + strBeCheck.Remove(TEXT_LINEFEED_CHAR); + strBeCheck.Remove(TEXT_RETURN_CHAR); + bLineBreak = false; + } + // Replace the generated code with the hyphen char. + strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN); + if (strBeCheck.GetLength() > 5) { while (strBeCheck.GetLength() > 0) { wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); @@ -60,6 +78,9 @@ void CPDF_LinkExtract::ParseLink() { } start = ++pos; } else { + bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN || + (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL && + pageChar.m_Unicode == TEXT_HYPHEN_CHAR)); pos++; } } diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index ebe58eb7ff..d7e29edf3b 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -34,10 +34,12 @@ class CPDF_TextObject; #define TEXT_SPACE_CHAR L' ' #define TEXT_LINEFEED_CHAR L'\n' #define TEXT_RETURN_CHAR L'\r' +#define TEXT_HYPHEN_CHAR L'-' #define TEXT_EMPTY L"" #define TEXT_SPACE L" " #define TEXT_RETURN_LINEFEED L"\r\n" #define TEXT_LINEFEED L"\n" +#define TEXT_HYPHEN L"-" #define TEXT_CHARRATIO_GAPDELTA 0.070 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay }; diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp index 198ef8a7f2..3d496bc06f 100644 --- a/fpdfsdk/fpdftext_embeddertest.cpp +++ b/fpdfsdk/fpdftext_embeddertest.cpp @@ -370,6 +370,74 @@ TEST_F(FPDFTextEmbeddertest, WebLinks) { UnloadPage(page); } +TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) { + EXPECT_TRUE(OpenDocument("weblinks_across_lines.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); 
+ EXPECT_TRUE(textpage); + + FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage); + EXPECT_TRUE(pagelink); + + static const char* const kExpectedUrls[] = { + "http://example.com?", // from "http://www.example.com?\r\nfoo" + "http://example.com/", // from "http://www.example.com/\r\nfoo" + "http://example.com/test-foo", // from "http://example.com/test-\r\nfoo" + "http://abc.com/test-foo", // from "http://abc.com/test-\r\n\r\nfoo" + // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/" + "http://example.com/", "http://www.abc.com", + }; + static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls)); + + EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink)); + + unsigned short fixed_buffer[128]; + for (int i = 0; i < kNumLinks; i++) { + const size_t expected_len = strlen(kExpectedUrls[i]) + 1; + memset(fixed_buffer, 0, FX_ArraySize(fixed_buffer)); + EXPECT_EQ(static_cast<int>(expected_len), + FPDFLink_GetURL(pagelink, i, nullptr, 0)); + EXPECT_EQ( + static_cast<int>(expected_len), + FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer))); + EXPECT_TRUE( + check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len)); + } + + FPDFLink_CloseWebLinks(pagelink); + FPDFText_ClosePage(textpage); + UnloadPage(page); +} + +TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) { + EXPECT_TRUE(OpenDocument("bug_650.pdf")); + FPDF_PAGE page = LoadPage(0); + EXPECT_TRUE(page); + + FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page); + EXPECT_TRUE(textpage); + + FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage); + EXPECT_TRUE(pagelink); + + EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink)); + unsigned short fixed_buffer[128] = {0}; + static const char kExpectedUrl[] = + "http://tutorial45.com/learn-autocad-basics-day-166/"; + static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl)); + + EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, nullptr, 0)); + EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, fixed_buffer, + 
FX_ArraySize(fixed_buffer))); + EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, fixed_buffer, kUrlSize)); + + FPDFLink_CloseWebLinks(pagelink); + FPDFText_ClosePage(textpage); + UnloadPage(page); +} + TEST_F(FPDFTextEmbeddertest, GetFontSize) { EXPECT_TRUE(OpenDocument("hello_world.pdf")); FPDF_PAGE page = LoadPage(0); diff --git a/testing/resources/bug_650.pdf b/testing/resources/bug_650.pdf new file mode 100644 index 0000000000..5e46032f6c Binary files /dev/null and b/testing/resources/bug_650.pdf differ diff --git a/testing/resources/weblinks_across_lines.in b/testing/resources/weblinks_across_lines.in new file mode 100644 index 0000000000..bb04b5e9cf --- /dev/null +++ b/testing/resources/weblinks_across_lines.in @@ -0,0 +1,74 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +{{object 2 0}} << + /Type /Pages + /MediaBox [ 0 0 600 600 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +{{object 3 0}} << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +{{object 4 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +{{object 5 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +{{object 6 0}} << +>> +stream +BT +/F1 12 Tf +50 50 Td +(Hello, world! This is not a link.) Tj +0 50 Td +(Is this http://example.com?) Tj +0 50 Td +(foo a link?) Tj +/F2 14 Tf +0 50 Td +(How about this http://example.com/) Tj +0 50 Td +(foo a link?) Tj +0 50 Td +(Is this http://example.com/test-) Tj +0 50 Td +(foo a link?) Tj +(Is this http://abc.com/test-) Tj +0 50 Td +0 50 Td +(foo a link?) Tj +0 50 Td +(And this http://example.com/) Tj +0 50 Td +(http://www.abc.com a link?) 
Tj +ET +endstream +endobj +{{xref}} +trailer << + /Size 6 + /Root 1 0 R +>> +{{startxref}} +%%EOF diff --git a/testing/resources/weblinks_across_lines.pdf b/testing/resources/weblinks_across_lines.pdf new file mode 100644 index 0000000000..e9327c4b34 --- /dev/null +++ b/testing/resources/weblinks_across_lines.pdf @@ -0,0 +1,84 @@ +%PDF-1.7 +% ò¤ô +1 0 obj << + /Type /Catalog + /Pages 2 0 R +>> +2 0 obj << + /Type /Pages + /MediaBox [ 0 0 600 600 ] + /Count 1 + /Kids [ 3 0 R ] +>> +endobj +3 0 obj << + /Type /Page + /Parent 2 0 R + /Resources << + /Font << + /F1 4 0 R + /F2 5 0 R + >> + >> + /Contents 6 0 R +>> +endobj +4 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +5 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +6 0 obj << +>> +stream +BT +/F1 12 Tf +50 50 Td +(Hello, world! This is not a link.) Tj +0 50 Td +(Is this http://example.com?) Tj +0 50 Td +(foo a link?) Tj +/F2 14 Tf +0 50 Td +(How about this http://example.com/) Tj +0 50 Td +(foo a link?) Tj +0 50 Td +(Is this http://example.com/test-) Tj +0 50 Td +(foo a link?) Tj +(Is this http://abc.com/test-) Tj +0 50 Td +0 50 Td +(foo a link?) Tj +0 50 Td +(And this http://example.com/) Tj +0 50 Td +(http://www.abc.com a link?) Tj +ET +endstream +endobj +xref +0 7 +0000000000 65535 f +0000000015 00000 n +0000000061 00000 n +0000000154 00000 n +0000000296 00000 n +0000000374 00000 n +0000000450 00000 n +trailer << + /Size 6 + /Root 1 0 R +>> +startxref +921 +%%EOF -- cgit v1.2.3