summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Wei Li <weili@chromium.org> 2017-03-16 17:31:03 -0700
committer Chromium commit bot <commit-bot@chromium.org> 2017-03-17 16:44:55 +0000
commit 7630907c7ecbb700e4de287550dbed06f36fbe9e (patch)
tree 26a8170173f394dc784f29a054af05723ed682ce
parent 78616574cedcb52cce8a25bd684bf9638a87de7a (diff)
download pdfium-7630907c7ecbb700e4de287550dbed06f36fbe9e.tar.xz
Handle web links across lines (chromium/3045)
When a web link has a hyphen at the end of a line, we consider it to be continued on the next line. For example, "http://www.abc.com/my-\r\ntest" should be extracted as "http://www.abc.com/my-test".

BUG=pdfium:650
Change-Id: I64a93d9c66faf2be0abdaf8cfe8ee496c435d0ca
Reviewed-on: https://pdfium-review.googlesource.com/3092
Commit-Queue: Wei Li <weili@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
-rw-r--r-- core/fpdftext/cpdf_linkextract.cpp 31
-rw-r--r-- core/fpdftext/cpdf_textpage.h 2
-rw-r--r-- fpdfsdk/fpdftext_embeddertest.cpp 68
-rw-r--r-- testing/resources/bug_650.pdf bin 0 -> 85296 bytes
-rw-r--r-- testing/resources/weblinks_across_lines.in 74
-rw-r--r-- testing/resources/weblinks_across_lines.pdf 84
6 files changed, 254 insertions, 5 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 686b6a23b8..47d0754bd2 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -31,18 +31,36 @@ void CPDF_LinkExtract::ExtractLinks() {
}
void CPDF_LinkExtract::ParseLink() {
- int start = 0, pos = 0;
- int TotalChar = m_pTextPage->CountChars();
- while (pos < TotalChar) {
+ int start = 0;
+ int pos = 0;
+ int nTotalChar = m_pTextPage->CountChars();
+ bool bAfterHyphen = false;
+ bool bLineBreak = false;
+ while (pos < nTotalChar) {
FPDF_CHAR_INFO pageChar;
m_pTextPage->GetCharInfo(pos, &pageChar);
if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
- pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
+ pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
int nCount = pos - start;
- if (pos == TotalChar - 1)
+ if (pos == nTotalChar - 1) {
nCount++;
+ } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
+ pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
+ // Handle text breaks with a hyphen to the next line.
+ bLineBreak = true;
+ pos++;
+ continue;
+ }
CFX_WideString strBeCheck;
strBeCheck = m_pTextPage->GetPageText(start, nCount);
+ if (bLineBreak) {
+ strBeCheck.Remove(TEXT_LINEFEED_CHAR);
+ strBeCheck.Remove(TEXT_RETURN_CHAR);
+ bLineBreak = false;
+ }
+ // Replace the generated code with the hyphen char.
+ strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
+
if (strBeCheck.GetLength() > 5) {
while (strBeCheck.GetLength() > 0) {
wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
@@ -60,6 +78,9 @@ void CPDF_LinkExtract::ParseLink() {
}
start = ++pos;
} else {
+ bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
+ (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
+ pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
pos++;
}
}
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index ebe58eb7ff..d7e29edf3b 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -34,10 +34,12 @@ class CPDF_TextObject;
#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
+#define TEXT_HYPHEN_CHAR L'-'
#define TEXT_EMPTY L""
#define TEXT_SPACE L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
+#define TEXT_HYPHEN L"-"
#define TEXT_CHARRATIO_GAPDELTA 0.070
enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 198ef8a7f2..3d496bc06f 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -370,6 +370,74 @@ TEST_F(FPDFTextEmbeddertest, WebLinks) {
UnloadPage(page);
}
+// Checks web links at line ends: only a trailing hyphen joins the next line.
+TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) {
+  EXPECT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  EXPECT_TRUE(page);
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
+  EXPECT_TRUE(pagelink);
+
+  static const char* const kExpectedUrls[] = {
+      "http://example.com?",  // from "http://example.com?\r\nfoo"
+      "http://example.com/",  // from "http://example.com/\r\nfoo"
+      "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
+      "http://abc.com/test-foo",  // from "http://abc.com/test-\r\n\r\nfoo"
+      // Next two links from "http://example.com/\r\nhttp://www.abc.com"
+      "http://example.com/", "http://www.abc.com",
+  };
+  static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls));
+
+  EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink));
+
+  unsigned short fixed_buffer[128];
+  for (int i = 0; i < kNumLinks; i++) {
+    const size_t expected_len = strlen(kExpectedUrls[i]) + 1;
+    // memset() counts bytes, so pass sizeof() to clear the entire array.
+    memset(fixed_buffer, 0, sizeof(fixed_buffer));
+    EXPECT_EQ(static_cast<int>(expected_len),
+              FPDFLink_GetURL(pagelink, i, nullptr, 0));
+    EXPECT_EQ(
+        static_cast<int>(expected_len),
+        FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer)));
+    EXPECT_TRUE(
+        check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len));
+  }
+
+  FPDFLink_CloseWebLinks(pagelink);
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
+
+// Regression test for pdfium:650: checks the second web link in bug_650.pdf.
+TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) {
+  static const char kExpectedUrl[] =
+      "http://tutorial45.com/learn-autocad-basics-day-166/";
+  static const int kExpectedSize = static_cast<int>(sizeof(kExpectedUrl));
+  static const int kLinkIndex = 1;
+
+  EXPECT_TRUE(OpenDocument("bug_650.pdf"));
+  FPDF_PAGE pdf_page = LoadPage(0);
+  EXPECT_TRUE(pdf_page);
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(pdf_page);
+  EXPECT_TRUE(text_page);
+  FPDF_PAGELINK web_links = FPDFLink_LoadWebLinks(text_page);
+  EXPECT_TRUE(web_links);
+  EXPECT_EQ(2, FPDFLink_CountWebLinks(web_links));
+
+  unsigned short url_buffer[128] = {};
+  EXPECT_EQ(kExpectedSize, FPDFLink_GetURL(web_links, kLinkIndex, nullptr, 0));
+  EXPECT_EQ(kExpectedSize, FPDFLink_GetURL(web_links, kLinkIndex, url_buffer,
+                                           FX_ArraySize(url_buffer)));
+  EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, url_buffer, kExpectedSize));
+
+  FPDFLink_CloseWebLinks(web_links);
+  FPDFText_ClosePage(text_page);
+  UnloadPage(pdf_page);
+}
+
TEST_F(FPDFTextEmbeddertest, GetFontSize) {
EXPECT_TRUE(OpenDocument("hello_world.pdf"));
FPDF_PAGE page = LoadPage(0);
diff --git a/testing/resources/bug_650.pdf b/testing/resources/bug_650.pdf
new file mode 100644
index 0000000000..5e46032f6c
--- /dev/null
+++ b/testing/resources/bug_650.pdf
Binary files differ
diff --git a/testing/resources/weblinks_across_lines.in b/testing/resources/weblinks_across_lines.in
new file mode 100644
index 0000000000..bb04b5e9cf
--- /dev/null
+++ b/testing/resources/weblinks_across_lines.in
@@ -0,0 +1,74 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+{{object 2 0}} <<
+ /Type /Pages
+ /MediaBox [ 0 0 600 600 ]
+ /Count 1
+ /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ /F2 5 0 R
+ >>
+ >>
+ /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+BT
+/F1 12 Tf
+50 50 Td
+(Hello, world! This is not a link.) Tj
+0 50 Td
+(Is this http://example.com?) Tj
+0 50 Td
+(foo a link?) Tj
+/F2 14 Tf
+0 50 Td
+(How about this http://example.com/) Tj
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(Is this http://example.com/test-) Tj
+0 50 Td
+(foo a link?) Tj
+(Is this http://abc.com/test-) Tj
+0 50 Td
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(And this http://example.com/) Tj
+0 50 Td
+(http://www.abc.com a link?) Tj
+ET
+endstream
+endobj
+{{xref}}
+trailer <<
+ /Size 6
+ /Root 1 0 R
+>>
+{{startxref}}
+%%EOF
diff --git a/testing/resources/weblinks_across_lines.pdf b/testing/resources/weblinks_across_lines.pdf
new file mode 100644
index 0000000000..e9327c4b34
--- /dev/null
+++ b/testing/resources/weblinks_across_lines.pdf
@@ -0,0 +1,84 @@
+%PDF-1.7
+%���
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+2 0 obj <<
+ /Type /Pages
+ /MediaBox [ 0 0 600 600 ]
+ /Count 1
+ /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ /F2 5 0 R
+ >>
+ >>
+ /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+BT
+/F1 12 Tf
+50 50 Td
+(Hello, world! This is not a link.) Tj
+0 50 Td
+(Is this http://example.com?) Tj
+0 50 Td
+(foo a link?) Tj
+/F2 14 Tf
+0 50 Td
+(How about this http://example.com/) Tj
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(Is this http://example.com/test-) Tj
+0 50 Td
+(foo a link?) Tj
+(Is this http://abc.com/test-) Tj
+0 50 Td
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(And this http://example.com/) Tj
+0 50 Td
+(http://www.abc.com a link?) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f
+0000000015 00000 n
+0000000061 00000 n
+0000000154 00000 n
+0000000296 00000 n
+0000000374 00000 n
+0000000450 00000 n
+trailer <<
+ /Size 6
+ /Root 1 0 R
+>>
+startxref
+921
+%%EOF