diff options
author | Wei Li <weili@chromium.org> | 2017-03-16 17:31:03 -0700 |
---|---|---|
committer | Chromium commit bot <commit-bot@chromium.org> | 2017-03-17 16:44:55 +0000 |
commit | 7630907c7ecbb700e4de287550dbed06f36fbe9e (patch) | |
tree | 26a8170173f394dc784f29a054af05723ed682ce /core/fpdftext | |
parent | 78616574cedcb52cce8a25bd684bf9638a87de7a (diff) | |
download | pdfium-7630907c7ecbb700e4de287550dbed06f36fbe9e.tar.xz |
Handle web links across lines chromium/3045
When a web link has a hyphen at the end of a line, we consider it to
be continued on the next line. For example, "http://www.abc.com/my-\r\ntest"
should be extracted as "http://www.abc.com/my-test".
BUG=pdfium:650
Change-Id: I64a93d9c66faf2be0abdaf8cfe8ee496c435d0ca
Reviewed-on: https://pdfium-review.googlesource.com/3092
Commit-Queue: Wei Li <weili@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Diffstat (limited to 'core/fpdftext')
-rw-r--r-- | core/fpdftext/cpdf_linkextract.cpp | 31 | ||||
-rw-r--r-- | core/fpdftext/cpdf_textpage.h | 2 |
2 files changed, 28 insertions, 5 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index 686b6a23b8..47d0754bd2 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp @@ -31,18 +31,36 @@ void CPDF_LinkExtract::ExtractLinks() { } void CPDF_LinkExtract::ParseLink() { - int start = 0, pos = 0; - int TotalChar = m_pTextPage->CountChars(); - while (pos < TotalChar) { + int start = 0; + int pos = 0; + int nTotalChar = m_pTextPage->CountChars(); + bool bAfterHyphen = false; + bool bLineBreak = false; + while (pos < nTotalChar) { FPDF_CHAR_INFO pageChar; m_pTextPage->GetCharInfo(pos, &pageChar); if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || - pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { + pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) { int nCount = pos - start; - if (pos == TotalChar - 1) + if (pos == nTotalChar - 1) { nCount++; + } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR || + pageChar.m_Unicode == TEXT_RETURN_CHAR)) { + // Handle text breaks with a hyphen to the next line. + bLineBreak = true; + pos++; + continue; + } CFX_WideString strBeCheck; strBeCheck = m_pTextPage->GetPageText(start, nCount); + if (bLineBreak) { + strBeCheck.Remove(TEXT_LINEFEED_CHAR); + strBeCheck.Remove(TEXT_RETURN_CHAR); + bLineBreak = false; + } + // Replace the generated code with the hyphen char. 
+ strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN); + if (strBeCheck.GetLength() > 5) { while (strBeCheck.GetLength() > 0) { wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); @@ -60,6 +78,9 @@ void CPDF_LinkExtract::ParseLink() { } start = ++pos; } else { + bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN || + (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL && + pageChar.m_Unicode == TEXT_HYPHEN_CHAR)); pos++; } } diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h index ebe58eb7ff..d7e29edf3b 100644 --- a/core/fpdftext/cpdf_textpage.h +++ b/core/fpdftext/cpdf_textpage.h @@ -34,10 +34,12 @@ class CPDF_TextObject; #define TEXT_SPACE_CHAR L' ' #define TEXT_LINEFEED_CHAR L'\n' #define TEXT_RETURN_CHAR L'\r' +#define TEXT_HYPHEN_CHAR L'-' #define TEXT_EMPTY L"" #define TEXT_SPACE L" " #define TEXT_RETURN_LINEFEED L"\r\n" #define TEXT_LINEFEED L"\n" +#define TEXT_HYPHEN L"-" #define TEXT_CHARRATIO_GAPDELTA 0.070 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay }; |