summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
author: Wei Li <weili@chromium.org> 2017-03-16 17:31:03 -0700
committer: Chromium commit bot <commit-bot@chromium.org> 2017-03-17 16:44:55 +0000
commit: 7630907c7ecbb700e4de287550dbed06f36fbe9e (patch)
tree: 26a8170173f394dc784f29a054af05723ed682ce /core
parent: 78616574cedcb52cce8a25bd684bf9638a87de7a (diff)
download: pdfium-chromium/3045.tar.xz
Handle web links across lines (chromium/3045)
When a web link has a hyphen at the end of a line, we consider it to be continued on the next line. For example, "http://www.abc.com/my-\r\ntest" should be extracted as "http://www.abc.com/my-test".

BUG=pdfium:650
Change-Id: I64a93d9c66faf2be0abdaf8cfe8ee496c435d0ca
Reviewed-on: https://pdfium-review.googlesource.com/3092
Commit-Queue: Wei Li <weili@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Diffstat (limited to 'core')
-rw-r--r-- core/fpdftext/cpdf_linkextract.cpp 31
-rw-r--r-- core/fpdftext/cpdf_textpage.h 2
2 files changed, 28 insertions, 5 deletions
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 686b6a23b8..47d0754bd2 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -31,18 +31,36 @@ void CPDF_LinkExtract::ExtractLinks() {
}
void CPDF_LinkExtract::ParseLink() {
- int start = 0, pos = 0;
- int TotalChar = m_pTextPage->CountChars();
- while (pos < TotalChar) {
+ int start = 0;
+ int pos = 0;
+ int nTotalChar = m_pTextPage->CountChars();
+ bool bAfterHyphen = false;
+ bool bLineBreak = false;
+ while (pos < nTotalChar) {
FPDF_CHAR_INFO pageChar;
m_pTextPage->GetCharInfo(pos, &pageChar);
if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
- pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
+ pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
int nCount = pos - start;
- if (pos == TotalChar - 1)
+ if (pos == nTotalChar - 1) {
nCount++;
+ } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
+ pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
+ // Handle text breaks with a hyphen to the next line.
+ bLineBreak = true;
+ pos++;
+ continue;
+ }
CFX_WideString strBeCheck;
strBeCheck = m_pTextPage->GetPageText(start, nCount);
+ if (bLineBreak) {
+ strBeCheck.Remove(TEXT_LINEFEED_CHAR);
+ strBeCheck.Remove(TEXT_RETURN_CHAR);
+ bLineBreak = false;
+ }
+ // Replace the generated code with the hyphen char.
+ strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
+
if (strBeCheck.GetLength() > 5) {
while (strBeCheck.GetLength() > 0) {
wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
@@ -60,6 +78,9 @@ void CPDF_LinkExtract::ParseLink() {
}
start = ++pos;
} else {
+ bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
+ (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
+ pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
pos++;
}
}
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index ebe58eb7ff..d7e29edf3b 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -34,10 +34,12 @@ class CPDF_TextObject;
#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
+#define TEXT_HYPHEN_CHAR L'-'
#define TEXT_EMPTY L""
#define TEXT_SPACE L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
+#define TEXT_HYPHEN L"-"
#define TEXT_CHARRATIO_GAPDELTA 0.070
enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };