summaryrefslogtreecommitdiff
path: root/core/fpdftext
diff options
context:
space:
mode:
authorRyan Harrison <rharrison@chromium.org>2018-08-28 20:22:32 +0000
committerChromium commit bot <commit-bot@chromium.org>2018-08-28 20:22:32 +0000
commit4d92af5ace491a2e87a6c73e5afa9ed53ac86bd0 (patch)
tree16bd5583a1d4650d4e757f0f8abe02ed045a70a1 /core/fpdftext
parent3321f15291b3fff3ebd6ef41bf48d883c8b78352 (diff)
downloadpdfium-4d92af5ace491a2e87a6c73e5afa9ed53ac86bd0.tar.xz
Handle non-printing characters at beginning of extraction region
Currently, if a text extraction region begins on a non-printing character, then "" will be returned. This is incorrect behaviour; instead, the call should scan ahead until a printing character is found and start extracting from there. This change also proactively adds a similar check and scan back for the end of the extraction region. BUG=pdfium:1139 Change-Id: Ia2001ac89740f3d31d2bb69e8000773f8b01091b Reviewed-on: https://pdfium-review.googlesource.com/41532 Reviewed-by: Henrique Nakashima <hnakashima@chromium.org> Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Diffstat (limited to 'core/fpdftext')
-rw-r--r--core/fpdftext/cpdf_textpage.cpp29
1 files changed, 25 insertions, 4 deletions
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 5465297263..8c110b6814 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -510,15 +510,36 @@ WideString CPDF_TextPage::GetPageText(int start, int count) const {
return L"";
}
+ const int count_chars = CountChars();
int text_start = TextIndexFromCharIndex(start);
- if (text_start < 0)
- return L"";
- count = std::min(count, CountChars() - start);
+ // If the character at |start| is a non-printing character, then
+ // TextIndexFromCharIndex will return -1, so scan ahead to the first printing
+ // character.
+ while (text_start < 0) {
+ if (start >= count_chars)
+ return L"";
+ start++;
+ text_start = TextIndexFromCharIndex(start);
+ }
+
+ count = std::min(count, count_chars - start);
int last = start + count - 1;
int text_last = TextIndexFromCharIndex(last);
- if (text_last < 0 || text_last < text_start)
+
+ // If the character at |last| is a non-printing character, then
+ // TextIndexFromCharIndex will return -1, so scan back to the last printing
+ // character.
+ while (text_last < 0) {
+ if (last < text_start)
+ return L"";
+
+ last--;
+ text_last = TextIndexFromCharIndex(last);
+ }
+
+ if (text_last < text_start)
return L"";
int text_count = text_last - text_start + 1;