From 0f1ca17e45bee4f1cb04d33437eba010b1afcd72 Mon Sep 17 00:00:00 2001 From: Ryan Harrison Date: Tue, 28 Nov 2017 18:19:57 +0000 Subject: Convert from character stream to visible text indices in GetText Most of the FPDFText API methods operate on indices in terms of the underlying stream of characters. This stream can include non-printing control characters, which are not part of the visible text. The majority of files do not appear to have these hidden characters, so for those files there is a 1:1 correspondence between character-stream indices and visible-text indices. When hidden characters are present, the indices must be converted to make sure that GetText doesn't attempt to retrieve text for a span that is out of range. BUG=chromium:788103,chromium:788220 Change-Id: I4c9fa403ea65754ba94e3f15ded49fe0641e9db5 Reviewed-on: https://pdfium-review.googlesource.com/19550 Reviewed-by: dsinclair Commit-Queue: dsinclair --- fpdfsdk/fpdftext.cpp | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp index 9d7d56311f..022b7caa8d 100644 --- a/fpdfsdk/fpdftext.cpp +++ b/fpdfsdk/fpdftext.cpp @@ -160,27 +160,51 @@ FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, static_cast(yTolerance))); } -FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page, - int start, - int count, +FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE page, + int char_start, + int char_count, unsigned short* result) { - if (start < 0 || count < 0 || !result || !text_page) + if (char_start < 0 || char_count < 0 || !result || !page) return 0; - CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page); - if (start >= textpage->CountChars()) + CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page); + if (char_start >= textpage->CountChars()) + return 0; + + char_count = std::min(char_count, textpage->CountChars() - char_start); + if (char_count == 0) { + // Writing out "" + *result = 0; + return 1; + } + + int char_last = char_start + char_count - 
1; + + // char_* values are for a data structure that includes non-printing unicode + // characters, where the text_* values are from a data structure that doesn't + // include these characters, so translation is needed. + int text_start = textpage->TextIndexFromCharIndex(char_start); + if (text_start == -1) + return 0; + + int text_last = textpage->TextIndexFromCharIndex(char_last); + if (text_last == -1) + return 0; + + if (text_start > text_last) return 0; + int text_count = text_last - text_start + 1; - WideString str = textpage->GetPageText(start, count); - if (str.GetLength() > static_cast(count)) - str = str.Left(static_cast(count)); + WideString str = textpage->GetPageText(text_start, text_count); + if (str.GetLength() > static_cast(text_count)) + str = str.Left(static_cast(text_count)); // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected // the number of items to stay the same. ByteString byte_str = str.UTF16LE_Encode(); ASSERT((byte_str.GetLength()) / kBytesPerCharacter <= - static_cast(count + 1)); // +1 to account for the string - // terminator + static_cast(char_count + 1)); // +1 to account for the string + // terminator memcpy(result, byte_str.GetBuffer(byte_str.GetLength()), byte_str.GetLength()); return (byte_str.GetLength() / kBytesPerCharacter); -- cgit v1.2.3