From 49fa50d7e922746c02f7b70c8436466d7f62696a Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 8 Jun 2018 15:31:10 +0000 Subject: Use FPDFText_GetBoundedText() to get the visible text in a test. Add a test PDF with multiple pages, each with a different media box and crop box. Demonstrate how FPDFText_GetText() gets all the text on the page, and how FPDFText_GetBoundedText() with the right bounding boxes gets only the visible text on the page. Also fix a small nit in CPDF_TextPage::GetTextByRect() found while writing this CL. BUG=pdfium:387 Change-Id: I9ce4bb181e2ba5b454ea1341bbccef9ba94c9cd8 Reviewed-on: https://pdfium-review.googlesource.com/34550 Commit-Queue: Ryan Harrison Reviewed-by: Ryan Harrison --- core/fpdftext/cpdf_textpage.cpp | 6 +- fpdfsdk/fpdf_text_embeddertest.cpp | 61 +++++++++++++++++++- testing/resources/cropped_text.in | 98 +++++++++++++++++++++++++++++++ testing/resources/cropped_text.pdf | 114 +++++++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 4 deletions(-) create mode 100644 testing/resources/cropped_text.in create mode 100644 testing/resources/cropped_text.pdf diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index dae973bea9..60e574558b 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp @@ -446,9 +446,9 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { IsAddLineFeed = false; if (charinfo.m_Unicode) strText += charinfo.m_Unicode; - } else if (charinfo.m_Unicode == 32) { - if (IsContainPreChar && charinfo.m_Unicode) { - strText += charinfo.m_Unicode; + } else if (charinfo.m_Unicode == L' ') { + if (IsContainPreChar) { + strText += L' '; IsContainPreChar = false; IsAddLineFeed = false; } diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp index c7ad8250df..112991f896 100644 --- a/fpdfsdk/fpdf_text_embeddertest.cpp +++ b/fpdfsdk/fpdf_text_embeddertest.cpp @@ -2,10 +2,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#include #include +#include #include "core/fxcrt/fx_memory.h" #include "public/fpdf_text.h" +#include "public/fpdf_transformpage.h" #include "public/fpdfview.h" #include "testing/embedder_test.h" #include "testing/gtest/include/gtest/gtest.h" @@ -159,7 +162,8 @@ TEST_F(FPDFTextEmbeddertest, Text) { EXPECT_EQ(0.0, bottom); EXPECT_EQ(0.0, top); - EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, 0, 0)); + EXPECT_EQ( + 9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, nullptr, 0)); // Extract starting at character 4 as above. memset(buffer, 0xbd, sizeof(buffer)); @@ -751,3 +755,58 @@ TEST_F(FPDFTextEmbeddertest, CountRects) { FPDFText_ClosePage(textpage); UnloadPage(page); } + +TEST_F(FPDFTextEmbeddertest, CroppedText) { + static constexpr int kPageCount = 4; + static constexpr FS_RECTF kBoxes[kPageCount] = { + {50.0f, 150.0f, 150.0f, 50.0f}, + {50.0f, 150.0f, 150.0f, 50.0f}, + {60.0f, 150.0f, 150.0f, 60.0f}, + {60.0f, 150.0f, 150.0f, 60.0f}, + }; + static constexpr const char* kExpectedText[kPageCount] = { + " world!\r\ndbye, world!", " world!\r\ndbye, world!", "bye, world!", + "bye, world!", + }; + + ASSERT_TRUE(OpenDocument("cropped_text.pdf")); + ASSERT_EQ(kPageCount, FPDF_GetPageCount(document())); + + for (int i = 0; i < kPageCount; ++i) { + FPDF_PAGE page = LoadPage(i); + ASSERT_TRUE(page); + + FS_RECTF box; + EXPECT_TRUE(FPDF_GetPageBoundingBox(page, &box)); + EXPECT_EQ(kBoxes[i].left, box.left); + EXPECT_EQ(kBoxes[i].top, box.top); + EXPECT_EQ(kBoxes[i].right, box.right); + EXPECT_EQ(kBoxes[i].bottom, box.bottom); + + { + ScopedFPDFTextPage textpage(FPDFText_LoadPage(page)); + ASSERT_TRUE(textpage); + + unsigned short buffer[128]; + memset(buffer, 0xbd, sizeof(buffer)); + int num_chars = FPDFText_GetText(textpage.get(), 0, 128, buffer); + ASSERT_EQ(kHelloGoodbyeTextSize, num_chars); + EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer, + kHelloGoodbyeTextSize)); + + int expected_char_count = strlen(kExpectedText[i]); + ASSERT_EQ(expected_char_count, + FPDFText_GetBoundedText(textpage.get(), box.left, box.top, + box.right, box.bottom, nullptr, 0)); + + memset(buffer, 0xbd, sizeof(buffer)); + ASSERT_EQ(expected_char_count + 1, + FPDFText_GetBoundedText(textpage.get(), box.left, box.top, + box.right, box.bottom, buffer, 128)); + EXPECT_TRUE( + check_unsigned_shorts(kExpectedText[i], buffer, expected_char_count)); + } + + UnloadPage(page); + } +} diff --git a/testing/resources/cropped_text.in b/testing/resources/cropped_text.in new file mode 100644 index 0000000000..c8632de112 --- /dev/null +++ b/testing/resources/cropped_text.in @@ -0,0 +1,98 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +endobj +{{object 2 0}} << + /Type /Pages + /Count 4 + /Kids [ 6 0 R 7 0 R 8 0 R 9 0 R ] +>> +endobj +{{object 3 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +{{object 4 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +{{object 5 0}} << + {{streamlen}} +>> +stream +BT +20 50 Td +/F1 12 Tf +(Hello, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) Tj +ET +endstream +endobj +{{object 6 0}} << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 50 50 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +{{object 7 0}} << + /Type /Page + /Parent 2 0 R + /MediaBox [ -50 -50 200 200 ] + /CropBox [ 50 50 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +{{object 8 0}} << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 60 60 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +{{object 9 0}} << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 150 150 60 60 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +{{xref}} +{{trailer}} +{{startxref}} +%%EOF diff --git a/testing/resources/cropped_text.pdf b/testing/resources/cropped_text.pdf new file mode 100644 index 0000000000..02d50c6545 --- /dev/null +++ b/testing/resources/cropped_text.pdf @@ -0,0 +1,114 @@ +%PDF-1.7 +% ò¤ô +1 0 obj << + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj << + /Type /Pages + /Count 4 + /Kids [ 6 0 R 7 0 R 8 0 R 9 0 R ] +>> +endobj +3 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Times-Roman +>> +endobj +4 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +5 0 obj << + /Length 83 +>> +stream +BT +20 50 Td +/F1 12 Tf +(Hello, world!) Tj +0 50 Td +/F2 16 Tf +(Goodbye, world!) Tj +ET +endstream +endobj +6 0 obj << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 50 50 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +7 0 obj << + /Type /Page + /Parent 2 0 R + /MediaBox [ -50 -50 200 200 ] + /CropBox [ 50 50 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +8 0 obj << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 60 60 150 150 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +9 0 obj << + /Type /Page + /Parent 2 0 R + /MediaBox [ 0 0 200 200 ] + /CropBox [ 150 150 60 60 ] + /Resources << + /Font << + /F1 3 0 R + /F2 4 0 R + >> + >> + /Contents 5 0 R +>> +endobj +xref +0 10 +0000000000 65535 f +0000000015 00000 n +0000000068 00000 n +0000000151 00000 n +0000000229 00000 n +0000000305 00000 n +0000000439 00000 n +0000000638 00000 n +0000000841 00000 n +0000001040 00000 n +trailer << + /Root 1 0 R + /Size 10 +>> +startxref +1239 +%%EOF -- cgit v1.2.3