From 84d3394d88c42b798eedc938e6295ad1bf28ac66 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Wed, 25 Jul 2018 02:47:25 +0000 Subject: Fix hint tables parsing. Sample PDF: https://yadi.sk/d/oWLtAEfy3YbEb3 For offsets equal to the hint stream offset, add the hint stream length to determine the actual offset, because linearization inserts the hint stream at the original location of the object. Also, the number of bits needed to represent the numerator of the fractional position for each shared object reference may be zero if each shared group contains only one object, with the object number incremented by 1. Change-Id: I4754d603f388354821e8d0cac97ad99a7578fe4b Reviewed-on: https://pdfium-review.googlesource.com/36610 Commit-Queue: Art Snake Reviewed-by: Lei Zhang --- core/fpdfapi/parser/cpdf_hint_tables.cpp | 24 +++++--- core/fpdfapi/parser/cpdf_hint_tables.h | 2 + core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp | 64 ++++++++++++++++++++-- testing/resources/hint_table_102p.bin | Bin 0 -> 3797 bytes 4 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 testing/resources/hint_table_102p.bin diff --git a/core/fpdfapi/parser/cpdf_hint_tables.cpp b/core/fpdfapi/parser/cpdf_hint_tables.cpp index 71a6d3688e..04e673bc97 100644 --- a/core/fpdfapi/parser/cpdf_hint_tables.cpp +++ b/core/fpdfapi/parser/cpdf_hint_tables.cpp @@ -117,7 +117,7 @@ bool CPDF_HintTables::ReadPageHintTable(CFX_BitStream* hStream) { // shared object referenced from a page, there is an indication of // where in the page's content stream the object is first referenced. const uint32_t dwSharedNumeratorBits = hStream->GetBits(16); - if (!IsValidPageOffsetHintTableBitCount(dwSharedNumeratorBits)) + if (dwSharedNumeratorBits > 32) return false; // Item 13: Skip Item 13 which has 16 bits. 
@@ -193,15 +193,17 @@ bool CPDF_HintTables::ReadPageHintTable(CFX_BitStream* hStream) { } hStream->ByteAlign(); - for (uint32_t i = 0; i < nPages; i++) { - FX_SAFE_UINT32 safeSize = dwNSharedObjsArray[i]; - safeSize *= dwSharedNumeratorBits; - if (!CanReadFromBitStream(hStream, safeSize)) - return false; + if (dwSharedNumeratorBits) { + for (uint32_t i = 0; i < nPages; i++) { + FX_SAFE_UINT32 safeSize = dwNSharedObjsArray[i]; + safeSize *= dwSharedNumeratorBits; + if (!CanReadFromBitStream(hStream, safeSize)) + return false; - hStream->SkipBits(safeSize.ValueOrDie()); + hStream->SkipBits(safeSize.ValueOrDie()); + } + hStream->ByteAlign(); } - hStream->ByteAlign(); FX_SAFE_UINT32 safeTotalPageLen = nPages; safeTotalPageLen *= dwDeltaPageLenBits; @@ -403,7 +405,11 @@ FX_FILESIZE CPDF_HintTables::HintsOffsetToFileOffset( // offset shall have the hint stream length added to it to determine the // actual offset relative to the beginning of the file. // See specification PDF 32000-1:2008 Annex F.4 (Hint tables). - if (file_offset.ValueOrDie() > m_pLinearized->GetHintStart()) + // Note: The PDF spec does not mention this, but positions equal to the hint + // stream offset also need to have the hint stream length added to them, e.g. + // there exist linearized PDFs generated by Adobe software that have this + // property. 
+ if (file_offset.ValueOrDie() >= m_pLinearized->GetHintStart()) file_offset += m_pLinearized->GetHintLength(); return file_offset.ValueOrDefault(0); diff --git a/core/fpdfapi/parser/cpdf_hint_tables.h b/core/fpdfapi/parser/cpdf_hint_tables.h index a161dc68f0..5b978f99b2 100644 --- a/core/fpdfapi/parser/cpdf_hint_tables.h +++ b/core/fpdfapi/parser/cpdf_hint_tables.h @@ -83,6 +83,8 @@ class CPDF_HintTables { return m_SharedObjGroupInfos; } + FX_FILESIZE GetFirstPageObjOffset() const { return m_szFirstPageObjOffset; } + protected: bool ReadPageHintTable(CFX_BitStream* hStream); bool ReadSharedObjHintTable(CFX_BitStream* hStream, uint32_t offset); diff --git a/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp b/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp index af0e9ff745..8a7331d29b 100644 --- a/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp +++ b/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp @@ -10,9 +10,14 @@ #include "core/fpdfapi/cpdf_modulemgr.h" #include "core/fpdfapi/parser/cpdf_data_avail.h" +#include "core/fpdfapi/parser/cpdf_dictionary.h" +#include "core/fpdfapi/parser/cpdf_linearized_header.h" #include "core/fpdfapi/parser/cpdf_object.h" +#include "core/fpdfapi/parser/cpdf_read_validator.h" +#include "core/fpdfapi/parser/cpdf_stream.h" #include "core/fpdfapi/parser/cpdf_syntax_parser.h" #include "core/fxcrt/fx_stream.h" +#include "testing/fx_string_testhelpers.h" #include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" #include "testing/utils/path_service.h" @@ -20,16 +25,42 @@ namespace { -std::unique_ptr MakeDataAvailFromFile( +RetainPtr MakeValidatorFromFile( const std::string& file_name) { std::string file_path; - if (!PathService::GetTestFilePath(file_name, &file_path)) - return nullptr; + PathService::GetTestFilePath(file_name, &file_path); + ASSERT(!file_path.empty()); + return pdfium::MakeRetain( + IFX_SeekableReadStream::CreateFromFilename(file_path.c_str()), nullptr); +} + +std::unique_ptr 
MakeDataAvailFromFile( + const std::string& file_name) { return pdfium::MakeUnique( - nullptr, IFX_SeekableReadStream::CreateFromFilename(file_path.c_str()), - true); + nullptr, MakeValidatorFromFile(file_name), true); } +class TestLinearizedHeader : public CPDF_LinearizedHeader { + public: + TestLinearizedHeader(const CPDF_Dictionary* pDict, + FX_FILESIZE szLastXRefOffset) + : CPDF_LinearizedHeader(pDict, szLastXRefOffset) {} + + static std::unique_ptr MakeHeader( + const std::string& inline_data) { + CPDF_SyntaxParser parser; + parser.InitParser( + pdfium::MakeRetain( + reinterpret_cast(inline_data.data()), + inline_data.size()), + 0); + std::unique_ptr dict = + ToDictionary(parser.GetObjectBody(nullptr)); + ASSERT(dict); + return pdfium::MakeUnique(dict.get(), 0); + } +}; + } // namespace class CPDF_HintTablesTest : public testing::Test { @@ -119,3 +150,26 @@ TEST_F(CPDF_HintTablesTest, PageAndGroupInfos) { EXPECT_EQ(10939, hint_tables->SharedGroupInfos()[5].m_szOffset); EXPECT_EQ(544u, hint_tables->SharedGroupInfos()[5].m_dwLength); } + +TEST_F(CPDF_HintTablesTest, FirstPageOffset) { + // Test that a valid hint table is loaded and has the correct offset of the + // first page object. + const auto linearized_header = TestLinearizedHeader::MakeHeader( + "<< /Linearized 1 /L 19326762 /H [ 123730 3816 ] /O 5932 /E 639518 /N " + "102 /T 19220281 >>"); + ASSERT_TRUE(linearized_header); + // This hint table was extracted from a linearized file generated by qpdf. + RetainPtr validator = + MakeValidatorFromFile("hint_table_102p.bin"); + CPDF_SyntaxParser parser; + parser.InitParserWithValidator(validator, 0); + std::unique_ptr stream = ToStream(parser.GetObjectBody(nullptr)); + ASSERT_TRUE(stream); + auto hint_tables = pdfium::MakeUnique( + validator.Get(), linearized_header.get()); + // Check that the hint table loads. + ASSERT_TRUE(hint_tables->LoadHintStream(stream.get())); + // Check that the hint table has the correct first page offset. 
+ // 127546 is the actual value taken from the original file. + EXPECT_EQ(127546, hint_tables->GetFirstPageObjOffset()); +} diff --git a/testing/resources/hint_table_102p.bin b/testing/resources/hint_table_102p.bin new file mode 100644 index 0000000000..4008b0daca Binary files /dev/null and b/testing/resources/hint_table_102p.bin differ -- cgit v1.2.3