From a5fc8975c865dc3cc90de8ff46ca13fb46c13391 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Mon, 2 Oct 2017 22:08:44 +0300 Subject: Unify parsing of cross ref table v4. We can use the 'Prev' value of the first-page cross-reference table trailer to load the main cross-reference table, instead of the 'T' value of the Linearized header (offset of the first entry in the main cross-reference table). This is a better solution because it allows us to check the entry count in the main cross-ref table and to unify loading of the main cross-ref table with the loading methods of a non-linearized document. See the PDF specification: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf#678 (page 678, Example 3/part 3) Change-Id: I59dcf3c73a0fb561221ded78e827e40535dbd717 Reviewed-on: https://pdfium-review.googlesource.com/13810 Commit-Queue: Art Snake Reviewed-by: dsinclair --- core/fpdfapi/parser/cpdf_data_avail.cpp | 13 +++++- core/fpdfapi/parser/cpdf_parser.cpp | 58 ++++++------------------ core/fpdfapi/parser/cpdf_parser.h | 4 +- core/fpdfapi/parser/cpdf_parser_embeddertest.cpp | 16 +++++++ 4 files changed, 43 insertions(+), 48 deletions(-) diff --git a/core/fpdfapi/parser/cpdf_data_avail.cpp b/core/fpdfapi/parser/cpdf_data_avail.cpp index 38e857f22f..91fc8c1087 100644 --- a/core/fpdfapi/parser/cpdf_data_avail.cpp +++ b/core/fpdfapi/parser/cpdf_data_avail.cpp @@ -1171,13 +1171,22 @@ CPDF_DataAvail::DocAvailStatus CPDF_DataAvail::CheckLinearizedData() { return DataError; if (!m_bMainXRefLoadTried) { + ASSERT(m_pDocument->GetParser()->GetTrailer()); + const FX_SAFE_FILESIZE main_xref_offset = + m_pDocument->GetParser()->GetTrailer()->GetIntegerFor("Prev"); + if (!main_xref_offset.IsValid()) + return DataError; + + if (main_xref_offset.ValueOrDie() == 0) + return DataAvailable; + FX_SAFE_SIZE_T data_size = m_dwFileLen; - data_size -= m_pLinearized->GetLastXRefOffset(); + data_size -= main_xref_offset.ValueOrDie(); if (!data_size.IsValid()) return DataError; if 
(!GetValidator()->CheckDataRangeAndRequestIfUnavailable( - m_pLinearized->GetLastXRefOffset(), data_size.ValueOrDie())) + main_xref_offset.ValueOrDie(), data_size.ValueOrDie())) return DataNotAvailable; CPDF_Parser::Error eRet = diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index c443c8baf2..b6231a7a6a 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -102,8 +102,7 @@ CPDF_Parser::CPDF_Parser() m_bXRefStream(false), m_FileVersion(0), m_pEncryptDict(nullptr), - m_TrailerData(pdfium::MakeUnique()), - m_dwLinearizedFirstPageXRefStartObjNum(0) {} + m_TrailerData(pdfium::MakeUnique()) {} CPDF_Parser::~CPDF_Parser() { ReleaseEncryptHandler(); @@ -421,9 +420,8 @@ bool CPDF_Parser::LoadAllCrossRefV4(FX_FILESIZE xrefpos) { return true; } -bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos, - uint32_t dwObjCount) { - if (!LoadLinearizedCrossRefV4(xrefpos, dwObjCount)) +bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos) { + if (!LoadCrossRefV4(xrefpos, false)) return false; std::unique_ptr trailer = LoadTrailerV4(); @@ -477,18 +475,6 @@ bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos, return true; } -bool CPDF_Parser::LoadLinearizedCrossRefV4(FX_FILESIZE pos, - uint32_t dwObjCount) { - FX_FILESIZE dwStartPos = pos - m_pSyntax->m_HeaderOffset; - - m_pSyntax->SetPos(dwStartPos); - std::vector objects; - if (!ParseAndAppendCrossRefSubsectionData(0, dwObjCount, &objects)) - return false; - MergeCrossRefObjectsData(objects); - return true; -} - bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData( uint32_t start_objnum, uint32_t count, @@ -1318,9 +1304,10 @@ bool CPDF_Parser::ParseLinearizedHeader() { if (!m_pLinearized) return false; - m_LastXRefOffset = m_pLinearized->GetLastXRefOffset(); // Move parser onto first page xref table start. 
m_pSyntax->GetNextWord(nullptr); + + m_LastXRefOffset = m_pSyntax->GetPos(); return true; } @@ -1340,7 +1327,7 @@ CPDF_Parser::Error CPDF_Parser::StartLinearizedParse( m_bHasParsed = true; m_pDocument = pDocument; - FX_FILESIZE dwFirstXRefOffset = m_pSyntax->GetPos(); + FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset; bool bXRefRebuilt = false; bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, false); if (!bLoadV4 && !LoadCrossRefV5(&dwFirstXRefOffset, true)) { @@ -1350,8 +1337,6 @@ CPDF_Parser::Error CPDF_Parser::StartLinearizedParse( bXRefRebuilt = true; m_LastXRefOffset = 0; } - m_dwLinearizedFirstPageXRefStartObjNum = - m_ObjectInfo.empty() ? 0 : m_ObjectInfo.begin()->first; if (bLoadV4) { std::unique_ptr trailer = LoadTrailerV4(); if (!trailer) @@ -1423,33 +1408,20 @@ bool CPDF_Parser::LoadLinearizedAllCrossRefV5(FX_FILESIZE xrefpos) { } CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() { + const FX_SAFE_FILESIZE main_xref_offset = GetTrailer()->GetIntegerFor("Prev"); + if (!main_xref_offset.IsValid()) + return FORMAT_ERROR; + + if (main_xref_offset.ValueOrDie() == 0) + return SUCCESS; + const AutoRestorer save_metadata_objnum(&m_MetadataObjnum); m_MetadataObjnum = 0; - m_pSyntax->SetPos(m_LastXRefOffset - m_pSyntax->m_HeaderOffset); - - uint8_t ch = 0; - uint32_t dwCount = 0; - m_pSyntax->GetNextChar(ch); - while (PDFCharIsWhitespace(ch)) { - ++dwCount; - if (m_pSyntax->m_FileLen <= - (FX_FILESIZE)(m_pSyntax->GetPos() + m_pSyntax->m_HeaderOffset)) { - break; - } - if (!m_pSyntax->GetNextChar(ch)) - return HANDLER_ERROR; - } - m_LastXRefOffset += dwCount; m_ObjectStreamMap.clear(); m_ObjCache.clear(); - // In linearized document, the main cross ref always should start from 0 - // objnum. - // And should have count equals to first obj number of first page cross ref - // table. 
- if (!LoadLinearizedAllCrossRefV4(m_LastXRefOffset, - m_dwLinearizedFirstPageXRefStartObjNum) && - !LoadLinearizedAllCrossRefV5(m_LastXRefOffset)) { + if (!LoadLinearizedAllCrossRefV4(main_xref_offset.ValueOrDie()) && + !LoadLinearizedAllCrossRefV5(main_xref_offset.ValueOrDie())) { m_LastXRefOffset = 0; return FORMAT_ERROR; } diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index 7c6a75d8c5..96bd6f43a9 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -162,8 +162,7 @@ class CPDF_Parser { std::unique_ptr LoadTrailerV4(); Error SetEncryptHandler(); void ReleaseEncryptHandler(); - bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount); - bool LoadLinearizedCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount); + bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos); bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos); Error LoadLinearizedMainXRefTable(); RetainPtr GetObjectStream(uint32_t number); @@ -207,7 +206,6 @@ class CPDF_Parser { ByteString m_Password; std::unique_ptr m_TrailerData; std::unique_ptr m_pLinearized; - uint32_t m_dwLinearizedFirstPageXRefStartObjNum; // A map of object numbers to indirect streams. std::map> m_ObjectStreamMap; diff --git a/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp b/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp index 99bc2c2d42..6aa3e2785a 100644 --- a/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp +++ b/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp @@ -57,3 +57,19 @@ TEST_F(CPDFParserEmbeddertest, Bug_602650) { TEST_F(CPDFParserEmbeddertest, Bug_757705) { EXPECT_TRUE(OpenDocument("bug_757705.pdf")); } + +TEST_F(CPDFParserEmbeddertest, LoadMainCrossRefTable) { + EXPECT_TRUE(OpenDocument("feature_linearized_loading.pdf", nullptr, true)); + // To check that the main cross ref table is loaded correctly, it is enough + // to check that the second page was correctly loaded, because the main + // table contains the crossrefs for the second page. 
+ EXPECT_EQ(2, GetPageCount()); + FPDF_PAGE page = LoadPage(1); + EXPECT_NE(nullptr, page); + FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page); + EXPECT_NE(nullptr, text_page); + // The page should not be blank. + EXPECT_LT(0, FPDFText_CountChars(text_page)); + FPDFText_ClosePage(text_page); + UnloadPage(page); +} -- cgit v1.2.3