From 17b1c191da26e477c6898a8b06f2ff624f9e4c6b Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Wed, 2 Aug 2017 14:27:22 +0300 Subject: Unify parsing of cross refs v4 Change-Id: I7e3d45263a0bae61fd86fd4c3710de7fc0b9347d Reviewed-on: https://pdfium-review.googlesource.com/9290 Reviewed-by: Wei Li Commit-Queue: Art Snake --- core/fpdfapi/parser/cpdf_parser.cpp | 173 ++++++++++++++++++++---------------- core/fpdfapi/parser/cpdf_parser.h | 15 ++++ 2 files changed, 111 insertions(+), 77 deletions(-) (limited to 'core/fpdfapi/parser') diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index 2ca820eb98..e33cec0165 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -440,76 +440,93 @@ bool CPDF_Parser::LoadLinearizedCrossRefV4(FX_FILESIZE pos, m_pSyntax->SetPos(dwStartPos); m_SortedOffset.insert(pos); + std::vector objects; + if (!ParseAndAppendCrossRefSubsectionData(0, dwObjCount, &objects)) + return false; + MergeCrossRefObjectsData(objects); + return true; +} - uint32_t start_objnum = 0; - uint32_t count = dwObjCount; - FX_FILESIZE SavedPos = m_pSyntax->GetPos(); +bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData( + uint32_t start_objnum, + uint32_t count, + std::vector* out_objects) { + // Each entry shall be exactly 20 byte. + // A sample entry looks like: + // "0000000000 00007 f\r\n" + static constexpr int32_t kEntryConstSize = 20; + + if (!out_objects) { + m_pSyntax->SetPos(m_pSyntax->GetPos() + count * kEntryConstSize); + return true; + } + const size_t start_obj_index = out_objects->size(); + out_objects->resize(start_obj_index + count); - const int32_t recordsize = 20; - std::vector buf(1024 * recordsize + 1); - buf[1024 * recordsize] = '\0'; + std::vector buf(1024 * kEntryConstSize + 1); + buf[1024 * kEntryConstSize] = '\0'; int32_t nBlocks = count / 1024 + 1; for (int32_t block = 0; block < nBlocks; block++) { int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024; - uint32_t dwReadSize = block_size * recordsize; - if ((FX_FILESIZE)(dwStartPos + dwReadSize) > m_pSyntax->m_FileLen) - return false; - if (!m_pSyntax->ReadBlock(reinterpret_cast(buf.data()), - dwReadSize)) { + block_size * kEntryConstSize)) return false; - } for (int32_t i = 0; i < block_size; i++) { - uint32_t objnum = start_objnum + block * 1024 + i; - char* pEntry = &buf[i * recordsize]; + CrossRefObjData& obj_data = + (*out_objects)[start_obj_index + block * 1024 + i]; + + const uint32_t objnum = start_objnum + block * 1024 + i; + + obj_data.obj_num = objnum; + + ObjectInfo& info = obj_data.info; + + char* pEntry = &buf[i * kEntryConstSize]; if (pEntry[17] == 'f') { - m_ObjectInfo[objnum].pos = 0; - m_ObjectInfo[objnum].type = ObjectType::kFree; + info.pos = 0; + info.type = ObjectType::kFree; } else { - int32_t offset = FXSYS_atoi(pEntry); - if (offset == 0) { + const FX_SAFE_FILESIZE offset = FXSYS_atoi64(pEntry); + if (!offset.IsValid()) + return false; + + if (offset.ValueOrDie() == 0) { for (int32_t c = 0; c < 10; c++) { if (!std::isdigit(pEntry[c])) return false; } } - m_ObjectInfo[objnum].pos = offset; - int32_t version = FXSYS_atoi(pEntry + 11); - if (version >= 1) - m_bVersionUpdated = true; + info.pos = offset.ValueOrDie(); - m_ObjectInfo[objnum].gennum = version; - if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen) - m_SortedOffset.insert(m_ObjectInfo[objnum].pos); - - m_ObjectInfo[objnum].type = ObjectType::kNotCompressed; + // TODO(art-snake): The info.gennum is uint16_t, but version may be + // greated than max. Needs solve this issue. + const int32_t version = FXSYS_atoi(pEntry + 11); + info.gennum = version; + info.type = ObjectType::kNotCompressed; } } } - m_pSyntax->SetPos(SavedPos + count * recordsize); return true; } -bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos, - FX_FILESIZE streampos, - bool bSkip) { - m_pSyntax->SetPos(pos); +bool CPDF_Parser::ParseCrossRefV4(std::vector* out_objects, + uint32_t* start_obj_num_at_last_block) { + if (out_objects) + out_objects->clear(); + if (m_pSyntax->GetKeyword() != "xref") return false; - - m_SortedOffset.insert(pos); - if (streampos) - m_SortedOffset.insert(streampos); - + std::vector result_objects; while (1) { FX_FILESIZE SavedPos = m_pSyntax->GetPos(); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); - if (word.IsEmpty()) + if (word.IsEmpty()) { return false; + } if (!bIsNumber) { m_pSyntax->SetPos(SavedPos); @@ -519,55 +536,57 @@ bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos, uint32_t start_objnum = FXSYS_atoui(word.c_str()); if (start_objnum >= kMaxObjectNumber) return false; + if (start_obj_num_at_last_block) + *start_obj_num_at_last_block = start_objnum; uint32_t count = m_pSyntax->GetDirectNum(); m_pSyntax->ToNextWord(); SavedPos = m_pSyntax->GetPos(); - const int32_t recordsize = 20; - - m_dwXrefStartObjNum = start_objnum; - if (!bSkip) { - std::vector buf(1024 * recordsize + 1); - buf[1024 * recordsize] = '\0'; - - int32_t nBlocks = count / 1024 + 1; - for (int32_t block = 0; block < nBlocks; block++) { - int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024; - m_pSyntax->ReadBlock(reinterpret_cast(buf.data()), - block_size * recordsize); - - for (int32_t i = 0; i < block_size; i++) { - uint32_t objnum = start_objnum + block * 1024 + i; - char* pEntry = &buf[i * recordsize]; - if (pEntry[17] == 'f') { - m_ObjectInfo[objnum].pos = 0; - m_ObjectInfo[objnum].type = ObjectType::kFree; - } else { - FX_FILESIZE offset = (FX_FILESIZE)FXSYS_atoi64(pEntry); - if (offset == 0) { - for (int32_t c = 0; c < 10; c++) { - if (!std::isdigit(pEntry[c])) - return false; - } - } - m_ObjectInfo[objnum].pos = offset; - int32_t version = FXSYS_atoi(pEntry + 11); - if (version >= 1) - m_bVersionUpdated = true; + if (!ParseAndAppendCrossRefSubsectionData( + start_objnum, count, out_objects ? &result_objects : nullptr)) { + return false; + } + } + if (out_objects) + *out_objects = std::move(result_objects); + return true; +} - m_ObjectInfo[objnum].gennum = version; - if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen) - m_SortedOffset.insert(m_ObjectInfo[objnum].pos); +bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos, + FX_FILESIZE streampos, + bool bSkip) { + m_pSyntax->SetPos(pos); + if (m_pSyntax->GetKeyword() != "xref") + return false; - m_ObjectInfo[objnum].type = ObjectType::kNotCompressed; - } - } + m_SortedOffset.insert(pos); + if (streampos) + m_SortedOffset.insert(streampos); + + m_pSyntax->SetPos(pos); + std::vector objects; + if (!ParseCrossRefV4(bSkip ? nullptr : &objects, &m_dwXrefStartObjNum)) + return false; + + MergeCrossRefObjectsData(objects); + + return !streampos || LoadCrossRefV5(&streampos, false); +} + +void CPDF_Parser::MergeCrossRefObjectsData( + const std::vector& objects) { + for (const auto& obj : objects) { + m_ObjectInfo[obj.obj_num] = obj.info; + if (obj.info.type != ObjectType::kFree) { + if (obj.info.gennum > 0) + m_bVersionUpdated = true; + if (obj.info.type == ObjectType::kNotCompressed && + obj.info.pos < m_pSyntax->m_FileLen) { + m_SortedOffset.insert(obj.info.pos); } } - m_pSyntax->SetPos(SavedPos + count * recordsize); } - return !streampos || LoadCrossRefV5(&streampos, false); } bool CPDF_Parser::LoadAllCrossRefV5(FX_FILESIZE xrefpos) { diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index ece1e6a2d4..759d042360 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -148,6 +148,11 @@ class CPDF_Parser { kEndObj }; + struct CrossRefObjData { + uint32_t obj_num = 0; + ObjectInfo info; + }; + CPDF_Object* ParseDirect(CPDF_Object* pObj); bool LoadAllCrossRefV4(FX_FILESIZE pos); bool LoadAllCrossRefV5(FX_FILESIZE pos); @@ -169,6 +174,16 @@ class CPDF_Parser { // the objects. bool VerifyCrossRefV4(); + // If out_objects is null, the parser position will be moved to end subsection + // without additional validation. + bool ParseAndAppendCrossRefSubsectionData( + uint32_t start_objnum, + uint32_t count, + std::vector* out_objects); + bool ParseCrossRefV4(std::vector* out_objects, + uint32_t* start_obj_num_at_last_block); + void MergeCrossRefObjectsData(const std::vector& objects); + CFX_UnownedPtr m_pDocument; bool m_bHasParsed; bool m_bXRefStream; -- cgit v1.2.3