From e3d3ce04e585c4a6c5596056bdf1ced639c763d7 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Tue, 26 Jun 2018 16:01:38 +0000 Subject: Implement CPDF_ObjectStream. It allows raw object streams not to be stored within CPDF_Document, which reduces memory usage. Change-Id: I4377bd5119d87314e76f14255171618cf6ee533d Reviewed-on: https://pdfium-review.googlesource.com/35430 Reviewed-by: dsinclair Reviewed-by: Tom Sepez Commit-Queue: Art Snake --- core/fpdfapi/parser/cpdf_object_stream.cpp | 136 +++++++++++++++++++++++++++++ core/fpdfapi/parser/cpdf_object_stream.h | 52 +++++++++++ core/fpdfapi/parser/cpdf_parser.cpp | 70 ++++++--------- core/fpdfapi/parser/cpdf_parser.h | 15 +--- core/fpdfapi/parser/cpdf_syntax_parser.h | 2 +- 5 files changed, 219 insertions(+), 56 deletions(-) create mode 100644 core/fpdfapi/parser/cpdf_object_stream.cpp create mode 100644 core/fpdfapi/parser/cpdf_object_stream.h (limited to 'core/fpdfapi/parser') diff --git a/core/fpdfapi/parser/cpdf_object_stream.cpp b/core/fpdfapi/parser/cpdf_object_stream.cpp new file mode 100644 index 0000000000..779fbffd83 --- /dev/null +++ b/core/fpdfapi/parser/cpdf_object_stream.cpp @@ -0,0 +1,136 @@ +// Copyright 2018 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "core/fpdfapi/parser/cpdf_object_stream.h" + +#include + +#include "core/fpdfapi/parser/cpdf_dictionary.h" +#include "core/fpdfapi/parser/cpdf_number.h" +#include "core/fpdfapi/parser/cpdf_parser.h" +#include "core/fpdfapi/parser/cpdf_reference.h" +#include "core/fpdfapi/parser/cpdf_stream.h" +#include "core/fpdfapi/parser/cpdf_stream_acc.h" +#include "core/fxcrt/cfx_memorystream.h" +#include "third_party/base/stl_util.h" + +// static +bool CPDF_ObjectStream::IsObjectsStreamObject(const CPDF_Object* object) { + const CPDF_Stream* stream = ToStream(object); + if (!stream) + return false; + + const CPDF_Dictionary* stream_dict = stream->GetDict(); + if (!stream_dict) + return false; + + if (stream_dict->GetStringFor("Type") != "ObjStm") + return false; + + const CPDF_Number* number_of_objects = + ToNumber(stream_dict->GetObjectFor("N")); + if (!number_of_objects || !number_of_objects->IsInteger() || + number_of_objects->GetInteger() < 0 || + number_of_objects->GetInteger() >= + static_cast(CPDF_Parser::kMaxObjectNumber)) { + return false; + } + + const CPDF_Number* first_object_offset = + ToNumber(stream_dict->GetObjectFor("First")); + if (!first_object_offset || !first_object_offset->IsInteger() || + first_object_offset->GetInteger() < 0) { + return false; + } + + return true; +} + +// static +std::unique_ptr CPDF_ObjectStream::Create( + const CPDF_Stream* stream) { + if (!IsObjectsStreamObject(stream)) + return nullptr; + // The ctor of CPDF_ObjectStream is protected. Use WrapUnique instead + // MakeUnique. 
+ return pdfium::WrapUnique(new CPDF_ObjectStream(stream)); +} + +CPDF_ObjectStream::CPDF_ObjectStream(const CPDF_Stream* obj_stream) + : obj_num_(obj_stream->GetObjNum()), + first_object_offset_(obj_stream->GetDict()->GetIntegerFor("First")) { + DCHECK(IsObjectsStreamObject(obj_stream)); + if (const auto* extends_ref = + ToReference(obj_stream->GetDict()->GetObjectFor("Extends"))) { + extends_obj_num_ = extends_ref->GetRefObjNum(); + } + Init(obj_stream); +} + +CPDF_ObjectStream::~CPDF_ObjectStream() = default; + +bool CPDF_ObjectStream::HasObject(uint32_t obj_number) const { + return pdfium::ContainsKey(objects_offsets_, obj_number); +} + +std::unique_ptr CPDF_ObjectStream::ParseObject( + CPDF_IndirectObjectHolder* pObjList, + uint32_t obj_number) const { + const auto it = objects_offsets_.find(obj_number); + if (it == objects_offsets_.end()) + return nullptr; + + std::unique_ptr result = + ParseObjectAtOffset(pObjList, it->second); + if (!result) + return nullptr; + + result->SetObjNum(obj_number); + return result; +} + +void CPDF_ObjectStream::Init(const CPDF_Stream* stream) { + { + auto stream_acc = pdfium::MakeRetain(stream); + stream_acc->LoadAllDataFiltered(); + const uint32_t data_size = stream_acc->GetSize(); + data_stream_ = pdfium::MakeRetain( + stream_acc->DetachData().release(), static_cast(data_size), + true); + } + + CPDF_SyntaxParser syntax; + syntax.InitParser(data_stream_, 0); + + const int object_count = stream->GetDict()->GetIntegerFor("N"); + for (int32_t i = object_count; i > 0; --i) { + if (syntax.GetPos() >= data_stream_->GetSize()) + break; + + const uint32_t obj_num = syntax.GetDirectNum(); + const uint32_t obj_offset = syntax.GetDirectNum(); + if (!obj_num) + continue; + + objects_offsets_[obj_num] = obj_offset; + } +} + +std::unique_ptr CPDF_ObjectStream::ParseObjectAtOffset( + CPDF_IndirectObjectHolder* pObjList, + uint32_t object_offset) const { + FX_SAFE_FILESIZE offset_in_stream = first_object_offset_; + offset_in_stream += 
object_offset; + + if (!offset_in_stream.IsValid()) + return nullptr; + + if (offset_in_stream.ValueOrDie() >= data_stream_->GetSize()) + return nullptr; + + CPDF_SyntaxParser syntax; + syntax.InitParser(data_stream_, 0); + syntax.SetPos(offset_in_stream.ValueOrDie()); + return syntax.GetObjectBody(pObjList); +} diff --git a/core/fpdfapi/parser/cpdf_object_stream.h b/core/fpdfapi/parser/cpdf_object_stream.h new file mode 100644 index 0000000000..816c1be77e --- /dev/null +++ b/core/fpdfapi/parser/cpdf_object_stream.h @@ -0,0 +1,52 @@ +// Copyright 2018 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CORE_FPDFAPI_PARSER_CPDF_OBJECT_STREAM_H_ +#define CORE_FPDFAPI_PARSER_CPDF_OBJECT_STREAM_H_ + +#include +#include +#include + +#include "core/fpdfapi/parser/cpdf_object.h" +#include "core/fxcrt/retain_ptr.h" + +class CPDF_IndirectObjectHolder; +class CPDF_Stream; +class CPDF_StreamAcc; +class IFX_SeekableReadStream; + +// Implementation of logic of PDF "Object Streams". +// See "PDF 32000-1:2008" Spec. section 7.5.7. 
+class CPDF_ObjectStream { + public: + static bool IsObjectsStreamObject(const CPDF_Object* object); + + static std::unique_ptr Create(const CPDF_Stream* stream); + + ~CPDF_ObjectStream(); + + uint32_t obj_num() const { return obj_num_; } + uint32_t extends_obj_num() const { return extends_obj_num_; } + + bool HasObject(uint32_t obj_number) const; + std::unique_ptr ParseObject(CPDF_IndirectObjectHolder* pObjList, + uint32_t obj_number) const; + + protected: + explicit CPDF_ObjectStream(const CPDF_Stream* stream); + + void Init(const CPDF_Stream* stream); + std::unique_ptr ParseObjectAtOffset( + CPDF_IndirectObjectHolder* pObjList, + uint32_t object_offset) const; + + uint32_t obj_num_ = CPDF_Object::kInvalidObjNum; + uint32_t extends_obj_num_ = CPDF_Object::kInvalidObjNum; + RetainPtr data_stream_; + int first_object_offset_ = 0; + std::map objects_offsets_; +}; + +#endif // CORE_FPDFAPI_PARSER_CPDF_OBJECT_STREAM_H_ diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index 73b4bcdc0c..6cdd6ab789 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -16,6 +16,7 @@ #include "core/fpdfapi/parser/cpdf_document.h" #include "core/fpdfapi/parser/cpdf_linearized_header.h" #include "core/fpdfapi/parser/cpdf_number.h" +#include "core/fpdfapi/parser/cpdf_object_stream.h" #include "core/fpdfapi/parser/cpdf_read_validator.h" #include "core/fpdfapi/parser/cpdf_reference.h" #include "core/fpdfapi/parser/cpdf_security_handler.h" @@ -46,14 +47,6 @@ uint32_t GetVarInt(const uint8_t* p, int32_t n) { return result; } -int32_t GetStreamNCount(const RetainPtr& pObjStream) { - return pObjStream->GetDict()->GetIntegerFor("N"); -} - -int32_t GetStreamFirst(const RetainPtr& pObjStream) { - return pObjStream->GetDict()->GetIntegerFor("First"); -} - } // namespace class CPDF_Parser::TrailerData { @@ -1198,8 +1191,7 @@ std::unique_ptr CPDF_Parser::ParseIndirectObject( return nullptr; pdfium::ScopedSetInsertion 
local_insert(&m_ParsingObjNums, objnum); - if (GetObjectType(objnum) == ObjectType::kNotCompressed || - GetObjectType(objnum) == ObjectType::kNull) { + if (GetObjectType(objnum) == ObjectType::kNotCompressed) { FX_FILESIZE pos = m_ObjectInfo[objnum].pos; if (pos <= 0) return nullptr; @@ -1208,52 +1200,43 @@ std::unique_ptr CPDF_Parser::ParseIndirectObject( if (GetObjectType(objnum) != ObjectType::kCompressed) return nullptr; - RetainPtr pObjStream = - GetObjectStream(m_ObjectInfo[objnum].pos); + const CPDF_ObjectStream* pObjStream = + GetObjectStream(pObjList, m_ObjectInfo[objnum].pos); if (!pObjStream) return nullptr; - auto file = pdfium::MakeRetain( - const_cast(pObjStream->GetData()), - static_cast(pObjStream->GetSize()), false); - CPDF_SyntaxParser syntax; - syntax.InitParser(file, 0); - const int32_t offset = GetStreamFirst(pObjStream); - - // Read object numbers from |pObjStream| into a cache. - if (!pdfium::ContainsKey(m_ObjCache, pObjStream)) { - for (int32_t i = GetStreamNCount(pObjStream); i > 0; --i) { - uint32_t thisnum = syntax.GetDirectNum(); - uint32_t thisoff = syntax.GetDirectNum(); - m_ObjCache[pObjStream][thisnum] = thisoff; - } - } + return pObjStream->ParseObject(pObjList, objnum); +} - const auto it = m_ObjCache[pObjStream].find(objnum); - if (it == m_ObjCache[pObjStream].end()) +const CPDF_ObjectStream* CPDF_Parser::GetObjectStream( + CPDF_IndirectObjectHolder* pObjList, + uint32_t object_number) { + // Prevent circular parsing the same object. 
+ if (pdfium::ContainsKey(m_ParsingObjNums, object_number)) return nullptr; - syntax.SetPos(offset + it->second); - return syntax.GetObjectBody(pObjList); -} + pdfium::ScopedSetInsertion local_insert(&m_ParsingObjNums, + object_number); -RetainPtr CPDF_Parser::GetObjectStream(uint32_t objnum) { - auto it = m_ObjectStreamMap.find(objnum); + auto it = m_ObjectStreamMap.find(object_number); if (it != m_ObjectStreamMap.end()) - return it->second; + return it->second.get(); - if (!m_pDocument) + const FX_FILESIZE object_pos = GetObjectPositionOrZero(object_number); + if (object_pos <= 0) return nullptr; - const CPDF_Stream* pStream = - ToStream(m_pDocument->GetOrParseIndirectObject(objnum)); - if (!pStream) + std::unique_ptr object = + ParseIndirectObjectAt(pObjList, object_pos, object_number); + if (!object) return nullptr; - auto pStreamAcc = pdfium::MakeRetain(pStream); - pStreamAcc->LoadAllDataFiltered(); - m_ObjectStreamMap[objnum] = pStreamAcc; - return pStreamAcc; + std::unique_ptr objs_stream = + CPDF_ObjectStream::Create(ToStream(object.get())); + const CPDF_ObjectStream* result = objs_stream.get(); + m_ObjectStreamMap[object_number] = std::move(objs_stream); + + return result; } std::unique_ptr CPDF_Parser::ParseIndirectObjectAt( @@ -1437,7 +1420,6 @@ CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() { const AutoRestorer save_metadata_objnum(&m_MetadataObjnum); m_MetadataObjnum = 0; m_ObjectStreamMap.clear(); - m_ObjCache.clear(); if (!LoadLinearizedAllCrossRefV4(main_xref_offset.ValueOrDie()) && !LoadLinearizedAllCrossRefV5(main_xref_offset.ValueOrDie())) { diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index 5f0a4a1cf3..705fad9bdb 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -26,9 +26,9 @@ class CPDF_Document; class CPDF_IndirectObjectHolder; class CPDF_LinearizedHeader; class CPDF_Object; +class CPDF_ObjectStream; class CPDF_ReadValidator; class 
CPDF_SecurityHandler; -class CPDF_StreamAcc; class CPDF_SyntaxParser; class IFX_SeekableReadStream; @@ -172,7 +172,8 @@ class CPDF_Parser { bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos); bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos); Error LoadLinearizedMainXRefTable(); - RetainPtr GetObjectStream(uint32_t number); + const CPDF_ObjectStream* GetObjectStream(CPDF_IndirectObjectHolder* pObjList, + uint32_t object_number); std::unique_ptr ParseLinearizedHeader(); void SetEncryptDictionary(CPDF_Dictionary* pDict); void ShrinkObjectMap(uint32_t size); @@ -218,15 +219,7 @@ class CPDF_Parser { std::unique_ptr m_pLinearized; // A map of object numbers to indirect streams. - std::map> m_ObjectStreamMap; - - // Mapping of object numbers to offsets. The offsets are relative to the first - // object in the stream. - using StreamObjectCache = std::map; - - // Mapping of streams to their object caches. This is valid as long as the - // streams in |m_ObjectStreamMap| are valid. - std::map, StreamObjectCache> m_ObjCache; + std::map> m_ObjectStreamMap; // All indirect object numbers that are being parsed. std::set m_ParsingObjNums; diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.h b/core/fpdfapi/parser/cpdf_syntax_parser.h index 5aad2e463c..ed760934b9 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.h +++ b/core/fpdfapi/parser/cpdf_syntax_parser.h @@ -60,6 +60,7 @@ class CPDF_SyntaxParser { const RetainPtr& GetValidator() const { return m_pFileAccess; } + uint32_t GetDirectNum(); private: friend class CPDF_Parser; @@ -69,7 +70,6 @@ class CPDF_SyntaxParser { static const int kParserMaxRecursionDepth = 64; static int s_CurrentRecursionDepth; - uint32_t GetDirectNum(); bool ReadBlockAt(FX_FILESIZE read_pos); bool GetNextChar(uint8_t& ch); bool GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch); -- cgit v1.2.3