From 70ddc1ca22ad44a77006491b604a75f6514a4aa8 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Wed, 25 Jul 2018 02:28:35 +0000 Subject: Use document size instead of file size while parsing. We should use document size instead of File size, because all offsets and sizes was read from document should take into account of header offset. Added some tests of parsing of documents with header offset. Also drop friendship of CPDF_SyntaxParser with CPDF_Parser. Change-Id: Iebec75ab2ee07fb644a6c653b4ef5c2e09af09fe Reviewed-on: https://pdfium-review.googlesource.com/35830 Commit-Queue: Art Snake Reviewed-by: Lei Zhang --- core/fpdfapi/edit/cpdf_creator.cpp | 14 ++--- core/fpdfapi/parser/cpdf_linearized_header.cpp | 17 +++--- core/fpdfapi/parser/cpdf_parser.cpp | 11 ++-- core/fpdfapi/parser/cpdf_parser.h | 6 ++- core/fpdfapi/parser/cpdf_parser_unittest.cpp | 71 ++++++++++++++++++++++++-- core/fpdfapi/parser/cpdf_syntax_parser.cpp | 9 ++-- core/fpdfapi/parser/cpdf_syntax_parser.h | 19 +++++-- 7 files changed, 110 insertions(+), 37 deletions(-) diff --git a/core/fpdfapi/edit/cpdf_creator.cpp b/core/fpdfapi/edit/cpdf_creator.cpp index aa8c751046..11b0628b01 100644 --- a/core/fpdfapi/edit/cpdf_creator.cpp +++ b/core/fpdfapi/edit/cpdf_creator.cpp @@ -17,6 +17,7 @@ #include "core/fpdfapi/parser/cpdf_parser.h" #include "core/fpdfapi/parser/cpdf_security_handler.h" #include "core/fpdfapi/parser/cpdf_string.h" +#include "core/fpdfapi/parser/cpdf_syntax_parser.h" #include "core/fpdfapi/parser/fpdf_parser_decode.h" #include "core/fpdfapi/parser/fpdf_parser_utility.h" #include "core/fxcrt/fx_extension.h" @@ -254,20 +255,19 @@ CPDF_Creator::Stage CPDF_Creator::WriteDoc_Stage1() { } m_iStage = Stage::kInitWriteObjs20; } else { - m_SavedOffset = m_pParser->GetFileAccess()->GetSize(); + m_SavedOffset = m_pParser->GetSyntax()->GetDocumentSize(); m_iStage = Stage::kWriteIncremental15; } } if (m_iStage == Stage::kWriteIncremental15) { if (m_IsOriginal && m_SavedOffset > 0) { - RetainPtr pSrcFile = m_pParser->GetFileAccess(); - std::vector buffer(4096); + static constexpr FX_FILESIZE kBufferSize = 4096; + std::vector buffer(kBufferSize); FX_FILESIZE src_size = m_SavedOffset; + m_pParser->GetSyntax()->SetPos(0); while (src_size) { - uint32_t block_size = src_size > 4096 ? 4096 : src_size; - if (!pSrcFile->ReadBlock(buffer.data(), - m_Archive->CurrentOffset() - src_size, - block_size)) { + const FX_FILESIZE block_size = std::min(kBufferSize, src_size); + if (!m_pParser->GetSyntax()->ReadBlock(buffer.data(), block_size)) { return Stage::kInvalid; } if (!m_Archive->WriteBlock(buffer.data(), block_size)) diff --git a/core/fpdfapi/parser/cpdf_linearized_header.cpp b/core/fpdfapi/parser/cpdf_linearized_header.cpp index 5032bc3807..8286e0fbe3 100644 --- a/core/fpdfapi/parser/cpdf_linearized_header.cpp +++ b/core/fpdfapi/parser/cpdf_linearized_header.cpp @@ -38,17 +38,17 @@ bool IsValidNumericDictionaryValue(const CPDF_Dictionary* pDict, } bool IsLinearizedHeaderValid(const CPDF_LinearizedHeader* header, - FX_FILESIZE file_size) { + FX_FILESIZE document_size) { ASSERT(header); - return header->GetFileSize() == file_size && + return header->GetFileSize() == document_size && static_cast(header->GetFirstPageNo()) >= 0 && header->GetFirstPageNo() < kMaxInt && header->GetFirstPageNo() < header->GetPageCount() && - header->GetMainXRefTableFirstEntryOffset() < file_size && + header->GetMainXRefTableFirstEntryOffset() < document_size && header->GetPageCount() > 0 && - header->GetFirstPageEndOffset() < file_size && - header->GetLastXRefOffset() < file_size && - header->GetHintStart() < file_size; + header->GetFirstPageEndOffset() < document_size && + header->GetLastXRefOffset() < document_size && + header->GetHintStart() < document_size; } } // namespace @@ -78,10 +78,9 @@ std::unique_ptr CPDF_LinearizedHeader::Parse( auto result = pdfium::WrapUnique( new CPDF_LinearizedHeader(pDict.get(), parser->GetPos())); - if (!IsLinearizedHeaderValid(result.get(), - parser->GetFileAccess()->GetSize())) { + if (!IsLinearizedHeaderValid(result.get(), parser->GetDocumentSize())) return nullptr; - } + return result; } diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index 16077fd39e..be18560ffc 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -119,10 +119,6 @@ bool CPDF_Parser::IsObjectFree(uint32_t objnum) const { return GetObjectType(objnum) == ObjectType::kFree; } -RetainPtr CPDF_Parser::GetFileAccess() const { - return m_pSyntax->GetFileAccess(); -} - void CPDF_Parser::ShrinkObjectMap(uint32_t objnum) { m_CrossRefTable->ShrinkObjectMap(objnum); } @@ -230,8 +226,7 @@ CPDF_Parser::Error CPDF_Parser::StartParseInternal() { FX_FILESIZE CPDF_Parser::ParseStartXRef() { static constexpr char kStartXRefKeyword[] = "startxref"; - m_pSyntax->SetPos(m_pSyntax->m_FileLen - m_pSyntax->m_HeaderOffset - - strlen(kStartXRefKeyword)); + m_pSyntax->SetPos(m_pSyntax->GetDocumentSize() - strlen(kStartXRefKeyword)); if (!m_pSyntax->BackwardsSearchToWord(kStartXRefKeyword, 4096)) return 0; @@ -245,7 +240,7 @@ FX_FILESIZE CPDF_Parser::ParseStartXRef() { return 0; const FX_SAFE_FILESIZE result = FXSYS_atoi64(xrefpos_str.c_str()); - if (!result.IsValid() || result.ValueOrDie() >= GetFileAccess()->GetSize()) + if (!result.IsValid() || result.ValueOrDie() >= m_pSyntax->GetDocumentSize()) return 0; return result.ValueOrDie(); @@ -453,7 +448,7 @@ bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData( return false; const size_t max_entries_in_file = - m_pSyntax->GetFileAccess()->GetSize() / kEntryConstSize; + m_pSyntax->GetDocumentSize() / kEntryConstSize; if (new_size.ValueOrDie() > max_entries_in_file) return false; diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index 0af39d7529..7cd98c2c26 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -90,7 +90,6 @@ class CPDF_Parser { CPDF_SecurityHandler* GetSecurityHandler() const { return m_pSecurityHandler.get(); } - RetainPtr GetFileAccess() const; bool IsObjectFree(uint32_t objnum) const; int GetFileVersion() const { return m_FileVersion; } @@ -109,6 +108,8 @@ class CPDF_Parser { return m_CrossRefTable.get(); } + CPDF_SyntaxParser* GetSyntax() const { return m_pSyntax.get(); } + void SetLinearizedHeader(std::unique_ptr pLinearized); protected: @@ -121,6 +122,9 @@ class CPDF_Parser { bool RebuildCrossRef(); private: + friend class cpdf_parser_ParseStartXRefWithHeaderOffset_Test; + friend class cpdf_parser_ParseStartXRef_Test; + friend class cpdf_parser_ParseLinearizedWithHeaderOffset_Test; friend class CPDF_DataAvail; struct CrossRefObjData { diff --git a/core/fpdfapi/parser/cpdf_parser_unittest.cpp b/core/fpdfapi/parser/cpdf_parser_unittest.cpp index db48493c2f..4ce1238ee3 100644 --- a/core/fpdfapi/parser/cpdf_parser_unittest.cpp +++ b/core/fpdfapi/parser/cpdf_parser_unittest.cpp @@ -3,8 +3,12 @@ // found in the LICENSE file. #include +#include #include +#include +#include "core/fpdfapi/parser/cpdf_linearized_header.h" +#include "core/fpdfapi/parser/cpdf_object.h" #include "core/fpdfapi/parser/cpdf_parser.h" #include "core/fpdfapi/parser/cpdf_syntax_parser.h" #include "core/fxcrt/fx_extension.h" @@ -43,13 +47,19 @@ class CPDF_TestParser : public CPDF_Parser { } // Setup reading from a buffer and initial states. - bool InitTestFromBuffer(const unsigned char* buffer, size_t len) { - // For the test file, the header is set at the beginning. + bool InitTestFromBufferWithOffset(const unsigned char* buffer, + size_t len, + int header_offset) { m_pSyntax->InitParser( - pdfium::MakeRetain(buffer, len), 0); + pdfium::MakeRetain(buffer, len), + header_offset); return true; } + bool InitTestFromBuffer(const unsigned char* buffer, size_t len) { + return InitTestFromBufferWithOffset(buffer, len, 0 /*header_offset*/); + } + private: // Add test cases here as private friend so that protected members in // CPDF_Parser can be accessed by test cases. @@ -226,3 +236,58 @@ TEST(cpdf_parser, LoadCrossRefV4) { } } } + +TEST(cpdf_parser, ParseStartXRef) { + CPDF_TestParser parser; + std::string test_file; + ASSERT_TRUE( + PathService::GetTestFilePath("annotation_stamp_with_ap.pdf", &test_file)); + ASSERT_TRUE(parser.InitTestFromFile(test_file.c_str())) << test_file; + + EXPECT_EQ(100940, parser.ParseStartXRef()); + std::unique_ptr cross_ref_v5_obj = + parser.ParseIndirectObjectAt(100940, 0); + ASSERT_TRUE(cross_ref_v5_obj); + EXPECT_EQ(75u, cross_ref_v5_obj->GetObjNum()); +} + +TEST(cpdf_parser, ParseStartXRefWithHeaderOffset) { + static constexpr FX_FILESIZE kTestHeaderOffset = 765; + std::string test_file; + ASSERT_TRUE( + PathService::GetTestFilePath("annotation_stamp_with_ap.pdf", &test_file)); + RetainPtr pFileAccess = + IFX_SeekableReadStream::CreateFromFilename(test_file.c_str()); + ASSERT_TRUE(pFileAccess); + + std::vector data(pFileAccess->GetSize() + kTestHeaderOffset); + ASSERT_TRUE(pFileAccess->ReadBlock(&data.front() + kTestHeaderOffset, 0, + pFileAccess->GetSize())); + CPDF_TestParser parser; + parser.InitTestFromBufferWithOffset(&data.front(), data.size(), + kTestHeaderOffset); + + EXPECT_EQ(100940, parser.ParseStartXRef()); + std::unique_ptr cross_ref_v5_obj = + parser.ParseIndirectObjectAt(100940, 0); + ASSERT_TRUE(cross_ref_v5_obj); + EXPECT_EQ(75u, cross_ref_v5_obj->GetObjNum()); +} + +TEST(cpdf_parser, ParseLinearizedWithHeaderOffset) { + static constexpr FX_FILESIZE kTestHeaderOffset = 765; + std::string test_file; + ASSERT_TRUE(PathService::GetTestFilePath("linearized.pdf", &test_file)); + RetainPtr pFileAccess = + IFX_SeekableReadStream::CreateFromFilename(test_file.c_str()); + ASSERT_TRUE(pFileAccess); + + std::vector data(pFileAccess->GetSize() + kTestHeaderOffset); + ASSERT_TRUE(pFileAccess->ReadBlock(&data.front() + kTestHeaderOffset, 0, + pFileAccess->GetSize())); + CPDF_TestParser parser; + parser.InitTestFromBufferWithOffset(&data.front(), data.size(), + kTestHeaderOffset); + + EXPECT_TRUE(parser.ParseLinearizedHeader()); +} diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.cpp b/core/fpdfapi/parser/cpdf_syntax_parser.cpp index f8f36aee11..af5ff488db 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.cpp +++ b/core/fpdfapi/parser/cpdf_syntax_parser.cpp @@ -112,6 +112,10 @@ bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) { return true; } +FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const { + return m_FileLen - m_HeaderOffset; +} + bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) { pos += m_HeaderOffset; if (pos >= m_FileLen) @@ -714,6 +718,7 @@ void CPDF_SyntaxParser::InitParserWithValidator( m_pFileBuf.clear(); m_HeaderOffset = HeaderOffset; m_FileLen = validator->GetSize(); + ASSERT(m_HeaderOffset <= m_FileLen); m_Pos = 0; m_pFileAccess = validator; m_BufOffset = 0; @@ -813,10 +818,6 @@ FX_FILESIZE CPDF_SyntaxParser::FindTag(const ByteStringView& tag) { return -1; } -RetainPtr CPDF_SyntaxParser::GetFileAccess() const { - return m_pFileAccess; -} - bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const { return m_BufOffset <= pos && pos < static_cast(m_BufOffset + m_pFileBuf.size()); diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.h b/core/fpdfapi/parser/cpdf_syntax_parser.h index 962c32753d..ebd844c3e0 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.h +++ b/core/fpdfapi/parser/cpdf_syntax_parser.h @@ -60,16 +60,24 @@ class CPDF_SyntaxParser { ByteString GetNextWord(bool* bIsNumber); ByteString PeekNextWord(bool* bIsNumber); - RetainPtr GetFileAccess() const; - const RetainPtr& GetValidator() const { return m_pFileAccess; } uint32_t GetDirectNum(); bool GetNextChar(uint8_t& ch); + // The document size may be smaller than the file size. + // The syntax parser use position relative to document + // offset (|m_HeaderOffset|). + // The document size will be FileSize - "Header offset". + // All offsets was readed from document, should not be great than document + // size. Use it for checks instead of real file size. + FX_FILESIZE GetDocumentSize() const; + + ByteString ReadString(); + ByteString ReadHexString(); + private: - friend class CPDF_Parser; friend class CPDF_DataAvail; friend class cpdf_syntax_parser_ReadHexString_Test; @@ -84,8 +92,6 @@ class CPDF_SyntaxParser { const ByteStringView& tag, bool checkKeyword); - ByteString ReadString(); - ByteString ReadHexString(); unsigned int ReadEOLMarkers(FX_FILESIZE pos); FX_FILESIZE FindWordPos(const ByteStringView& word); FX_FILESIZE FindStreamEndPos(); @@ -99,6 +105,9 @@ class CPDF_SyntaxParser { ParseType parse_type); FX_FILESIZE m_Pos; + // The syntax parser use position relative to header offset. + // The header contains at file start, and can follow after some stuff. We + // ignore this stuff. FX_FILESIZE m_HeaderOffset; FX_FILESIZE m_FileLen; WeakPtr m_pPool; -- cgit v1.2.3