From 1beb4a9c5ff7ac58450310493783ef7869f4de71 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Thu, 2 Nov 2017 14:40:38 +0000 Subject: Unify parsing of linearized header. Change-Id: I3b55b1331ee97af254c248d4ac91b627c9603b59 Reviewed-on: https://pdfium-review.googlesource.com/13831 Commit-Queue: Art Snake Reviewed-by: dsinclair --- core/fpdfapi/parser/cpdf_data_avail.cpp | 100 ++++++++++--------------- core/fpdfapi/parser/cpdf_data_avail.h | 4 +- core/fpdfapi/parser/cpdf_linearized_header.cpp | 40 ++++++++-- core/fpdfapi/parser/cpdf_linearized_header.h | 8 +- core/fpdfapi/parser/cpdf_parser.cpp | 34 ++------- core/fpdfapi/parser/cpdf_parser.h | 2 +- 6 files changed, 89 insertions(+), 99 deletions(-) diff --git a/core/fpdfapi/parser/cpdf_data_avail.cpp b/core/fpdfapi/parser/cpdf_data_avail.cpp index 93dd39b87b..2f79e56678 100644 --- a/core/fpdfapi/parser/cpdf_data_avail.cpp +++ b/core/fpdfapi/parser/cpdf_data_avail.cpp @@ -108,6 +108,7 @@ CPDF_DataAvail::CPDF_DataAvail( m_bCurPageDictLoadOK = false; m_bLinearedDataOK = false; m_bSupportHintTable = bSupportHintTable; + m_bHeaderAvail = false; } CPDF_DataAvail::~CPDF_DataAvail() { @@ -413,25 +414,19 @@ bool CPDF_DataAvail::CheckPages() { } bool CPDF_DataAvail::CheckHeader() { - ASSERT(m_dwFileLen >= 0); - const uint32_t kReqSize = std::min(static_cast(m_dwFileLen), 1024U); - std::vector buffer(kReqSize); - { - const CPDF_ReadValidator::Session read_session(GetValidator().Get()); - m_pFileRead->ReadBlock(buffer.data(), 0, kReqSize); - if (GetValidator()->has_read_problems()) + switch (CheckHeaderAndLinearized()) { + case DocAvailStatus::DataAvailable: + m_docStatus = m_pLinearized ? PDF_DATAAVAIL_FIRSTPAGE : PDF_DATAAVAIL_END; + return true; + case DocAvailStatus::DataNotAvailable: + return false; + case DocAvailStatus::DataError: + m_docStatus = PDF_DATAAVAIL_ERROR; + return true; + default: + NOTREACHED(); return false; } - - if (IsLinearizedFile(buffer.data(), kReqSize)) { - m_docStatus = PDF_DATAAVAIL_FIRSTPAGE; - return true; - } - if (m_docStatus == PDF_DATAAVAIL_ERROR) - return false; - - m_docStatus = PDF_DATAAVAIL_END; - return true; } bool CPDF_DataAvail::CheckFirstPage() { @@ -504,56 +499,41 @@ std::unique_ptr CPDF_DataAvail::ParseIndirectObjectAt( } CPDF_DataAvail::DocLinearizationStatus CPDF_DataAvail::IsLinearizedPDF() { - const uint32_t kReqSize = 1024; - if (!m_pFileAvail->IsDataAvail(0, kReqSize)) - return LinearizationUnknown; - - FX_FILESIZE dwSize = m_pFileRead->GetSize(); - if (dwSize < (FX_FILESIZE)kReqSize) - return LinearizationUnknown; - - std::vector buffer(kReqSize); - m_pFileRead->ReadBlock(buffer.data(), 0, kReqSize); - if (IsLinearizedFile(buffer.data(), kReqSize)) - return Linearized; - - return NotLinearized; + switch (CheckHeaderAndLinearized()) { + case DocAvailStatus::DataAvailable: + return m_pLinearized ? DocLinearizationStatus::Linearized + : DocLinearizationStatus::NotLinearized; + case DocAvailStatus::DataNotAvailable: + return DocLinearizationStatus::LinearizationUnknown; + case DocAvailStatus::DataError: + return DocLinearizationStatus::NotLinearized; + default: + NOTREACHED(); + return DocLinearizationStatus::LinearizationUnknown; + } } -bool CPDF_DataAvail::IsLinearized() { - return !!m_pLinearized; -} +CPDF_DataAvail::DocAvailStatus CPDF_DataAvail::CheckHeaderAndLinearized() { + if (m_bHeaderAvail) + return DocAvailStatus::DataAvailable; -bool CPDF_DataAvail::IsLinearizedFile(uint8_t* pData, uint32_t dwLen) { - if (m_pLinearized) - return true; + const CPDF_ReadValidator::Session read_session(GetValidator().Get()); + const int32_t header_offset = GetHeaderOffset(GetValidator()); + if (GetValidator()->has_read_problems()) + return DocAvailStatus::DataNotAvailable; - auto file = pdfium::MakeRetain( - pData, static_cast(dwLen), false); - int32_t offset = GetHeaderOffset(file); - if (offset == kInvalidHeaderOffset) { - m_docStatus = PDF_DATAAVAIL_ERROR; - return false; - } + if (header_offset == kInvalidHeaderOffset) + return DocAvailStatus::DataError; - m_dwHeaderOffset = offset; - m_syntaxParser.InitParser(file, offset); - m_syntaxParser.SetPos(m_syntaxParser.m_HeaderOffset + 9); + m_dwHeaderOffset = header_offset; - bool bNumber; - ByteString wordObjNum = m_syntaxParser.GetNextWord(&bNumber); - if (!bNumber) - return false; + m_syntaxParser.InitParserWithValidator(GetValidator(), header_offset); + m_pLinearized = CPDF_LinearizedHeader::Parse(&m_syntaxParser); + if (GetValidator()->has_read_problems()) + return DocAvailStatus::DataNotAvailable; - uint32_t objnum = FXSYS_atoui(wordObjNum.c_str()); - m_pLinearized = CPDF_LinearizedHeader::CreateForObject( - ParseIndirectObjectAt(m_syntaxParser.m_HeaderOffset + 9, objnum)); - if (!m_pLinearized || - m_pLinearized->GetFileSize() != m_pFileRead->GetSize()) { - m_pLinearized.reset(); - return false; - } - return true; + m_bHeaderAvail = true; + return DocAvailStatus::DataAvailable; } bool CPDF_DataAvail::CheckEnd() { diff --git a/core/fpdfapi/parser/cpdf_data_avail.h b/core/fpdfapi/parser/cpdf_data_avail.h index 2d46be1152..0481408b36 100644 --- a/core/fpdfapi/parser/cpdf_data_avail.h +++ b/core/fpdfapi/parser/cpdf_data_avail.h @@ -103,7 +103,6 @@ class CPDF_DataAvail final { DocAvailStatus IsPageAvail(uint32_t dwPage, DownloadHints* pHints); DocFormStatus IsFormAvail(DownloadHints* pHints); DocLinearizationStatus IsLinearizedPDF(); - bool IsLinearized(); RetainPtr GetFileRead() const; int GetPageCount() const; CPDF_Dictionary* GetPage(int index); @@ -140,7 +139,7 @@ class CPDF_DataAvail final { DocFormStatus CheckAcroForm(); bool CheckPageStatus(); - bool IsLinearizedFile(uint8_t* pData, uint32_t dwLen); + DocAvailStatus CheckHeaderAndLinearized(); void SetStartOffset(FX_FILESIZE dwOffset); bool GetNextToken(ByteString* token); bool GetNextChar(uint8_t& ch); @@ -219,6 +218,7 @@ class CPDF_DataAvail final { std::map> m_PagesObjAvail; std::map> m_PagesResourcesAvail; + bool m_bHeaderAvail; }; #endif // CORE_FPDFAPI_PARSER_CPDF_DATA_AVAIL_H_ diff --git a/core/fpdfapi/parser/cpdf_linearized_header.cpp b/core/fpdfapi/parser/cpdf_linearized_header.cpp index 98cdcc450f..ce22c55f48 100644 --- a/core/fpdfapi/parser/cpdf_linearized_header.cpp +++ b/core/fpdfapi/parser/cpdf_linearized_header.cpp @@ -12,10 +12,13 @@ #include "core/fpdfapi/parser/cpdf_array.h" #include "core/fpdfapi/parser/cpdf_dictionary.h" #include "core/fpdfapi/parser/cpdf_number.h" +#include "core/fpdfapi/parser/cpdf_syntax_parser.h" #include "third_party/base/ptr_util.h" namespace { +constexpr FX_FILESIZE kLinearizedHeaderOffset = 9; + template bool IsValidNumericDictionaryValue(const CPDF_Dictionary* pDict, const char* key, @@ -32,21 +35,48 @@ bool IsValidNumericDictionaryValue(const CPDF_Dictionary* pDict, return static_cast(raw_value) >= min_value; } +bool IsLinearizedHeaderValid(const CPDF_LinearizedHeader* header, + FX_FILESIZE file_size) { + ASSERT(header); + return header->GetFileSize() == file_size && + header->GetMainXRefTableFirstEntryOffset() < file_size && + header->GetPageCount() > 0 && + header->GetFirstPageEndOffset() < file_size && + header->GetLastXRefOffset() < file_size && + header->GetHintStart() < file_size; +} + } // namespace // static -std::unique_ptr CPDF_LinearizedHeader::CreateForObject( - std::unique_ptr pObj) { - auto pDict = ToDictionary(std::move(pObj)); +std::unique_ptr CPDF_LinearizedHeader::Parse( + CPDF_SyntaxParser* parser) { + parser->SetPos(kLinearizedHeaderOffset); + + const auto pDict = ToDictionary( + parser->GetIndirectObject(nullptr, CPDF_SyntaxParser::ParseType::kLoose)); + if (!pDict || !pDict->KeyExist("Linearized") || !IsValidNumericDictionaryValue(pDict.get(), "L", 1) || !IsValidNumericDictionaryValue(pDict.get(), "P", 0, false) || !IsValidNumericDictionaryValue(pDict.get(), "T", 1) || !IsValidNumericDictionaryValue(pDict.get(), "N", 0) || !IsValidNumericDictionaryValue(pDict.get(), "E", 1) || - !IsValidNumericDictionaryValue(pDict.get(), "O", 1)) + !IsValidNumericDictionaryValue(pDict.get(), "O", 1)) { return nullptr; - return pdfium::WrapUnique(new CPDF_LinearizedHeader(pDict.get())); + } + // Move parser to the start of the xref table for the documents first page. + // (skpping endobj keyword) + if (parser->GetNextWord(nullptr) != "endobj") + return nullptr; + + auto result = pdfium::WrapUnique(new CPDF_LinearizedHeader(pDict.get())); + result->m_szLastXRefOffset = parser->GetPos(); + + return IsLinearizedHeaderValid(result.get(), + parser->GetFileAccess()->GetSize()) + ? std::move(result) + : nullptr; } CPDF_LinearizedHeader::CPDF_LinearizedHeader(const CPDF_Dictionary* pDict) { diff --git a/core/fpdfapi/parser/cpdf_linearized_header.h b/core/fpdfapi/parser/cpdf_linearized_header.h index 98ae9c650f..d73216059f 100644 --- a/core/fpdfapi/parser/cpdf_linearized_header.h +++ b/core/fpdfapi/parser/cpdf_linearized_header.h @@ -14,12 +14,13 @@ class CPDF_Dictionary; class CPDF_Object; +class CPDF_SyntaxParser; class CPDF_LinearizedHeader { public: ~CPDF_LinearizedHeader(); - static std::unique_ptr CreateForObject( - std::unique_ptr pObj); + static std::unique_ptr Parse( + CPDF_SyntaxParser* parser); // Will only return values > 0. FX_FILESIZE GetFileSize() const { return m_szFileSize; } @@ -33,6 +34,8 @@ class CPDF_LinearizedHeader { FX_FILESIZE GetFirstPageEndOffset() const { return m_szFirstPageEndOffset; } // Will only return values > 0. uint32_t GetFirstPageObjNum() const { return m_FirstPageObjNum; } + // Will only return values > 0. + FX_FILESIZE GetLastXRefOffset() const { return m_szLastXRefOffset; } bool HasHintTable() const; // Will only return values > 0. @@ -51,6 +54,7 @@ class CPDF_LinearizedHeader { uint32_t m_FirstPageObjNum = 0; FX_FILESIZE m_szHintStart = 0; uint32_t m_HintLength = 0; + FX_FILESIZE m_szLastXRefOffset = 0; }; #endif // CORE_FPDFAPI_PARSER_CPDF_LINEARIZED_HEADER_H_ diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index 6957c84071..7a8f4f9ce7 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -1280,34 +1280,8 @@ uint32_t CPDF_Parser::GetPermissions() const { return dwPermission; } -bool CPDF_Parser::ParseLinearizedHeader() { - m_pSyntax->SetPos(m_pSyntax->m_HeaderOffset + 9); - - FX_FILESIZE SavedPos = m_pSyntax->GetPos(); - bool bIsNumber; - ByteString word = m_pSyntax->GetNextWord(&bIsNumber); - if (!bIsNumber) - return false; - - word = m_pSyntax->GetNextWord(&bIsNumber); - if (!bIsNumber) - return false; - - if (m_pSyntax->GetKeyword() != "obj") { - m_pSyntax->SetPos(SavedPos); - return false; - } - - m_pLinearized = - CPDF_LinearizedHeader::CreateForObject(m_pSyntax->GetObjectBody(nullptr)); - if (!m_pLinearized) - return false; - - // Move parser onto first page xref table start. - m_pSyntax->GetNextWord(nullptr); - - m_LastXRefOffset = m_pSyntax->GetPos(); - return true; +std::unique_ptr CPDF_Parser::ParseLinearizedHeader() { + return CPDF_LinearizedHeader::Parse(m_pSyntax.get()); } CPDF_Parser::Error CPDF_Parser::StartLinearizedParse( @@ -1320,12 +1294,14 @@ CPDF_Parser::Error CPDF_Parser::StartLinearizedParse( if (!InitSyntaxParser(pFileAccess)) return FORMAT_ERROR; - if (!ParseLinearizedHeader()) + m_pLinearized = ParseLinearizedHeader(); + if (!m_pLinearized) return StartParseInternal(std::move(pDocument)); m_bHasParsed = true; m_pDocument = pDocument; + m_LastXRefOffset = m_pLinearized->GetLastXRefOffset(); FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset; bool bXRefRebuilt = false; bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, false); diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index a58838e5fb..6c8cfbd0f8 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -166,7 +166,7 @@ class CPDF_Parser { bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos); Error LoadLinearizedMainXRefTable(); RetainPtr GetObjectStream(uint32_t number); - bool ParseLinearizedHeader(); + std::unique_ptr ParseLinearizedHeader(); void SetEncryptDictionary(CPDF_Dictionary* pDict); void ShrinkObjectMap(uint32_t size); // A simple check whether the cross reference table matches with -- cgit v1.2.3