// Copyright 2016 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include "core/fpdfapi/parser/cpdf_parser.h" #include #include #include #include "core/fpdfapi/parser/cpdf_array.h" #include "core/fpdfapi/parser/cpdf_crypto_handler.h" #include "core/fpdfapi/parser/cpdf_dictionary.h" #include "core/fpdfapi/parser/cpdf_document.h" #include "core/fpdfapi/parser/cpdf_linearized_header.h" #include "core/fpdfapi/parser/cpdf_number.h" #include "core/fpdfapi/parser/cpdf_reference.h" #include "core/fpdfapi/parser/cpdf_security_handler.h" #include "core/fpdfapi/parser/cpdf_stream.h" #include "core/fpdfapi/parser/cpdf_stream_acc.h" #include "core/fpdfapi/parser/cpdf_syntax_parser.h" #include "core/fpdfapi/parser/fpdf_parser_utility.h" #include "core/fxcrt/fx_ext.h" #include "core/fxcrt/fx_safe_types.h" #include "third_party/base/ptr_util.h" #include "third_party/base/stl_util.h" namespace { // A limit on the size of the xref table. Theoretical limits are higher, but // this may be large enough in practice. const int32_t kMaxXRefSize = 1048576; uint32_t GetVarInt(const uint8_t* p, int32_t n) { uint32_t result = 0; for (int32_t i = 0; i < n; ++i) result = result * 256 + p[i]; return result; } int32_t GetStreamNCount(const CFX_RetainPtr& pObjStream) { return pObjStream->GetDict()->GetIntegerFor("N"); } int32_t GetStreamFirst(const CFX_RetainPtr& pObjStream) { return pObjStream->GetDict()->GetIntegerFor("First"); } } // namespace CPDF_Parser::CPDF_Parser() : m_pDocument(nullptr), m_bHasParsed(false), m_bXRefStream(false), m_bVersionUpdated(false), m_FileVersion(0), m_pEncryptDict(nullptr), m_dwXrefStartObjNum(0) { m_pSyntax = pdfium::MakeUnique(); } CPDF_Parser::~CPDF_Parser() { ReleaseEncryptHandler(); SetEncryptDictionary(nullptr); } uint32_t CPDF_Parser::GetLastObjNum() const { return m_ObjectInfo.empty() ? 0 : m_ObjectInfo.rbegin()->first; } bool CPDF_Parser::IsValidObjectNumber(uint32_t objnum) const { return !m_ObjectInfo.empty() && objnum <= m_ObjectInfo.rbegin()->first; } FX_FILESIZE CPDF_Parser::GetObjectPositionOrZero(uint32_t objnum) const { auto it = m_ObjectInfo.find(objnum); return it != m_ObjectInfo.end() ? it->second.pos : 0; } uint8_t CPDF_Parser::GetObjectType(uint32_t objnum) const { ASSERT(IsValidObjectNumber(objnum)); auto it = m_ObjectInfo.find(objnum); return it != m_ObjectInfo.end() ? it->second.type : 0; } uint16_t CPDF_Parser::GetObjectGenNum(uint32_t objnum) const { ASSERT(IsValidObjectNumber(objnum)); auto it = m_ObjectInfo.find(objnum); return it != m_ObjectInfo.end() ? it->second.gennum : 0; } bool CPDF_Parser::IsObjectFreeOrNull(uint32_t objnum) const { uint8_t type = GetObjectType(objnum); return type == 0 || type == 255; } void CPDF_Parser::SetEncryptDictionary(CPDF_Dictionary* pDict) { m_pEncryptDict = pDict; } CFX_RetainPtr CPDF_Parser::GetCryptoHandler() const { return m_pSyntax->m_pCryptoHandler; } CFX_RetainPtr CPDF_Parser::GetFileAccess() const { return m_pSyntax->m_pFileAccess; } void CPDF_Parser::ShrinkObjectMap(uint32_t objnum) { if (objnum == 0) { m_ObjectInfo.clear(); return; } auto it = m_ObjectInfo.lower_bound(objnum); while (it != m_ObjectInfo.end()) { auto saved_it = it++; m_ObjectInfo.erase(saved_it); } if (!pdfium::ContainsKey(m_ObjectInfo, objnum - 1)) m_ObjectInfo[objnum - 1].pos = 0; } CPDF_Parser::Error CPDF_Parser::StartParse( const CFX_RetainPtr& pFileAccess, CPDF_Document* pDocument) { ASSERT(!m_bHasParsed); m_bHasParsed = true; m_bXRefStream = false; m_LastXRefOffset = 0; int32_t offset = GetHeaderOffset(pFileAccess); if (offset == -1) return FORMAT_ERROR; m_pSyntax->InitParser(pFileAccess, offset); uint8_t ch; if (!m_pSyntax->GetCharAt(5, ch)) return FORMAT_ERROR; if (std::isdigit(ch)) m_FileVersion = FXSYS_toDecimalDigit(static_cast(ch)) * 10; if (!m_pSyntax->GetCharAt(7, ch)) return FORMAT_ERROR; if (std::isdigit(ch)) m_FileVersion += FXSYS_toDecimalDigit(static_cast(ch)); if (m_pSyntax->m_FileLen < m_pSyntax->m_HeaderOffset + 9) return FORMAT_ERROR; m_pSyntax->RestorePos(m_pSyntax->m_FileLen - m_pSyntax->m_HeaderOffset - 9); m_pDocument = pDocument; bool bXRefRebuilt = false; if (m_pSyntax->SearchWord("startxref", true, false, 4096)) { m_SortedOffset.insert(m_pSyntax->SavePos()); m_pSyntax->GetKeyword(); bool bNumber; CFX_ByteString xrefpos_str = m_pSyntax->GetNextWord(&bNumber); if (!bNumber) return FORMAT_ERROR; m_LastXRefOffset = (FX_FILESIZE)FXSYS_atoi64(xrefpos_str.c_str()); if (!LoadAllCrossRefV4(m_LastXRefOffset) && !LoadAllCrossRefV5(m_LastXRefOffset)) { if (!RebuildCrossRef()) return FORMAT_ERROR; bXRefRebuilt = true; m_LastXRefOffset = 0; } } else { if (!RebuildCrossRef()) return FORMAT_ERROR; bXRefRebuilt = true; } Error eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; m_pDocument->LoadDoc(); if (!m_pDocument->GetRoot() || m_pDocument->GetPageCount() == 0) { if (bXRefRebuilt) return FORMAT_ERROR; ReleaseEncryptHandler(); if (!RebuildCrossRef()) return FORMAT_ERROR; eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; m_pDocument->LoadDoc(); if (!m_pDocument->GetRoot()) return FORMAT_ERROR; } if (GetRootObjNum() == 0) { ReleaseEncryptHandler(); if (!RebuildCrossRef() || GetRootObjNum() == 0) return FORMAT_ERROR; eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; } if (m_pSecurityHandler && !m_pSecurityHandler->IsMetadataEncrypted()) { CPDF_Reference* pMetadata = ToReference(m_pDocument->GetRoot()->GetObjectFor("Metadata")); if (pMetadata) m_pSyntax->m_MetadataObjnum = pMetadata->GetRefObjNum(); } return SUCCESS; } CPDF_Parser::Error CPDF_Parser::SetEncryptHandler() { ReleaseEncryptHandler(); SetEncryptDictionary(nullptr); if (!m_pTrailer) return FORMAT_ERROR; CPDF_Object* pEncryptObj = m_pTrailer->GetObjectFor("Encrypt"); if (pEncryptObj) { if (CPDF_Dictionary* pEncryptDict = pEncryptObj->AsDictionary()) { SetEncryptDictionary(pEncryptDict); } else if (CPDF_Reference* pRef = pEncryptObj->AsReference()) { pEncryptObj = m_pDocument->GetOrParseIndirectObject(pRef->GetRefObjNum()); if (pEncryptObj) SetEncryptDictionary(pEncryptObj->GetDict()); } } if (m_pEncryptDict) { CFX_ByteString filter = m_pEncryptDict->GetStringFor("Filter"); if (filter != "Standard") return HANDLER_ERROR; std::unique_ptr pSecurityHandler = pdfium::MakeUnique(); if (!pSecurityHandler->OnInit(this, m_pEncryptDict)) return PASSWORD_ERROR; m_pSecurityHandler = std::move(pSecurityHandler); auto pCryptoHandler = pdfium::MakeRetain(); if (!pCryptoHandler->Init(m_pEncryptDict, m_pSecurityHandler.get())) return HANDLER_ERROR; m_pSyntax->SetEncrypt(pCryptoHandler); } return SUCCESS; } void CPDF_Parser::ReleaseEncryptHandler() { m_pSyntax->m_pCryptoHandler.Reset(); m_pSecurityHandler.reset(); } FX_FILESIZE CPDF_Parser::GetObjectOffset(uint32_t objnum) const { if (!IsValidObjectNumber(objnum)) return 0; if (GetObjectType(objnum) == 1) return GetObjectPositionOrZero(objnum); if (GetObjectType(objnum) == 2) { FX_FILESIZE pos = GetObjectPositionOrZero(objnum); return GetObjectPositionOrZero(pos); } return 0; } // Ideally, all the cross reference entries should be verified. // In reality, we rarely see well-formed cross references don't match // with the objects. crbug/602650 showed a case where object numbers // in the cross reference table are all off by one. bool CPDF_Parser::VerifyCrossRefV4() { for (const auto& it : m_ObjectInfo) { if (it.second.pos == 0) continue; // Find the first non-zero position. FX_FILESIZE SavedPos = m_pSyntax->SavePos(); m_pSyntax->RestorePos(it.second.pos); bool is_num = false; CFX_ByteString num_str = m_pSyntax->GetNextWord(&is_num); m_pSyntax->RestorePos(SavedPos); if (!is_num || num_str.IsEmpty() || FXSYS_atoui(num_str.c_str()) != it.first) { // If the object number read doesn't match the one stored, // something is wrong with the cross reference table. return false; } else { return true; } } return true; } bool CPDF_Parser::LoadAllCrossRefV4(FX_FILESIZE xrefpos) { if (!LoadCrossRefV4(xrefpos, 0, true)) return false; m_pTrailer = LoadTrailerV4(); if (!m_pTrailer) return false; int32_t xrefsize = GetDirectInteger(m_pTrailer.get(), "Size"); if (xrefsize > 0 && xrefsize <= kMaxXRefSize) ShrinkObjectMap(xrefsize); std::vector CrossRefList; std::vector XRefStreamList; std::set seen_xrefpos; CrossRefList.push_back(xrefpos); XRefStreamList.push_back(GetDirectInteger(m_pTrailer.get(), "XRefStm")); seen_xrefpos.insert(xrefpos); // When |m_pTrailer| doesn't have Prev entry or Prev entry value is not // numerical, GetDirectInteger() returns 0. Loading will end. xrefpos = GetDirectInteger(m_pTrailer.get(), "Prev"); while (xrefpos) { // Check for circular references. if (pdfium::ContainsKey(seen_xrefpos, xrefpos)) return false; seen_xrefpos.insert(xrefpos); // SLOW ... CrossRefList.insert(CrossRefList.begin(), xrefpos); LoadCrossRefV4(xrefpos, 0, true); std::unique_ptr pDict(LoadTrailerV4()); if (!pDict) return false; xrefpos = GetDirectInteger(pDict.get(), "Prev"); // SLOW ... XRefStreamList.insert(XRefStreamList.begin(), pDict->GetIntegerFor("XRefStm")); m_Trailers.push_back(std::move(pDict)); } for (size_t i = 0; i < CrossRefList.size(); ++i) { if (!LoadCrossRefV4(CrossRefList[i], XRefStreamList[i], false)) return false; if (i == 0 && !VerifyCrossRefV4()) return false; } return true; } bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos, uint32_t dwObjCount) { if (!LoadLinearizedCrossRefV4(xrefpos, dwObjCount)) return false; m_pTrailer = LoadTrailerV4(); if (!m_pTrailer) return false; int32_t xrefsize = GetDirectInteger(m_pTrailer.get(), "Size"); if (xrefsize == 0) return false; std::vector CrossRefList; std::vector XRefStreamList; std::set seen_xrefpos; CrossRefList.push_back(xrefpos); XRefStreamList.push_back(GetDirectInteger(m_pTrailer.get(), "XRefStm")); seen_xrefpos.insert(xrefpos); xrefpos = GetDirectInteger(m_pTrailer.get(), "Prev"); while (xrefpos) { // Check for circular references. if (pdfium::ContainsKey(seen_xrefpos, xrefpos)) return false; seen_xrefpos.insert(xrefpos); // SLOW ... CrossRefList.insert(CrossRefList.begin(), xrefpos); LoadCrossRefV4(xrefpos, 0, true); std::unique_ptr pDict(LoadTrailerV4()); if (!pDict) return false; xrefpos = GetDirectInteger(pDict.get(), "Prev"); // SLOW ... XRefStreamList.insert(XRefStreamList.begin(), pDict->GetIntegerFor("XRefStm")); m_Trailers.push_back(std::move(pDict)); } for (size_t i = 1; i < CrossRefList.size(); ++i) { if (!LoadCrossRefV4(CrossRefList[i], XRefStreamList[i], false)) return false; } return true; } bool CPDF_Parser::LoadLinearizedCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount) { FX_FILESIZE dwStartPos = pos - m_pSyntax->m_HeaderOffset; m_pSyntax->RestorePos(dwStartPos); m_SortedOffset.insert(pos); uint32_t start_objnum = 0; uint32_t count = dwObjCount; FX_FILESIZE SavedPos = m_pSyntax->SavePos(); const int32_t recordsize = 20; std::vector buf(1024 * recordsize + 1); buf[1024 * recordsize] = '\0'; int32_t nBlocks = count / 1024 + 1; for (int32_t block = 0; block < nBlocks; block++) { int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024; uint32_t dwReadSize = block_size * recordsize; if ((FX_FILESIZE)(dwStartPos + dwReadSize) > m_pSyntax->m_FileLen) return false; if (!m_pSyntax->ReadBlock(reinterpret_cast(buf.data()), dwReadSize)) { return false; } for (int32_t i = 0; i < block_size; i++) { uint32_t objnum = start_objnum + block * 1024 + i; char* pEntry = &buf[i * recordsize]; if (pEntry[17] == 'f') { m_ObjectInfo[objnum].pos = 0; m_ObjectInfo[objnum].type = 0; } else { int32_t offset = FXSYS_atoi(pEntry); if (offset == 0) { for (int32_t c = 0; c < 10; c++) { if (!std::isdigit(pEntry[c])) return false; } } m_ObjectInfo[objnum].pos = offset; int32_t version = FXSYS_atoi(pEntry + 11); if (version >= 1) m_bVersionUpdated = true; m_ObjectInfo[objnum].gennum = version; if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen) m_SortedOffset.insert(m_ObjectInfo[objnum].pos); m_ObjectInfo[objnum].type = 1; } } } m_pSyntax->RestorePos(SavedPos + count * recordsize); return true; } bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos, FX_FILESIZE streampos, bool bSkip) { m_pSyntax->RestorePos(pos); if (m_pSyntax->GetKeyword() != "xref") return false; m_SortedOffset.insert(pos); if (streampos) m_SortedOffset.insert(streampos); while (1) { FX_FILESIZE SavedPos = m_pSyntax->SavePos(); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); if (word.IsEmpty()) return false; if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); break; } uint32_t start_objnum = FXSYS_atoui(word.c_str()); if (start_objnum >= kMaxObjectNumber) return false; uint32_t count = m_pSyntax->GetDirectNum(); m_pSyntax->ToNextWord(); SavedPos = m_pSyntax->SavePos(); const int32_t recordsize = 20; m_dwXrefStartObjNum = start_objnum; if (!bSkip) { std::vector buf(1024 * recordsize + 1); buf[1024 * recordsize] = '\0'; int32_t nBlocks = count / 1024 + 1; for (int32_t block = 0; block < nBlocks; block++) { int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024; m_pSyntax->ReadBlock(reinterpret_cast(buf.data()), block_size * recordsize); for (int32_t i = 0; i < block_size; i++) { uint32_t objnum = start_objnum + block * 1024 + i; char* pEntry = &buf[i * recordsize]; if (pEntry[17] == 'f') { m_ObjectInfo[objnum].pos = 0; m_ObjectInfo[objnum].type = 0; } else { FX_FILESIZE offset = (FX_FILESIZE)FXSYS_atoi64(pEntry); if (offset == 0) { for (int32_t c = 0; c < 10; c++) { if (!std::isdigit(pEntry[c])) return false; } } m_ObjectInfo[objnum].pos = offset; int32_t version = FXSYS_atoi(pEntry + 11); if (version >= 1) m_bVersionUpdated = true; m_ObjectInfo[objnum].gennum = version; if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen) m_SortedOffset.insert(m_ObjectInfo[objnum].pos); m_ObjectInfo[objnum].type = 1; } } } } m_pSyntax->RestorePos(SavedPos + count * recordsize); } return !streampos || LoadCrossRefV5(&streampos, false); } bool CPDF_Parser::LoadAllCrossRefV5(FX_FILESIZE xrefpos) { if (!LoadCrossRefV5(&xrefpos, true)) return false; std::set seen_xrefpos; while (xrefpos) { seen_xrefpos.insert(xrefpos); if (!LoadCrossRefV5(&xrefpos, false)) return false; // Check for circular references. if (pdfium::ContainsKey(seen_xrefpos, xrefpos)) return false; } m_ObjectStreamMap.clear(); m_bXRefStream = true; return true; } bool CPDF_Parser::RebuildCrossRef() { m_ObjectInfo.clear(); m_SortedOffset.clear(); m_pTrailer.reset(); ParserState state = ParserState::kDefault; int32_t inside_index = 0; uint32_t objnum = 0; uint32_t gennum = 0; int32_t depth = 0; const uint32_t kBufferSize = 4096; std::vector buffer(kBufferSize); FX_FILESIZE pos = m_pSyntax->m_HeaderOffset; FX_FILESIZE start_pos = 0; FX_FILESIZE start_pos1 = 0; FX_FILESIZE last_obj = -1; FX_FILESIZE last_xref = -1; FX_FILESIZE last_trailer = -1; while (pos < m_pSyntax->m_FileLen) { const FX_FILESIZE saved_pos = pos; bool bOverFlow = false; uint32_t size = std::min((uint32_t)(m_pSyntax->m_FileLen - pos), kBufferSize); if (!m_pSyntax->m_pFileAccess->ReadBlock(buffer.data(), pos, size)) break; for (uint32_t i = 0; i < size; i++) { uint8_t byte = buffer[i]; switch (state) { case ParserState::kDefault: if (PDFCharIsWhitespace(byte)) { state = ParserState::kWhitespace; } else if (std::isdigit(byte)) { --i; state = ParserState::kWhitespace; } else if (byte == '%') { inside_index = 0; state = ParserState::kComment; } else if (byte == '(') { state = ParserState::kString; depth = 1; } else if (byte == '<') { inside_index = 1; state = ParserState::kHexString; } else if (byte == '\\') { state = ParserState::kEscapedString; } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } break; case ParserState::kWhitespace: if (std::isdigit(byte)) { start_pos = pos + i; state = ParserState::kObjNum; objnum = FXSYS_toDecimalDigit(static_cast(byte)); } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } else if (byte == 'x') { state = ParserState::kXref; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { --i; state = ParserState::kDefault; } break; case ParserState::kObjNum: if (std::isdigit(byte)) { objnum = objnum * 10 + FXSYS_toDecimalDigit(static_cast(byte)); } else if (PDFCharIsWhitespace(byte)) { state = ParserState::kPostObjNum; } else { --i; state = ParserState::kEndObj; inside_index = 0; } break; case ParserState::kPostObjNum: if (std::isdigit(byte)) { start_pos1 = pos + i; state = ParserState::kGenNum; gennum = FXSYS_toDecimalDigit(static_cast(byte)); } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { --i; state = ParserState::kDefault; } break; case ParserState::kGenNum: if (std::isdigit(byte)) { gennum = gennum * 10 + FXSYS_toDecimalDigit(static_cast(byte)); } else if (PDFCharIsWhitespace(byte)) { state = ParserState::kPostGenNum; } else { --i; state = ParserState::kDefault; } break; case ParserState::kPostGenNum: if (byte == 'o') { state = ParserState::kBeginObj; inside_index = 1; } else if (std::isdigit(byte)) { objnum = gennum; gennum = FXSYS_toDecimalDigit(static_cast(byte)); start_pos = start_pos1; start_pos1 = pos + i; state = ParserState::kGenNum; } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { --i; state = ParserState::kDefault; } break; case ParserState::kBeginObj: switch (inside_index) { case 1: if (byte != 'b') { --i; state = ParserState::kDefault; } else { inside_index++; } break; case 2: if (byte != 'j') { --i; state = ParserState::kDefault; } else { inside_index++; } break; case 3: if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { FX_FILESIZE obj_pos = start_pos - m_pSyntax->m_HeaderOffset; m_SortedOffset.insert(obj_pos); last_obj = start_pos; FX_FILESIZE obj_end = 0; std::unique_ptr pObject = ParseIndirectObjectAtByStrict(m_pDocument, obj_pos, objnum, &obj_end); if (CPDF_Stream* pStream = ToStream(pObject.get())) { if (CPDF_Dictionary* pDict = pStream->GetDict()) { if ((pDict->KeyExist("Type")) && (pDict->GetStringFor("Type") == "XRef" && pDict->KeyExist("Size"))) { CPDF_Object* pRoot = pDict->GetObjectFor("Root"); if (pRoot && pRoot->GetDict() && pRoot->GetDict()->GetObjectFor("Pages")) { m_pTrailer = ToDictionary(pDict->Clone()); } } } } FX_FILESIZE offset = 0; m_pSyntax->RestorePos(obj_pos); offset = m_pSyntax->FindTag("obj", 0); if (offset == -1) offset = 0; else offset += 3; FX_FILESIZE nLen = obj_end - obj_pos - offset; if ((uint32_t)nLen > size - i) { pos = obj_end + m_pSyntax->m_HeaderOffset; bOverFlow = true; } else { i += (uint32_t)nLen; } if (!m_ObjectInfo.empty() && IsValidObjectNumber(objnum) && m_ObjectInfo[objnum].pos) { if (pObject) { uint32_t oldgen = GetObjectGenNum(objnum); m_ObjectInfo[objnum].pos = obj_pos; m_ObjectInfo[objnum].gennum = gennum; if (oldgen != gennum) m_bVersionUpdated = true; } } else { m_ObjectInfo[objnum].pos = obj_pos; m_ObjectInfo[objnum].type = 1; m_ObjectInfo[objnum].gennum = gennum; } } --i; state = ParserState::kDefault; break; } break; case ParserState::kTrailer: if (inside_index == 7) { if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { last_trailer = pos + i - 7; m_pSyntax->RestorePos(pos + i - m_pSyntax->m_HeaderOffset); std::unique_ptr pObj = m_pSyntax->GetObject(m_pDocument, 0, 0, true); if (pObj) { if (pObj->IsDictionary() || pObj->AsStream()) { CPDF_Stream* pStream = pObj->AsStream(); if (CPDF_Dictionary* pTrailer = pStream ? pStream->GetDict() : pObj->AsDictionary()) { if (m_pTrailer) { CPDF_Object* pRoot = pTrailer->GetObjectFor("Root"); CPDF_Reference* pRef = ToReference(pRoot); if (!pRoot || (pRef && IsValidObjectNumber(pRef->GetRefObjNum()) && m_ObjectInfo[pRef->GetRefObjNum()].pos != 0)) { auto it = pTrailer->begin(); while (it != pTrailer->end()) { const CFX_ByteString& key = it->first; CPDF_Object* pElement = it->second.get(); ++it; uint32_t dwObjNum = pElement ? pElement->GetObjNum() : 0; if (dwObjNum) { m_pTrailer->SetNewFor( key, m_pDocument, dwObjNum); } else { m_pTrailer->SetFor(key, pElement->Clone()); } } } } else { if (pObj->IsStream()) { m_pTrailer = ToDictionary(pTrailer->Clone()); } else { m_pTrailer = ToDictionary(std::move(pObj)); } FX_FILESIZE dwSavePos = m_pSyntax->SavePos(); CFX_ByteString strWord = m_pSyntax->GetKeyword(); if (!strWord.Compare("startxref")) { bool bNumber; CFX_ByteString bsOffset = m_pSyntax->GetNextWord(&bNumber); if (bNumber) m_LastXRefOffset = FXSYS_atoi(bsOffset.c_str()); } m_pSyntax->RestorePos(dwSavePos); } } } } } --i; state = ParserState::kDefault; } else if (byte == "trailer"[inside_index]) { inside_index++; } else { --i; state = ParserState::kDefault; } break; case ParserState::kXref: if (inside_index == 4) { last_xref = pos + i - 4; state = ParserState::kWhitespace; } else if (byte == "xref"[inside_index]) { inside_index++; } else { --i; state = ParserState::kDefault; } break; case ParserState::kComment: if (PDFCharIsLineEnding(byte)) state = ParserState::kDefault; break; case ParserState::kString: if (byte == ')') { if (depth > 0) depth--; } else if (byte == '(') { depth++; } if (!depth) state = ParserState::kDefault; break; case ParserState::kHexString: if (byte == '>' || (byte == '<' && inside_index == 1)) state = ParserState::kDefault; inside_index = 0; break; case ParserState::kEscapedString: if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) { --i; state = ParserState::kDefault; } break; case ParserState::kEndObj: if (PDFCharIsWhitespace(byte)) { state = ParserState::kDefault; } else if (byte == '%' || byte == '(' || byte == '<' || byte == '\\') { state = ParserState::kDefault; --i; } else if (inside_index == 6) { state = ParserState::kDefault; --i; } else if (byte == "endobj"[inside_index]) { inside_index++; } break; } if (bOverFlow) { size = 0; break; } } pos += size; // If the position has not changed at all or went backwards in a loop // iteration, then break out to prevent infinite looping. if (pos <= saved_pos) break; } if (last_xref != -1 && last_xref > last_obj) last_trailer = last_xref; else if (last_trailer == -1 || last_xref < last_obj) last_trailer = m_pSyntax->m_FileLen; m_SortedOffset.insert(last_trailer - m_pSyntax->m_HeaderOffset); return m_pTrailer && !m_ObjectInfo.empty(); } bool CPDF_Parser::LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef) { std::unique_ptr pObject( ParseIndirectObjectAt(m_pDocument, *pos, 0)); if (!pObject) return false; uint32_t objnum = pObject->m_ObjNum; if (!objnum) return false; CPDF_Object* pUnownedObject = pObject.get(); if (m_pDocument) { CPDF_Dictionary* pRootDict = m_pDocument->GetRoot(); if (pRootDict && pRootDict->GetObjNum() == objnum) return false; if (!m_pDocument->ReplaceIndirectObjectIfHigherGeneration( objnum, std::move(pObject))) { return false; } } CPDF_Stream* pStream = pUnownedObject->AsStream(); if (!pStream) return false; CPDF_Dictionary* pDict = pStream->GetDict(); *pos = pDict->GetIntegerFor("Prev"); int32_t size = pDict->GetIntegerFor("Size"); if (size < 0) return false; std::unique_ptr pNewTrailer = ToDictionary(pDict->Clone()); if (bMainXRef) { m_pTrailer = std::move(pNewTrailer); ShrinkObjectMap(size); for (auto& it : m_ObjectInfo) it.second.type = 0; } else { m_Trailers.push_back(std::move(pNewTrailer)); } std::vector> arrIndex; CPDF_Array* pArray = pDict->GetArrayFor("Index"); if (pArray) { for (size_t i = 0; i < pArray->GetCount() / 2; i++) { CPDF_Object* pStartNumObj = pArray->GetObjectAt(i * 2); CPDF_Object* pCountObj = pArray->GetObjectAt(i * 2 + 1); if (ToNumber(pStartNumObj) && ToNumber(pCountObj)) { int nStartNum = pStartNumObj->GetInteger(); int nCount = pCountObj->GetInteger(); if (nStartNum >= 0 && nCount > 0) arrIndex.push_back(std::make_pair(nStartNum, nCount)); } } } if (arrIndex.size() == 0) arrIndex.push_back(std::make_pair(0, size)); pArray = pDict->GetArrayFor("W"); if (!pArray) return false; std::vector WidthArray; FX_SAFE_UINT32 dwAccWidth = 0; for (size_t i = 0; i < pArray->GetCount(); ++i) { WidthArray.push_back(pArray->GetIntegerAt(i)); dwAccWidth += WidthArray[i]; } if (!dwAccWidth.IsValid() || WidthArray.size() < 3) return false; uint32_t totalWidth = dwAccWidth.ValueOrDie(); auto pAcc = pdfium::MakeRetain(pStream); pAcc->LoadAllData(); const uint8_t* pData = pAcc->GetData(); uint32_t dwTotalSize = pAcc->GetSize(); uint32_t segindex = 0; for (uint32_t i = 0; i < arrIndex.size(); i++) { int32_t startnum = arrIndex[i].first; if (startnum < 0) continue; m_dwXrefStartObjNum = pdfium::base::checked_cast(startnum); uint32_t count = pdfium::base::checked_cast(arrIndex[i].second); FX_SAFE_UINT32 dwCaculatedSize = segindex; dwCaculatedSize += count; dwCaculatedSize *= totalWidth; if (!dwCaculatedSize.IsValid() || dwCaculatedSize.ValueOrDie() > dwTotalSize) { continue; } const uint8_t* segstart = pData + segindex * totalWidth; FX_SAFE_UINT32 dwMaxObjNum = startnum; dwMaxObjNum += count; uint32_t dwV5Size = m_ObjectInfo.empty() ? 0 : GetLastObjNum() + 1; if (!dwMaxObjNum.IsValid() || dwMaxObjNum.ValueOrDie() > dwV5Size) continue; for (uint32_t j = 0; j < count; j++) { int32_t type = 1; const uint8_t* entrystart = segstart + j * totalWidth; if (WidthArray[0]) type = GetVarInt(entrystart, WidthArray[0]); if (GetObjectType(startnum + j) == 255) { FX_FILESIZE offset = GetVarInt(entrystart + WidthArray[0], WidthArray[1]); m_ObjectInfo[startnum + j].pos = offset; m_SortedOffset.insert(offset); continue; } if (GetObjectType(startnum + j)) continue; m_ObjectInfo[startnum + j].type = type; if (type == 0) { m_ObjectInfo[startnum + j].pos = 0; } else { FX_FILESIZE offset = GetVarInt(entrystart + WidthArray[0], WidthArray[1]); m_ObjectInfo[startnum + j].pos = offset; if (type == 1) { m_SortedOffset.insert(offset); } else { if (offset < 0 || !IsValidObjectNumber(offset)) return false; m_ObjectInfo[offset].type = 255; } } } segindex += count; } return true; } CPDF_Array* CPDF_Parser::GetIDArray() { if (!m_pTrailer) return nullptr; CPDF_Object* pID = m_pTrailer->GetObjectFor("ID"); if (!pID) return nullptr; CPDF_Reference* pRef = pID->AsReference(); if (!pRef) return ToArray(pID); std::unique_ptr pNewObj = ParseIndirectObject(nullptr, pRef->GetRefObjNum()); pID = pNewObj.get(); m_pTrailer->SetFor("ID", std::move(pNewObj)); return ToArray(pID); } uint32_t CPDF_Parser::GetRootObjNum() { CPDF_Reference* pRef = ToReference(m_pTrailer ? m_pTrailer->GetObjectFor("Root") : nullptr); return pRef ? pRef->GetRefObjNum() : 0; } uint32_t CPDF_Parser::GetInfoObjNum() { CPDF_Reference* pRef = ToReference(m_pTrailer ? m_pTrailer->GetObjectFor("Info") : nullptr); return pRef ? pRef->GetRefObjNum() : 0; } std::unique_ptr CPDF_Parser::ParseIndirectObject( CPDF_IndirectObjectHolder* pObjList, uint32_t objnum) { if (!IsValidObjectNumber(objnum)) return nullptr; // Prevent circular parsing the same object. if (pdfium::ContainsKey(m_ParsingObjNums, objnum)) return nullptr; pdfium::ScopedSetInsertion local_insert(&m_ParsingObjNums, objnum); if (GetObjectType(objnum) == 1 || GetObjectType(objnum) == 255) { FX_FILESIZE pos = m_ObjectInfo[objnum].pos; if (pos <= 0) return nullptr; return ParseIndirectObjectAt(pObjList, pos, objnum); } if (GetObjectType(objnum) != 2) return nullptr; CFX_RetainPtr pObjStream = GetObjectStream(m_ObjectInfo[objnum].pos); if (!pObjStream) return nullptr; CFX_RetainPtr file = IFX_MemoryStream::Create( (uint8_t*)pObjStream->GetData(), (size_t)pObjStream->GetSize(), false); CPDF_SyntaxParser syntax; syntax.InitParser(file, 0); const int32_t offset = GetStreamFirst(pObjStream); // Read object numbers from |pObjStream| into a cache. if (!pdfium::ContainsKey(m_ObjCache, pObjStream)) { for (int32_t i = GetStreamNCount(pObjStream); i > 0; --i) { uint32_t thisnum = syntax.GetDirectNum(); uint32_t thisoff = syntax.GetDirectNum(); m_ObjCache[pObjStream][thisnum] = thisoff; } } const auto it = m_ObjCache[pObjStream].find(objnum); if (it == m_ObjCache[pObjStream].end()) return nullptr; syntax.RestorePos(offset + it->second); return syntax.GetObject(pObjList, 0, 0, true); } CFX_RetainPtr CPDF_Parser::GetObjectStream(uint32_t objnum) { auto it = m_ObjectStreamMap.find(objnum); if (it != m_ObjectStreamMap.end()) return it->second; if (!m_pDocument) return nullptr; const CPDF_Stream* pStream = ToStream(m_pDocument->GetOrParseIndirectObject(objnum)); if (!pStream) return nullptr; auto pStreamAcc = pdfium::MakeRetain(pStream); pStreamAcc->LoadAllData(); m_ObjectStreamMap[objnum] = pStreamAcc; return pStreamAcc; } FX_FILESIZE CPDF_Parser::GetObjectSize(uint32_t objnum) const { if (!IsValidObjectNumber(objnum)) return 0; if (GetObjectType(objnum) == 2) objnum = GetObjectPositionOrZero(objnum); if (GetObjectType(objnum) != 1 && GetObjectType(objnum) != 255) return 0; FX_FILESIZE offset = GetObjectPositionOrZero(objnum); if (offset == 0) return 0; auto it = m_SortedOffset.find(offset); if (it == m_SortedOffset.end() || ++it == m_SortedOffset.end()) return 0; return *it - offset; } void CPDF_Parser::GetIndirectBinary(uint32_t objnum, uint8_t*& pBuffer, uint32_t& size) { pBuffer = nullptr; size = 0; if (!IsValidObjectNumber(objnum)) return; if (GetObjectType(objnum) == 2) { CFX_RetainPtr pObjStream = GetObjectStream(m_ObjectInfo[objnum].pos); if (!pObjStream) return; int32_t offset = GetStreamFirst(pObjStream); const uint8_t* pData = pObjStream->GetData(); uint32_t totalsize = pObjStream->GetSize(); CFX_RetainPtr file = IFX_MemoryStream::Create((uint8_t*)pData, (size_t)totalsize, false); CPDF_SyntaxParser syntax; syntax.InitParser(file, 0); for (int i = GetStreamNCount(pObjStream); i > 0; --i) { uint32_t thisnum = syntax.GetDirectNum(); uint32_t thisoff = syntax.GetDirectNum(); if (thisnum != objnum) continue; if (i == 1) { size = totalsize - (thisoff + offset); } else { syntax.GetDirectNum(); // Skip nextnum. uint32_t nextoff = syntax.GetDirectNum(); size = nextoff - thisoff; } pBuffer = FX_Alloc(uint8_t, size); memcpy(pBuffer, pData + thisoff + offset, size); return; } return; } if (GetObjectType(objnum) != 1) return; FX_FILESIZE pos = m_ObjectInfo[objnum].pos; if (pos == 0) return; FX_FILESIZE SavedPos = m_pSyntax->SavePos(); m_pSyntax->RestorePos(pos); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return; } uint32_t parser_objnum = FXSYS_atoui(word.c_str()); if (parser_objnum && parser_objnum != objnum) { m_pSyntax->RestorePos(SavedPos); return; } word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return; } if (m_pSyntax->GetKeyword() != "obj") { m_pSyntax->RestorePos(SavedPos); return; } auto it = m_SortedOffset.find(pos); if (it == m_SortedOffset.end() || ++it == m_SortedOffset.end()) { m_pSyntax->RestorePos(SavedPos); return; } FX_FILESIZE nextoff = *it; bool bNextOffValid = false; if (nextoff != pos) { m_pSyntax->RestorePos(nextoff); word = m_pSyntax->GetNextWord(&bIsNumber); if (word == "xref") { bNextOffValid = true; } else if (bIsNumber) { word = m_pSyntax->GetNextWord(&bIsNumber); if (bIsNumber && m_pSyntax->GetKeyword() == "obj") { bNextOffValid = true; } } } if (!bNextOffValid) { m_pSyntax->RestorePos(pos); while (1) { if (m_pSyntax->GetKeyword() == "endobj") break; if (m_pSyntax->SavePos() == m_pSyntax->m_FileLen) break; } nextoff = m_pSyntax->SavePos(); } size = (uint32_t)(nextoff - pos); pBuffer = FX_Alloc(uint8_t, size); m_pSyntax->RestorePos(pos); m_pSyntax->ReadBlock(pBuffer, size); m_pSyntax->RestorePos(SavedPos); } std::unique_ptr CPDF_Parser::ParseIndirectObjectAt( CPDF_IndirectObjectHolder* pObjList, FX_FILESIZE pos, uint32_t objnum) { FX_FILESIZE SavedPos = m_pSyntax->SavePos(); m_pSyntax->RestorePos(pos); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return nullptr; } FX_FILESIZE objOffset = m_pSyntax->SavePos(); objOffset -= word.GetLength(); uint32_t parser_objnum = FXSYS_atoui(word.c_str()); if (objnum && parser_objnum != objnum) { m_pSyntax->RestorePos(SavedPos); return nullptr; } word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return nullptr; } uint32_t parser_gennum = FXSYS_atoui(word.c_str()); if (m_pSyntax->GetKeyword() != "obj") { m_pSyntax->RestorePos(SavedPos); return nullptr; } std::unique_ptr pObj = m_pSyntax->GetObject(pObjList, objnum, parser_gennum, true); m_pSyntax->SavePos(); CFX_ByteString bsWord = m_pSyntax->GetKeyword(); if (bsWord == "endobj") m_pSyntax->SavePos(); m_pSyntax->RestorePos(SavedPos); if (pObj) { if (!objnum) pObj->m_ObjNum = parser_objnum; pObj->m_GenNum = parser_gennum; } return pObj; } std::unique_ptr CPDF_Parser::ParseIndirectObjectAtByStrict( CPDF_IndirectObjectHolder* pObjList, FX_FILESIZE pos, uint32_t objnum, FX_FILESIZE* pResultPos) { FX_FILESIZE SavedPos = m_pSyntax->SavePos(); m_pSyntax->RestorePos(pos); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return nullptr; } uint32_t parser_objnum = FXSYS_atoui(word.c_str()); if (objnum && parser_objnum != objnum) { m_pSyntax->RestorePos(SavedPos); return nullptr; } word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) { m_pSyntax->RestorePos(SavedPos); return nullptr; } uint32_t gennum = FXSYS_atoui(word.c_str()); if (m_pSyntax->GetKeyword() != "obj") { m_pSyntax->RestorePos(SavedPos); return nullptr; } std::unique_ptr pObj = m_pSyntax->GetObjectForStrict(pObjList, objnum, gennum); if (pResultPos) *pResultPos = m_pSyntax->m_Pos; m_pSyntax->RestorePos(SavedPos); return pObj; } uint32_t CPDF_Parser::GetFirstPageNo() const { return m_pLinearized ? m_pLinearized->GetFirstPageNo() : 0; } std::unique_ptr CPDF_Parser::LoadTrailerV4() { if (m_pSyntax->GetKeyword() != "trailer") return nullptr; return ToDictionary(m_pSyntax->GetObject(m_pDocument, 0, 0, true)); } uint32_t CPDF_Parser::GetPermissions() const { if (!m_pSecurityHandler) return 0xFFFFFFFF; uint32_t dwPermission = m_pSecurityHandler->GetPermissions(); if (m_pEncryptDict && m_pEncryptDict->GetStringFor("Filter") == "Standard") { // See PDF Reference 1.7, page 123, table 3.20. dwPermission &= 0xFFFFFFFC; dwPermission |= 0xFFFFF0C0; } return dwPermission; } bool CPDF_Parser::IsLinearizedFile( const CFX_RetainPtr& pFileAccess, uint32_t offset) { m_pSyntax->InitParser(pFileAccess, offset); m_pSyntax->RestorePos(m_pSyntax->m_HeaderOffset + 9); FX_FILESIZE SavedPos = m_pSyntax->SavePos(); bool bIsNumber; CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) return false; uint32_t objnum = FXSYS_atoui(word.c_str()); word = m_pSyntax->GetNextWord(&bIsNumber); if (!bIsNumber) return false; uint32_t gennum = FXSYS_atoui(word.c_str()); if (m_pSyntax->GetKeyword() != "obj") { m_pSyntax->RestorePos(SavedPos); return false; } m_pLinearized = CPDF_LinearizedHeader::CreateForObject( m_pSyntax->GetObject(nullptr, objnum, gennum, true)); if (!m_pLinearized) return false; m_LastXRefOffset = m_pLinearized->GetLastXRefOffset(); // Move parser onto first page xref table start. m_pSyntax->GetNextWord(nullptr); return true; } CPDF_Parser::Error CPDF_Parser::StartLinearizedParse( const CFX_RetainPtr& pFileAccess, CPDF_Document* pDocument) { ASSERT(!m_bHasParsed); m_bXRefStream = false; m_LastXRefOffset = 0; int32_t offset = GetHeaderOffset(pFileAccess); if (offset == -1) return FORMAT_ERROR; if (!IsLinearizedFile(pFileAccess, offset)) { m_pSyntax->m_pFileAccess = nullptr; return StartParse(pFileAccess, std::move(pDocument)); } m_bHasParsed = true; m_pDocument = pDocument; FX_FILESIZE dwFirstXRefOffset = m_pSyntax->SavePos(); bool bXRefRebuilt = false; bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, 0, false); if (!bLoadV4 && !LoadCrossRefV5(&dwFirstXRefOffset, true)) { if (!RebuildCrossRef()) return FORMAT_ERROR; bXRefRebuilt = true; m_LastXRefOffset = 0; } if (bLoadV4) { m_pTrailer = LoadTrailerV4(); if (!m_pTrailer) return SUCCESS; int32_t xrefsize = GetDirectInteger(m_pTrailer.get(), "Size"); if (xrefsize > 0) ShrinkObjectMap(xrefsize); } Error eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; m_pDocument->LoadLinearizedDoc(m_pLinearized.get()); if (!m_pDocument->GetRoot() || m_pDocument->GetPageCount() == 0) { if (bXRefRebuilt) return FORMAT_ERROR; ReleaseEncryptHandler(); if (!RebuildCrossRef()) return FORMAT_ERROR; eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; m_pDocument->LoadLinearizedDoc(m_pLinearized.get()); if (!m_pDocument->GetRoot()) return FORMAT_ERROR; } if (GetRootObjNum() == 0) { ReleaseEncryptHandler(); if (!RebuildCrossRef() || GetRootObjNum() == 0) return FORMAT_ERROR; eRet = SetEncryptHandler(); if (eRet != SUCCESS) return eRet; } if (m_pSecurityHandler && m_pSecurityHandler->IsMetadataEncrypted()) { if (CPDF_Reference* pMetadata = ToReference(m_pDocument->GetRoot()->GetObjectFor("Metadata"))) m_pSyntax->m_MetadataObjnum = pMetadata->GetRefObjNum(); } return SUCCESS; } bool CPDF_Parser::LoadLinearizedAllCrossRefV5(FX_FILESIZE xrefpos) { if (!LoadCrossRefV5(&xrefpos, false)) return false; std::set seen_xrefpos; while (xrefpos) { seen_xrefpos.insert(xrefpos); if (!LoadCrossRefV5(&xrefpos, false)) return false; // Check for circular references. if (pdfium::ContainsKey(seen_xrefpos, xrefpos)) return false; } m_ObjectStreamMap.clear(); m_bXRefStream = true; return true; } CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() { uint32_t dwSaveMetadataObjnum = m_pSyntax->m_MetadataObjnum; m_pSyntax->m_MetadataObjnum = 0; m_pTrailer.reset(); m_pSyntax->RestorePos(m_LastXRefOffset - m_pSyntax->m_HeaderOffset); uint8_t ch = 0; uint32_t dwCount = 0; m_pSyntax->GetNextChar(ch); while (PDFCharIsWhitespace(ch)) { ++dwCount; if (m_pSyntax->m_FileLen <= (FX_FILESIZE)(m_pSyntax->SavePos() + m_pSyntax->m_HeaderOffset)) { break; } m_pSyntax->GetNextChar(ch); } m_LastXRefOffset += dwCount; m_ObjectStreamMap.clear(); m_ObjCache.clear(); if (!LoadLinearizedAllCrossRefV4(m_LastXRefOffset, m_dwXrefStartObjNum) && !LoadLinearizedAllCrossRefV5(m_LastXRefOffset)) { m_LastXRefOffset = 0; m_pSyntax->m_MetadataObjnum = dwSaveMetadataObjnum; return FORMAT_ERROR; } m_pSyntax->m_MetadataObjnum = dwSaveMetadataObjnum; return SUCCESS; }