From 0145b89ac060870dd70f3d2f41f318a68721a086 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Tue, 26 Jun 2018 17:02:48 +0000 Subject: Simplify CPDF_Parser::RebuildCrossRef. Use CPDF_SyntaxParser to read data, instead manual cache. Simplify merging trailers logic. Change-Id: Icaa569f1082cde6dc9437a375c5c27e187fd6c79 Reviewed-on: https://pdfium-review.googlesource.com/35431 Commit-Queue: Art Snake Reviewed-by: dsinclair --- core/fpdfapi/parser/cpdf_parser.cpp | 153 +++++++++++------------------ core/fpdfapi/parser/cpdf_syntax_parser.cpp | 3 +- core/fpdfapi/parser/cpdf_syntax_parser.h | 8 +- 3 files changed, 66 insertions(+), 98 deletions(-) (limited to 'core/fpdfapi') diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index 599b04199d..854d6e7f60 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -679,31 +679,24 @@ bool CPDF_Parser::RebuildCrossRef() { uint32_t gennum = 0; int32_t depth = 0; const uint32_t kBufferSize = 4096; - std::vector buffer(kBufferSize); - - FX_FILESIZE pos = m_pSyntax->m_HeaderOffset; + m_pSyntax->SetReadBufferSize(kBufferSize); FX_FILESIZE start_pos = 0; FX_FILESIZE start_pos1 = 0; FX_FILESIZE last_obj = -1; FX_FILESIZE last_xref = -1; FX_FILESIZE last_trailer = -1; - while (pos < m_pSyntax->m_FileLen) { - const FX_FILESIZE saved_pos = pos; - bool bOverFlow = false; - uint32_t size = - std::min((uint32_t)(m_pSyntax->m_FileLen - pos), kBufferSize); - if (!m_pSyntax->GetFileAccess()->ReadBlock(buffer.data(), pos, size)) - break; - - for (uint32_t i = 0; i < size; i++) { - uint8_t byte = buffer[i]; + uint8_t byte = 0; + m_pSyntax->SetPos(0); + { + while (m_pSyntax->GetNextChar(byte)) { + const FX_FILESIZE current_char_pos = m_pSyntax->GetPos() - 1; switch (state) { case ParserState::kDefault: if (PDFCharIsWhitespace(byte)) { state = ParserState::kWhitespace; } else if (std::isdigit(byte)) { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kWhitespace; } else if (byte == '%') { inside_index = 0; @@ -724,7 +717,7 @@ bool CPDF_Parser::RebuildCrossRef() { case ParserState::kWhitespace: if (std::isdigit(byte)) { - start_pos = pos + i; + start_pos = current_char_pos; state = ParserState::kObjNum; objnum = FXSYS_DecimalCharToInt(static_cast(byte)); } else if (byte == 't') { @@ -734,7 +727,7 @@ bool CPDF_Parser::RebuildCrossRef() { state = ParserState::kXref; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -746,7 +739,7 @@ bool CPDF_Parser::RebuildCrossRef() { } else if (PDFCharIsWhitespace(byte)) { state = ParserState::kPostObjNum; } else { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kEndObj; inside_index = 0; } @@ -754,14 +747,14 @@ bool CPDF_Parser::RebuildCrossRef() { case ParserState::kPostObjNum: if (std::isdigit(byte)) { - start_pos1 = pos + i; + start_pos1 = current_char_pos; state = ParserState::kGenNum; gennum = FXSYS_DecimalCharToInt(static_cast(byte)); } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -773,7 +766,7 @@ bool CPDF_Parser::RebuildCrossRef() { } else if (PDFCharIsWhitespace(byte)) { state = ParserState::kPostGenNum; } else { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -786,13 +779,13 @@ bool CPDF_Parser::RebuildCrossRef() { objnum = gennum; gennum = FXSYS_DecimalCharToInt(static_cast(byte)); start_pos = start_pos1; - start_pos1 = pos + i; + start_pos1 = current_char_pos; state = ParserState::kGenNum; } else if (byte == 't') { state = ParserState::kTrailer; inside_index = 1; } else if (!PDFCharIsWhitespace(byte)) { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -801,7 +794,7 @@ bool CPDF_Parser::RebuildCrossRef() { switch (inside_index) { case 1: if (byte != 'b') { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } else { inside_index++; @@ -809,7 +802,7 @@ bool CPDF_Parser::RebuildCrossRef() { break; case 2: if (byte != 'j') { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } else { inside_index++; @@ -817,12 +810,14 @@ bool CPDF_Parser::RebuildCrossRef() { break; case 3: if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { - FX_FILESIZE obj_pos = start_pos - m_pSyntax->m_HeaderOffset; + const FX_FILESIZE obj_pos = start_pos; last_obj = start_pos; - FX_FILESIZE obj_end = 0; - std::unique_ptr pObject = - ParseIndirectObjectAtByStrict(m_pDocument.Get(), obj_pos, - objnum, &obj_end); + m_pSyntax->SetPos(obj_pos); + auto pObject = m_pSyntax->GetIndirectObject( + nullptr, CPDF_SyntaxParser::ParseType::kStrict); + m_pSyntax->SetPos( + std::max(current_char_pos, m_pSyntax->GetPos())); + if (CPDF_Stream* pStream = ToStream(pObject.get())) { if (CPDF_Dictionary* pDict = pStream->GetDict()) { if ((pDict->KeyExist("Type")) && @@ -837,36 +832,18 @@ bool CPDF_Parser::RebuildCrossRef() { } } } - - FX_FILESIZE offset = 0; - m_pSyntax->SetPos(obj_pos); - offset = m_pSyntax->FindTag("obj", 0); - if (offset == -1) - offset = 0; - else - offset += 3; - - FX_FILESIZE nLen = obj_end - obj_pos - offset; - if ((uint32_t)nLen > size - i) { - pos = obj_end + m_pSyntax->m_HeaderOffset; - bOverFlow = true; - } else { - i += (uint32_t)nLen; - } - - if (!m_ObjectInfo.empty() && IsValidObjectNumber(objnum) && - m_ObjectInfo[objnum].pos) { - if (pObject) { - m_ObjectInfo[objnum].pos = obj_pos; - m_ObjectInfo[objnum].gennum = gennum; - } - } else { - m_ObjectInfo[objnum].pos = obj_pos; - m_ObjectInfo[objnum].type = ObjectType::kNotCompressed; - m_ObjectInfo[objnum].gennum = gennum; + ObjectInfo& info = m_ObjectInfo[objnum]; + if (pObject || !info.pos) { + info.pos = obj_pos; + info.type = ObjectType::kNotCompressed; + // The newer version of object should be located after old + // version. + // Do not worry about gennum in this case, because we read + // file + // in front order. + info.gennum = gennum; } } - --i; state = ParserState::kDefault; break; } @@ -875,35 +852,31 @@ bool CPDF_Parser::RebuildCrossRef() { case ParserState::kTrailer: if (inside_index == 7) { if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { - last_trailer = pos + i - 7; - m_pSyntax->SetPos(pos + i - m_pSyntax->m_HeaderOffset); - + last_trailer = current_char_pos - 7; + m_pSyntax->SetPos(current_char_pos); std::unique_ptr pObj = - m_pSyntax->GetObjectBody(m_pDocument.Get()); - if (pObj) { - if (pObj->IsDictionary() || pObj->AsStream()) { - CPDF_Stream* pStream = pObj->AsStream(); + m_pSyntax->GetObjectBody(nullptr); + if (!pObj) + m_pSyntax->SetPos(current_char_pos); + + { + if (pObj && (pObj->IsDictionary() || pObj->IsStream())) { if (CPDF_Dictionary* pTrailer = - pStream ? pStream->GetDict() : pObj->AsDictionary()) { + pObj->IsStream() ? pObj->AsStream()->GetDict() + : pObj->AsDictionary()) { if (GetTrailer()) { CPDF_Object* pRoot = pTrailer->GetObjectFor("Root"); CPDF_Reference* pRef = ToReference(pRoot); if (!pRoot || (pRef && IsValidObjectNumber(pRef->GetRefObjNum()) && m_ObjectInfo[pRef->GetRefObjNum()].pos != 0)) { - auto it = pTrailer->begin(); - while (it != pTrailer->end()) { - const ByteString& key = it->first; - CPDF_Object* pElement = it->second.get(); + // This is newer version of trailer. Merge it with old. + for (auto it = pTrailer->begin(); + it != pTrailer->end();) { + DCHECK(it->second->IsInline()); + const ByteString key = it->first; ++it; - uint32_t dwObjNum = - pElement ? pElement->GetObjNum() : 0; - if (dwObjNum) { - GetTrailer()->SetNewFor( - key, m_pDocument.Get(), dwObjNum); - } else { - GetTrailer()->SetFor(key, pElement->Clone()); - } + GetTrailer()->SetFor(key, pTrailer->RemoveFor(key)); } } } else { @@ -925,24 +898,23 @@ bool CPDF_Parser::RebuildCrossRef() { } } } - --i; state = ParserState::kDefault; } else if (byte == "trailer"[inside_index]) { inside_index++; } else { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; case ParserState::kXref: if (inside_index == 4) { - last_xref = pos + i - 4; + last_xref = current_char_pos - 4; state = ParserState::kWhitespace; } else if (byte == "xref"[inside_index]) { inside_index++; } else { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -972,7 +944,7 @@ bool CPDF_Parser::RebuildCrossRef() { case ParserState::kEscapedString: if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) { - --i; + m_pSyntax->SetPos(current_char_pos); state = ParserState::kDefault; } break; @@ -983,27 +955,16 @@ bool CPDF_Parser::RebuildCrossRef() { } else if (byte == '%' || byte == '(' || byte == '<' || byte == '\\') { state = ParserState::kDefault; - --i; + m_pSyntax->SetPos(current_char_pos); } else if (inside_index == 6) { state = ParserState::kDefault; - --i; + m_pSyntax->SetPos(current_char_pos); } else if (byte == "endobj"[inside_index]) { inside_index++; } break; } - - if (bOverFlow) { - size = 0; - break; - } } - pos += size; - - // If the position has not changed at all or went backwards in a loop - // iteration, then break out to prevent infinite looping. - if (pos <= saved_pos) - break; } if (last_xref != -1 && last_xref > last_obj) @@ -1011,6 +972,8 @@ bool CPDF_Parser::RebuildCrossRef() { else if (last_trailer == -1 || last_xref < last_obj) last_trailer = m_pSyntax->m_FileLen; + // Resore default buffer size. + m_pSyntax->SetReadBufferSize(CPDF_ModuleMgr::kFileBufSize); return GetTrailer() && !m_ObjectInfo.empty(); } diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.cpp b/core/fpdfapi/parser/cpdf_syntax_parser.cpp index 245617dca1..3cce446aae 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.cpp +++ b/core/fpdfapi/parser/cpdf_syntax_parser.cpp @@ -11,7 +11,6 @@ #include #include -#include "core/fpdfapi/cpdf_modulemgr.h" #include "core/fpdfapi/parser/cpdf_array.h" #include "core/fpdfapi/parser/cpdf_boolean.h" #include "core/fpdfapi/parser/cpdf_crypto_handler.h" @@ -53,7 +52,7 @@ bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) { bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) { if (read_pos >= m_FileLen) return false; - size_t read_size = CPDF_ModuleMgr::kFileBufSize; + size_t read_size = m_ReadBufferSize; FX_SAFE_FILESIZE safe_end = read_pos; safe_end += read_size; if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen) diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.h b/core/fpdfapi/parser/cpdf_syntax_parser.h index ed760934b9..a8f6bb2033 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.h +++ b/core/fpdfapi/parser/cpdf_syntax_parser.h @@ -11,6 +11,7 @@ #include #include +#include "core/fpdfapi/cpdf_modulemgr.h" #include "core/fxcrt/string_pool_template.h" #include "core/fxcrt/weak_ptr.h" @@ -35,6 +36,10 @@ class CPDF_SyntaxParser { void InitParserWithValidator(const RetainPtr& pValidator, uint32_t HeaderOffset); + void SetReadBufferSize(uint32_t read_buffer_size) { + m_ReadBufferSize = read_buffer_size; + } + FX_FILESIZE GetPos() const { return m_Pos; } void SetPos(FX_FILESIZE pos) { m_Pos = std::min(pos, m_FileLen); } @@ -61,6 +66,7 @@ class CPDF_SyntaxParser { return m_pFileAccess; } uint32_t GetDirectNum(); + bool GetNextChar(uint8_t& ch); private: friend class CPDF_Parser; @@ -71,7 +77,6 @@ class CPDF_SyntaxParser { static int s_CurrentRecursionDepth; bool ReadBlockAt(FX_FILESIZE read_pos); - bool GetNextChar(uint8_t& ch); bool GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch); void GetNextWordInternal(bool* bIsNumber); bool IsWholeWord(FX_FILESIZE startpos, @@ -100,6 +105,7 @@ class CPDF_SyntaxParser { FX_FILESIZE m_BufOffset; uint32_t m_WordSize; uint8_t m_WordBuffer[257]; + uint32_t m_ReadBufferSize = CPDF_ModuleMgr::kFileBufSize; }; #endif // CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_ -- cgit v1.2.3