From c68109a2dac3be544b7753d1fd677255d859745b Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Fri, 20 Jul 2018 12:03:16 +0000 Subject: Rework of CPDF_Parser::RebuildCrossRef. Use CPDF_SyntaxParser logic to rebuild crossref. Change-Id: I394f64e76294b97c6a7c2b8984a880712fd193a7 Reviewed-on: https://pdfium-review.googlesource.com/37314 Reviewed-by: Lei Zhang Commit-Queue: Art Snake --- core/fpdfapi/parser/cpdf_parser.cpp | 303 +++++------------------------------- core/fpdfapi/parser/cpdf_parser.h | 17 -- 2 files changed, 42 insertions(+), 278 deletions(-) (limited to 'core') diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp index ecc0546de0..d4ad0656e6 100644 --- a/core/fpdfapi/parser/cpdf_parser.cpp +++ b/core/fpdfapi/parser/cpdf_parser.cpp @@ -617,276 +617,57 @@ bool CPDF_Parser::LoadAllCrossRefV5(FX_FILESIZE xrefpos) { bool CPDF_Parser::RebuildCrossRef() { auto cross_ref_table = pdfium::MakeUnique(); - ParserState state = ParserState::kDefault; - int32_t inside_index = 0; - uint32_t objnum = 0; - uint32_t gennum = 0; - int32_t depth = 0; const uint32_t kBufferSize = 4096; m_pSyntax->SetReadBufferSize(kBufferSize); - FX_FILESIZE start_pos = 0; - FX_FILESIZE start_pos1 = 0; - FX_FILESIZE last_obj = -1; - FX_FILESIZE last_xref = -1; - FX_FILESIZE last_trailer = -1; - - uint8_t byte = 0; m_pSyntax->SetPos(0); - { - while (m_pSyntax->GetNextChar(byte)) { - const FX_FILESIZE current_char_pos = m_pSyntax->GetPos() - 1; - switch (state) { - case ParserState::kDefault: - if (PDFCharIsWhitespace(byte)) { - state = ParserState::kWhitespace; - } else if (std::isdigit(byte)) { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kWhitespace; - } else if (byte == '%') { - inside_index = 0; - state = ParserState::kComment; - } else if (byte == '(') { - state = ParserState::kString; - depth = 1; - } else if (byte == '<') { - inside_index = 1; - state = ParserState::kHexString; - } else if (byte == '\\') { - state = ParserState::kEscapedString; - } else if (byte == 't') { - state = ParserState::kTrailer; - inside_index = 1; - } - break; - - case ParserState::kWhitespace: - if (std::isdigit(byte)) { - start_pos = current_char_pos; - state = ParserState::kObjNum; - objnum = FXSYS_DecimalCharToInt(static_cast(byte)); - } else if (byte == 't') { - state = ParserState::kTrailer; - inside_index = 1; - } else if (byte == 'x') { - state = ParserState::kXref; - inside_index = 1; - } else if (!PDFCharIsWhitespace(byte)) { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kObjNum: - if (std::isdigit(byte)) { - objnum = objnum * 10 + - FXSYS_DecimalCharToInt(static_cast(byte)); - } else if (PDFCharIsWhitespace(byte)) { - state = ParserState::kPostObjNum; - } else { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kEndObj; - inside_index = 0; - } - break; - - case ParserState::kPostObjNum: - if (std::isdigit(byte)) { - start_pos1 = current_char_pos; - state = ParserState::kGenNum; - gennum = FXSYS_DecimalCharToInt(static_cast(byte)); - } else if (byte == 't') { - state = ParserState::kTrailer; - inside_index = 1; - } else if (!PDFCharIsWhitespace(byte)) { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kGenNum: - if (std::isdigit(byte)) { - gennum = gennum * 10 + - FXSYS_DecimalCharToInt(static_cast(byte)); - } else if (PDFCharIsWhitespace(byte)) { - state = ParserState::kPostGenNum; - } else { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kPostGenNum: - if (byte == 'o') { - state = ParserState::kBeginObj; - inside_index = 1; - } else if (std::isdigit(byte)) { - objnum = gennum; - gennum = FXSYS_DecimalCharToInt(static_cast(byte)); - start_pos = start_pos1; - start_pos1 = current_char_pos; - state = ParserState::kGenNum; - } else if (byte == 't') { - state = ParserState::kTrailer; - inside_index = 1; - } else if (!PDFCharIsWhitespace(byte)) { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kBeginObj: - switch (inside_index) { - case 1: - if (byte != 'b') { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } else { - inside_index++; - } - break; - case 2: - if (byte != 'j') { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } else { - inside_index++; - } - break; - case 3: - if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { - const FX_FILESIZE obj_pos = start_pos; - last_obj = start_pos; - m_pSyntax->SetPos(obj_pos); - auto pObject = m_pSyntax->GetIndirectObject( - nullptr, CPDF_SyntaxParser::ParseType::kStrict); - m_pSyntax->SetPos( - std::max(current_char_pos, m_pSyntax->GetPos())); - - if (CPDF_Stream* pStream = ToStream(pObject.get())) { - if (CPDF_Dictionary* pDict = pStream->GetDict()) { - if ((pDict->KeyExist("Type")) && - (pDict->GetStringFor("Type") == "XRef" && - pDict->KeyExist("Size"))) { - CPDF_Object* pRoot = pDict->GetObjectFor("Root"); - if (pRoot && pRoot->GetDict() && - pRoot->GetDict()->GetObjectFor("Pages")) { - cross_ref_table->Update( - pdfium::MakeUnique( - ToDictionary(pDict->Clone()))); - } - } - } - } - if (objnum < kMaxObjectNumber) - cross_ref_table->AddNormal(objnum, gennum, obj_pos); - } - state = ParserState::kDefault; - break; - } - break; - - case ParserState::kTrailer: - if (inside_index == 7) { - if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { - last_trailer = current_char_pos - 7; - m_pSyntax->SetPos(current_char_pos); - std::unique_ptr pObj = - m_pSyntax->GetObjectBody(nullptr); - if (!pObj) - m_pSyntax->SetPos(current_char_pos); - - if (pObj) { - cross_ref_table->Update(pdfium::MakeUnique( - ToDictionary(pObj->IsStream() - ? pObj->AsStream()->GetDict()->Clone() - : std::move(pObj)))); - - FX_FILESIZE dwSavePos = m_pSyntax->GetPos(); - ByteString strWord = m_pSyntax->GetKeyword(); - if (!strWord.Compare("startxref")) { - bool bNumber; - ByteString bsOffset = m_pSyntax->GetNextWord(&bNumber); - if (bNumber) - last_xref = FXSYS_atoi(bsOffset.c_str()); - } else { - m_pSyntax->SetPos(dwSavePos); - } - } - } - state = ParserState::kDefault; - } else if (byte == "trailer"[inside_index]) { - inside_index++; - } else { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kXref: - if (inside_index == 4) { - last_xref = current_char_pos - 4; - state = ParserState::kWhitespace; - } else if (byte == "xref"[inside_index]) { - inside_index++; - } else { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kComment: - if (PDFCharIsLineEnding(byte)) - state = ParserState::kDefault; - break; - - case ParserState::kString: - if (byte == ')') { - if (depth > 0) - depth--; - } else if (byte == '(') { - depth++; - } - - if (!depth) - state = ParserState::kDefault; - break; - case ParserState::kHexString: - if (byte == '>' || (byte == '<' && inside_index == 1)) - state = ParserState::kDefault; - inside_index = 0; - break; + bool bIsNumber; + std::vector> numbers; + for (ByteString word = m_pSyntax->GetNextWord(&bIsNumber); !word.IsEmpty(); + word = m_pSyntax->GetNextWord(&bIsNumber)) { + if (bIsNumber) { + numbers.emplace_back(FXSYS_atoui(word.c_str()), + m_pSyntax->GetPos() - word.GetLength()); + if (numbers.size() > 2u) + numbers.erase(numbers.begin()); + continue; + } - case ParserState::kEscapedString: - if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) { - m_pSyntax->SetPos(current_char_pos); - state = ParserState::kDefault; - } - break; - - case ParserState::kEndObj: - if (PDFCharIsWhitespace(byte)) { - state = ParserState::kDefault; - } else if (byte == '%' || byte == '(' || byte == '<' || - byte == '\\') { - state = ParserState::kDefault; - m_pSyntax->SetPos(current_char_pos); - } else if (inside_index == 6) { - state = ParserState::kDefault; - m_pSyntax->SetPos(current_char_pos); - } else if (byte == "endobj"[inside_index]) { - inside_index++; - } - break; + if (word == "(") { + m_pSyntax->ReadString(); + } else if (word == "<") { + m_pSyntax->ReadHexString(); + } else if (word == "trailer") { + std::unique_ptr pTrailer = m_pSyntax->GetObjectBody(nullptr); + if (pTrailer) { + cross_ref_table = CPDF_CrossRefTable::MergeUp( + std::move(cross_ref_table), + pdfium::MakeUnique(ToDictionary( + pTrailer->IsStream() ? pTrailer->AsStream()->GetDict()->Clone() + : std::move(pTrailer)))); + } + } else if (word == "obj" && numbers.size() == 2u) { + const FX_FILESIZE obj_pos = numbers[0].second; + const uint32_t obj_num = numbers[0].first; + const uint32_t gen_num = numbers[1].first; + if (obj_num < kMaxObjectNumber) + cross_ref_table->AddNormal(obj_num, gen_num, obj_pos); + + m_pSyntax->SetPos(obj_pos); + const std::unique_ptr pStream = + ToStream(m_pSyntax->GetIndirectObject( + nullptr, CPDF_SyntaxParser::ParseType::kStrict)); + + if (pStream && pStream->GetDict()->GetStringFor("Type") == "XRef") { + cross_ref_table = CPDF_CrossRefTable::MergeUp( + std::move(cross_ref_table), + pdfium::MakeUnique( + ToDictionary(pStream->GetDict()->Clone()))); } } + numbers.clear(); } - if (last_xref != -1 && last_xref > last_obj) - last_trailer = last_xref; - else if (last_trailer == -1 || last_xref < last_obj) - last_trailer = m_pSyntax->m_FileLen; - m_CrossRefTable = CPDF_CrossRefTable::MergeUp(std::move(m_CrossRefTable), std::move(cross_ref_table)); // Resore default buffer size. diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h index fc4e49910d..92aaf7df6a 100644 --- a/core/fpdfapi/parser/cpdf_parser.h +++ b/core/fpdfapi/parser/cpdf_parser.h @@ -123,23 +123,6 @@ class CPDF_Parser { private: friend class CPDF_DataAvail; - enum class ParserState { - kDefault, - kComment, - kWhitespace, - kString, - kHexString, - kEscapedString, - kXref, - kObjNum, - kPostObjNum, - kGenNum, - kPostGenNum, - kTrailer, - kBeginObj, - kEndObj - }; - struct CrossRefObjData { uint32_t obj_num = 0; ObjectInfo info; -- cgit v1.2.3