From c205b6da9307232594bcb3f30c89306c9b1362a8 Mon Sep 17 00:00:00 2001 From: Artem Strygin Date: Tue, 3 Jul 2018 18:18:34 +0000 Subject: Do data request for CPDF_Stream more smoothly. For DocumentLoader we should do reconnect to skip non-requested blocks on each requested offset jump. To reduce reconnections, read stream data first, then do all checks. Thereby the DocumentLoader will continue loading data without reconnections. Change-Id: I344d045e59c5de9e1a4aed0002ea122caa92f240 Reviewed-on: https://pdfium-review.googlesource.com/13450 Commit-Queue: Art Snake Reviewed-by: Lei Zhang --- core/fpdfapi/parser/cpdf_syntax_parser.cpp | 175 +++++++++++++++-------------- core/fpdfapi/parser/cpdf_syntax_parser.h | 2 + 2 files changed, 92 insertions(+), 85 deletions(-) diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.cpp b/core/fpdfapi/parser/cpdf_syntax_parser.cpp index afdac58257..00eed49300 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.cpp +++ b/core/fpdfapi/parser/cpdf_syntax_parser.cpp @@ -518,6 +518,56 @@ unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) { return 0; } +FX_FILESIZE CPDF_SyntaxParser::FindWordPos(const ByteStringView& word) { + AutoRestorer pos_restorer(&m_Pos); + FX_FILESIZE end_offset = FindTag(word, 0); + while (end_offset >= 0) { + // Stop searching when word is found. + if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true)) + return GetPos() - word.GetLength(); + + end_offset = FindTag(word, 0); + } + return -1; +} + +FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() { + const ByteStringView kEndStreamStr("endstream"); + const ByteStringView kEndObjStr("endobj"); + + FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr); + FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr); + + // Can't find "endstream" or "endobj". + if (endStreamWordOffset < 0 && endObjWordOffset < 0) { + return -1; + } + + if (endStreamWordOffset < 0 && endObjWordOffset >= 0) { + // Correct the position of end stream. + endStreamWordOffset = endObjWordOffset; + } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) { + // Correct the position of end obj. + endObjWordOffset = endStreamWordOffset; + } else if (endStreamWordOffset > endObjWordOffset) { + endStreamWordOffset = endObjWordOffset; + } + + int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2); + if (numMarkers == 2) { + endStreamWordOffset -= 2; + } else { + numMarkers = ReadEOLMarkers(endStreamWordOffset - 1); + if (numMarkers == 1) { + endStreamWordOffset -= 1; + } + } + if (endStreamWordOffset < GetPos()) { + return -1; + } + return endStreamWordOffset; +} + std::unique_ptr CPDF_SyntaxParser::ReadStream( std::unique_ptr pDict) { const CPDF_Number* pLenObj = ToNumber(pDict->GetDirectObjectFor("Length")); @@ -525,113 +575,68 @@ std::unique_ptr CPDF_SyntaxParser::ReadStream( // Locate the start of stream. ToNextLine(); - FX_FILESIZE streamStartPos = GetPos(); + const FX_FILESIZE streamStartPos = GetPos(); + + std::unique_ptr pData; + if (len > 0) { + FX_SAFE_FILESIZE pos = GetPos(); + pos += len; + if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen) + len = -1; + } + + if (len > 0) { + pData.reset(FX_Alloc(uint8_t, len)); + // We should try read data first to allow the Validator to request data + // smoothly, without jumps. + if (!ReadBlock(pData.get(), len)) + return nullptr; + } const ByteStringView kEndStreamStr("endstream"); const ByteStringView kEndObjStr("endobj"); - bool bSearchForKeyword = true; + // Note, we allow zero length streams as we need to pass them through when we + // are importing pages into a new document. if (len >= 0) { - pdfium::base::CheckedNumeric pos = GetPos(); - pos += len; - if (pos.IsValid() && pos.ValueOrDie() < m_FileLen) - m_Pos = pos.ValueOrDie(); - + const CPDF_ReadValidator::Session read_session(GetValidator().Get()); m_Pos += ReadEOLMarkers(GetPos()); memset(m_WordBuffer, 0, kEndStreamStr.GetLength() + 1); GetNextWordInternal(nullptr); + if (GetValidator()->has_read_problems()) + return nullptr; + // Earlier version of PDF specification doesn't require EOL marker before // 'endstream' keyword. If keyword 'endstream' follows the bytes in // specified length, it signals the end of stream. if (memcmp(m_WordBuffer, kEndStreamStr.raw_str(), - kEndStreamStr.GetLength()) == 0) { - bSearchForKeyword = false; + kEndStreamStr.GetLength()) != 0) { + pData.reset(); + len = -1; + SetPos(streamStartPos); } } - if (bSearchForKeyword) { - // If len is not available, len needs to be calculated + if (len < 0) { + // If len is not available or incorrect, len needs to be calculated // by searching the keywords "endstream" or "endobj". - m_Pos = streamStartPos; - FX_FILESIZE endStreamOffset = 0; - while (endStreamOffset >= 0) { - endStreamOffset = FindTag(kEndStreamStr, 0); - - // Can't find "endstream". - if (endStreamOffset < 0) - break; - - // Stop searching when "endstream" is found. - if (IsWholeWord(GetPos() - kEndStreamStr.GetLength(), m_FileLen, - kEndStreamStr, true)) { - endStreamOffset = GetPos() - streamStartPos - kEndStreamStr.GetLength(); - break; - } - } - - m_Pos = streamStartPos; - FX_FILESIZE endObjOffset = 0; - while (endObjOffset >= 0) { - endObjOffset = FindTag(kEndObjStr, 0); - - // Can't find "endobj". - if (endObjOffset < 0) - break; - - // Stop searching when "endobj" is found. - if (IsWholeWord(GetPos() - kEndObjStr.GetLength(), m_FileLen, kEndObjStr, - true)) { - endObjOffset = GetPos() - streamStartPos - kEndObjStr.GetLength(); - break; - } - } - - // Can't find "endstream" or "endobj". - if (endStreamOffset < 0 && endObjOffset < 0) + const FX_FILESIZE streamEndPos = FindStreamEndPos(); + if (streamEndPos < 0) return nullptr; - if (endStreamOffset < 0 && endObjOffset >= 0) { - // Correct the position of end stream. - endStreamOffset = endObjOffset; - } else if (endStreamOffset >= 0 && endObjOffset < 0) { - // Correct the position of end obj. - endObjOffset = endStreamOffset; - } else if (endStreamOffset > endObjOffset) { - endStreamOffset = endObjOffset; - } - len = endStreamOffset; - - int numMarkers = ReadEOLMarkers(streamStartPos + endStreamOffset - 2); - if (numMarkers == 2) { - len -= 2; - } else { - numMarkers = ReadEOLMarkers(streamStartPos + endStreamOffset - 1); - if (numMarkers == 1) { - len -= 1; - } + len = streamEndPos - streamStartPos; + ASSERT(len >= 0); + if (len > 0) { + SetPos(streamStartPos); + pData.reset(FX_Alloc(uint8_t, len)); + if (!ReadBlock(pData.get(), len)) + return nullptr; } - if (len < 0) - return nullptr; - - pDict->SetNewFor("Length", static_cast(len)); } - m_Pos = streamStartPos; - - // Read up to the end of the buffer. Note, we allow zero length streams as - // we need to pass them through when we are importing pages into a new - // document. - len = std::min(len, m_FileLen - GetPos() - m_HeaderOffset); - if (len < 0) - return nullptr; - std::unique_ptr pData; - if (len > 0) { - pData.reset(FX_Alloc(uint8_t, len)); - ReadBlock(pData.get(), len); - } auto pStream = pdfium::MakeUnique(std::move(pData), len, std::move(pDict)); - streamStartPos = GetPos(); + const FX_FILESIZE end_stream_offset = GetPos(); memset(m_WordBuffer, 0, kEndObjStr.GetLength() + 1); GetNextWordInternal(nullptr); @@ -639,7 +644,7 @@ std::unique_ptr CPDF_SyntaxParser::ReadStream( if (m_WordSize == static_cast(kEndObjStr.GetLength()) && numMarkers != 0 && memcmp(m_WordBuffer, kEndObjStr.raw_str(), kEndObjStr.GetLength()) == 0) { - m_Pos = streamStartPos; + SetPos(end_stream_offset); } return pStream; } diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.h b/core/fpdfapi/parser/cpdf_syntax_parser.h index a8f6bb2033..a29d631c95 100644 --- a/core/fpdfapi/parser/cpdf_syntax_parser.h +++ b/core/fpdfapi/parser/cpdf_syntax_parser.h @@ -87,6 +87,8 @@ class CPDF_SyntaxParser { ByteString ReadString(); ByteString ReadHexString(); unsigned int ReadEOLMarkers(FX_FILESIZE pos); + FX_FILESIZE FindWordPos(const ByteStringView& word); + FX_FILESIZE FindStreamEndPos(); std::unique_ptr ReadStream( std::unique_ptr pDict); -- cgit v1.2.3