From 488b7ad845d6de212d89cd957303b294ecfa5922 Mon Sep 17 00:00:00 2001 From: dsinclair Date: Tue, 4 Oct 2016 11:55:50 -0700 Subject: Move core/fpdfapi/fpdf_parser to core/fpdfapi/parser BUG=pdfium:603 Review-Url: https://codereview.chromium.org/2392603004 --- core/fpdfapi/parser/cpdf_syntax_parser.cpp | 997 +++++++++++++++++++++++++++++ 1 file changed, 997 insertions(+) create mode 100644 core/fpdfapi/parser/cpdf_syntax_parser.cpp (limited to 'core/fpdfapi/parser/cpdf_syntax_parser.cpp') diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.cpp b/core/fpdfapi/parser/cpdf_syntax_parser.cpp new file mode 100644 index 0000000000..e7f25d15c2 --- /dev/null +++ b/core/fpdfapi/parser/cpdf_syntax_parser.cpp @@ -0,0 +1,997 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fpdfapi/parser/cpdf_syntax_parser.h" + +#include + +#include "core/fpdfapi/cpdf_modulemgr.h" +#include "core/fpdfapi/parser/cpdf_array.h" +#include "core/fpdfapi/parser/cpdf_boolean.h" +#include "core/fpdfapi/parser/cpdf_crypto_handler.h" +#include "core/fpdfapi/parser/cpdf_dictionary.h" +#include "core/fpdfapi/parser/cpdf_name.h" +#include "core/fpdfapi/parser/cpdf_null.h" +#include "core/fpdfapi/parser/cpdf_number.h" +#include "core/fpdfapi/parser/cpdf_reference.h" +#include "core/fpdfapi/parser/cpdf_stream.h" +#include "core/fpdfapi/parser/cpdf_string.h" +#include "core/fpdfapi/parser/fpdf_parser_decode.h" +#include "core/fpdfapi/parser/fpdf_parser_utility.h" +#include "core/fxcrt/fx_ext.h" +#include "third_party/base/numerics/safe_math.h" + +namespace { + +struct SearchTagRecord { + CFX_ByteStringC m_bsTag; + FX_STRSIZE m_Offset; +}; + +} // namespace + +// static +int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0; + +CPDF_SyntaxParser::CPDF_SyntaxParser() + : CPDF_SyntaxParser(CFX_WeakPtr()) {} + +CPDF_SyntaxParser::CPDF_SyntaxParser( + const CFX_WeakPtr& pPool) + : m_MetadataObjnum(0), + m_pFileAccess(nullptr), + m_pFileBuf(nullptr), + m_BufSize(CPDF_ModuleMgr::kFileBufSize), + m_pPool(pPool) {} + +CPDF_SyntaxParser::~CPDF_SyntaxParser() { + FX_Free(m_pFileBuf); +} + +FX_BOOL CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) { + CFX_AutoRestorer save_pos(&m_Pos); + m_Pos = pos; + return GetNextChar(ch); +} + +FX_BOOL CPDF_SyntaxParser::GetNextChar(uint8_t& ch) { + FX_FILESIZE pos = m_Pos + m_HeaderOffset; + if (pos >= m_FileLen) + return FALSE; + + if (m_BufOffset >= pos || (FX_FILESIZE)(m_BufOffset + m_BufSize) <= pos) { + FX_FILESIZE read_pos = pos; + uint32_t read_size = m_BufSize; + if ((FX_FILESIZE)read_size > m_FileLen) + read_size = (uint32_t)m_FileLen; + + if ((FX_FILESIZE)(read_pos + read_size) > m_FileLen) { + if (m_FileLen < (FX_FILESIZE)read_size) { + read_pos = 0; + read_size = (uint32_t)m_FileLen; + } else { + read_pos = m_FileLen - read_size; + } + } + + if (!m_pFileAccess->ReadBlock(m_pFileBuf, read_pos, read_size)) + return FALSE; + + m_BufOffset = read_pos; + } + ch = m_pFileBuf[pos - m_BufOffset]; + m_Pos++; + return TRUE; +} + +FX_BOOL CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t& ch) { + pos += m_HeaderOffset; + if (pos >= m_FileLen) + return FALSE; + + if (m_BufOffset >= pos || (FX_FILESIZE)(m_BufOffset + m_BufSize) <= pos) { + FX_FILESIZE read_pos; + if (pos < (FX_FILESIZE)m_BufSize) + read_pos = 0; + else + read_pos = pos - m_BufSize + 1; + + uint32_t read_size = m_BufSize; + if ((FX_FILESIZE)(read_pos + read_size) > m_FileLen) { + if (m_FileLen < (FX_FILESIZE)read_size) { + read_pos = 0; + read_size = (uint32_t)m_FileLen; + } else { + read_pos = m_FileLen - read_size; + } + } + + if (!m_pFileAccess->ReadBlock(m_pFileBuf, read_pos, read_size)) + return FALSE; + + m_BufOffset = read_pos; + } + ch = m_pFileBuf[pos - m_BufOffset]; + return TRUE; +} + +FX_BOOL CPDF_SyntaxParser::ReadBlock(uint8_t* pBuf, uint32_t size) { + if (!m_pFileAccess->ReadBlock(pBuf, m_Pos + m_HeaderOffset, size)) + return FALSE; + m_Pos += size; + return TRUE; +} + +void CPDF_SyntaxParser::GetNextWordInternal(bool* bIsNumber) { + m_WordSize = 0; + if (bIsNumber) + *bIsNumber = true; + + uint8_t ch; + if (!GetNextChar(ch)) + return; + + while (1) { + while (PDFCharIsWhitespace(ch)) { + if (!GetNextChar(ch)) + return; + } + + if (ch != '%') + break; + + while (1) { + if (!GetNextChar(ch)) + return; + if (PDFCharIsLineEnding(ch)) + break; + } + } + + if (PDFCharIsDelimiter(ch)) { + if (bIsNumber) + *bIsNumber = false; + + m_WordBuffer[m_WordSize++] = ch; + if (ch == '/') { + while (1) { + if (!GetNextChar(ch)) + return; + + if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) { + m_Pos--; + return; + } + + if (m_WordSize < sizeof(m_WordBuffer) - 1) + m_WordBuffer[m_WordSize++] = ch; + } + } else if (ch == '<') { + if (!GetNextChar(ch)) + return; + + if (ch == '<') + m_WordBuffer[m_WordSize++] = ch; + else + m_Pos--; + } else if (ch == '>') { + if (!GetNextChar(ch)) + return; + + if (ch == '>') + m_WordBuffer[m_WordSize++] = ch; + else + m_Pos--; + } + return; + } + + while (1) { + if (m_WordSize < sizeof(m_WordBuffer) - 1) + m_WordBuffer[m_WordSize++] = ch; + + if (!PDFCharIsNumeric(ch)) { + if (bIsNumber) + *bIsNumber = false; + } + + if (!GetNextChar(ch)) + return; + + if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) { + m_Pos--; + break; + } + } +} + +CFX_ByteString CPDF_SyntaxParser::ReadString() { + uint8_t ch; + if (!GetNextChar(ch)) + return CFX_ByteString(); + + CFX_ByteTextBuf buf; + int32_t parlevel = 0; + int32_t status = 0; + int32_t iEscCode = 0; + while (1) { + switch (status) { + case 0: + if (ch == ')') { + if (parlevel == 0) { + return buf.MakeString(); + } + parlevel--; + buf.AppendChar(')'); + } else if (ch == '(') { + parlevel++; + buf.AppendChar('('); + } else if (ch == '\\') { + status = 1; + } else { + buf.AppendChar(ch); + } + break; + case 1: + if (ch >= '0' && ch <= '7') { + iEscCode = FXSYS_toDecimalDigit(static_cast(ch)); + status = 2; + break; + } + + if (ch == 'n') { + buf.AppendChar('\n'); + } else if (ch == 'r') { + buf.AppendChar('\r'); + } else if (ch == 't') { + buf.AppendChar('\t'); + } else if (ch == 'b') { + buf.AppendChar('\b'); + } else if (ch == 'f') { + buf.AppendChar('\f'); + } else if (ch == '\r') { + status = 4; + break; + } else if (ch != '\n') { + buf.AppendChar(ch); + } + status = 0; + break; + case 2: + if (ch >= '0' && ch <= '7') { + iEscCode = + iEscCode * 8 + FXSYS_toDecimalDigit(static_cast(ch)); + status = 3; + } else { + buf.AppendChar(iEscCode); + status = 0; + continue; + } + break; + case 3: + if (ch >= '0' && ch <= '7') { + iEscCode = + iEscCode * 8 + FXSYS_toDecimalDigit(static_cast(ch)); + buf.AppendChar(iEscCode); + status = 0; + } else { + buf.AppendChar(iEscCode); + status = 0; + continue; + } + break; + case 4: + status = 0; + if (ch != '\n') + continue; + break; + } + + if (!GetNextChar(ch)) + break; + } + + GetNextChar(ch); + return buf.MakeString(); +} + +CFX_ByteString CPDF_SyntaxParser::ReadHexString() { + uint8_t ch; + if (!GetNextChar(ch)) + return CFX_ByteString(); + + CFX_ByteTextBuf buf; + bool bFirst = true; + uint8_t code = 0; + while (1) { + if (ch == '>') + break; + + if (std::isxdigit(ch)) { + int val = FXSYS_toHexDigit(ch); + if (bFirst) { + code = val * 16; + } else { + code += val; + buf.AppendByte(code); + } + bFirst = !bFirst; + } + + if (!GetNextChar(ch)) + break; + } + if (!bFirst) + buf.AppendByte(code); + + return buf.MakeString(); +} + +void CPDF_SyntaxParser::ToNextLine() { + uint8_t ch; + while (GetNextChar(ch)) { + if (ch == '\n') + break; + + if (ch == '\r') { + GetNextChar(ch); + if (ch != '\n') + --m_Pos; + break; + } + } +} + +void CPDF_SyntaxParser::ToNextWord() { + uint8_t ch; + if (!GetNextChar(ch)) + return; + + while (1) { + while (PDFCharIsWhitespace(ch)) { + if (!GetNextChar(ch)) + return; + } + + if (ch != '%') + break; + + while (1) { + if (!GetNextChar(ch)) + return; + if (PDFCharIsLineEnding(ch)) + break; + } + } + m_Pos--; +} + +CFX_ByteString CPDF_SyntaxParser::GetNextWord(bool* bIsNumber) { + GetNextWordInternal(bIsNumber); + return CFX_ByteString((const FX_CHAR*)m_WordBuffer, m_WordSize); +} + +CFX_ByteString CPDF_SyntaxParser::GetKeyword() { + return GetNextWord(nullptr); +} + +CPDF_Object* CPDF_SyntaxParser::GetObject(CPDF_IndirectObjectHolder* pObjList, + uint32_t objnum, + uint32_t gennum, + FX_BOOL bDecrypt) { + CFX_AutoRestorer restorer(&s_CurrentRecursionDepth); + if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth) + return nullptr; + + FX_FILESIZE SavedObjPos = m_Pos; + bool bIsNumber; + CFX_ByteString word = GetNextWord(&bIsNumber); + if (word.GetLength() == 0) + return nullptr; + + if (bIsNumber) { + FX_FILESIZE SavedPos = m_Pos; + CFX_ByteString nextword = GetNextWord(&bIsNumber); + if (bIsNumber) { + CFX_ByteString nextword2 = GetNextWord(nullptr); + if (nextword2 == "R") + return new CPDF_Reference(pObjList, FXSYS_atoui(word.c_str())); + } + m_Pos = SavedPos; + return new CPDF_Number(word.AsStringC()); + } + + if (word == "true" || word == "false") + return new CPDF_Boolean(word == "true"); + + if (word == "null") + return new CPDF_Null; + + if (word == "(") { + CFX_ByteString str = ReadString(); + if (m_pCryptoHandler && bDecrypt) + m_pCryptoHandler->Decrypt(objnum, gennum, str); + return new CPDF_String(MaybeIntern(str), FALSE); + } + + if (word == "<") { + CFX_ByteString str = ReadHexString(); + if (m_pCryptoHandler && bDecrypt) + m_pCryptoHandler->Decrypt(objnum, gennum, str); + return new CPDF_String(MaybeIntern(str), TRUE); + } + + if (word == "[") { + CPDF_Array* pArray = new CPDF_Array; + while (CPDF_Object* pObj = GetObject(pObjList, objnum, gennum, true)) + pArray->Add(pObj); + + return pArray; + } + + if (word[0] == '/') { + return new CPDF_Name(MaybeIntern( + PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1)))); + } + + if (word == "<<") { + int32_t nKeys = 0; + FX_FILESIZE dwSignValuePos = 0; + + std::unique_ptr> pDict( + new CPDF_Dictionary(m_pPool)); + while (1) { + CFX_ByteString key = GetNextWord(nullptr); + if (key.IsEmpty()) + return nullptr; + + FX_FILESIZE SavedPos = m_Pos - key.GetLength(); + if (key == ">>") + break; + + if (key == "endobj") { + m_Pos = SavedPos; + break; + } + + if (key[0] != '/') + continue; + + ++nKeys; + key = PDF_NameDecode(key); + if (key.IsEmpty()) + continue; + + if (key == "/Contents") + dwSignValuePos = m_Pos; + + CPDF_Object* pObj = GetObject(pObjList, objnum, gennum, true); + if (!pObj) + continue; + + CFX_ByteString keyNoSlash(key.raw_str() + 1, key.GetLength() - 1); + pDict->SetFor(keyNoSlash, pObj); + } + + // Only when this is a signature dictionary and has contents, we reset the + // contents to the un-decrypted form. + if (pDict->IsSignatureDict() && dwSignValuePos) { + CFX_AutoRestorer save_pos(&m_Pos); + m_Pos = dwSignValuePos; + pDict->SetFor("Contents", GetObject(pObjList, objnum, gennum, false)); + } + + FX_FILESIZE SavedPos = m_Pos; + CFX_ByteString nextword = GetNextWord(nullptr); + if (nextword != "stream") { + m_Pos = SavedPos; + return pDict.release(); + } + return ReadStream(pDict.release(), objnum, gennum); + } + + if (word == ">>") + m_Pos = SavedObjPos; + + return nullptr; +} + +CPDF_Object* CPDF_SyntaxParser::GetObjectForStrict( + CPDF_IndirectObjectHolder* pObjList, + uint32_t objnum, + uint32_t gennum) { + CFX_AutoRestorer restorer(&s_CurrentRecursionDepth); + if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth) + return nullptr; + + FX_FILESIZE SavedObjPos = m_Pos; + bool bIsNumber; + CFX_ByteString word = GetNextWord(&bIsNumber); + if (word.GetLength() == 0) + return nullptr; + + if (bIsNumber) { + FX_FILESIZE SavedPos = m_Pos; + CFX_ByteString nextword = GetNextWord(&bIsNumber); + if (bIsNumber) { + CFX_ByteString nextword2 = GetNextWord(nullptr); + if (nextword2 == "R") + return new CPDF_Reference(pObjList, FXSYS_atoui(word.c_str())); + } + m_Pos = SavedPos; + return new CPDF_Number(word.AsStringC()); + } + + if (word == "true" || word == "false") + return new CPDF_Boolean(word == "true"); + + if (word == "null") + return new CPDF_Null; + + if (word == "(") { + CFX_ByteString str = ReadString(); + if (m_pCryptoHandler) + m_pCryptoHandler->Decrypt(objnum, gennum, str); + return new CPDF_String(MaybeIntern(str), FALSE); + } + + if (word == "<") { + CFX_ByteString str = ReadHexString(); + if (m_pCryptoHandler) + m_pCryptoHandler->Decrypt(objnum, gennum, str); + return new CPDF_String(MaybeIntern(str), TRUE); + } + + if (word == "[") { + std::unique_ptr> pArray( + new CPDF_Array); + while (CPDF_Object* pObj = GetObject(pObjList, objnum, gennum, true)) + pArray->Add(pObj); + + return m_WordBuffer[0] == ']' ? pArray.release() : nullptr; + } + + if (word[0] == '/') { + return new CPDF_Name(MaybeIntern( + PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1)))); + } + + if (word == "<<") { + std::unique_ptr> pDict( + new CPDF_Dictionary(m_pPool)); + while (1) { + FX_FILESIZE SavedPos = m_Pos; + CFX_ByteString key = GetNextWord(nullptr); + if (key.IsEmpty()) + return nullptr; + + if (key == ">>") + break; + + if (key == "endobj") { + m_Pos = SavedPos; + break; + } + + if (key[0] != '/') + continue; + + key = PDF_NameDecode(key); + std::unique_ptr> obj( + GetObject(pObjList, objnum, gennum, true)); + if (!obj) { + uint8_t ch; + while (GetNextChar(ch) && ch != 0x0A && ch != 0x0D) { + continue; + } + return nullptr; + } + + if (key.GetLength() > 1) { + pDict->SetFor(CFX_ByteString(key.c_str() + 1, key.GetLength() - 1), + obj.release()); + } + } + + FX_FILESIZE SavedPos = m_Pos; + CFX_ByteString nextword = GetNextWord(nullptr); + if (nextword != "stream") { + m_Pos = SavedPos; + return pDict.release(); + } + + return ReadStream(pDict.release(), objnum, gennum); + } + + if (word == ">>") + m_Pos = SavedObjPos; + + return nullptr; +} + +unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) { + unsigned char byte1 = 0; + unsigned char byte2 = 0; + + GetCharAt(pos, byte1); + GetCharAt(pos + 1, byte2); + + if (byte1 == '\r' && byte2 == '\n') + return 2; + + if (byte1 == '\r' || byte1 == '\n') + return 1; + + return 0; +} + +CPDF_Stream* CPDF_SyntaxParser::ReadStream(CPDF_Dictionary* pDict, + uint32_t objnum, + uint32_t gennum) { + CPDF_Object* pLenObj = pDict->GetObjectFor("Length"); + FX_FILESIZE len = -1; + CPDF_Reference* pLenObjRef = ToReference(pLenObj); + + bool differingObjNum = !pLenObjRef || (pLenObjRef->GetObjList() && + pLenObjRef->GetRefObjNum() != objnum); + if (pLenObj && differingObjNum) + len = pLenObj->GetInteger(); + + // Locate the start of stream. + ToNextLine(); + FX_FILESIZE streamStartPos = m_Pos; + + const CFX_ByteStringC kEndStreamStr("endstream"); + const CFX_ByteStringC kEndObjStr("endobj"); + + CPDF_CryptoHandler* pCryptoHandler = + objnum == (uint32_t)m_MetadataObjnum ? nullptr : m_pCryptoHandler.get(); + if (!pCryptoHandler) { + FX_BOOL bSearchForKeyword = TRUE; + if (len >= 0) { + pdfium::base::CheckedNumeric pos = m_Pos; + pos += len; + if (pos.IsValid() && pos.ValueOrDie() < m_FileLen) + m_Pos = pos.ValueOrDie(); + + m_Pos += ReadEOLMarkers(m_Pos); + FXSYS_memset(m_WordBuffer, 0, kEndStreamStr.GetLength() + 1); + GetNextWordInternal(nullptr); + // Earlier version of PDF specification doesn't require EOL marker before + // 'endstream' keyword. If keyword 'endstream' follows the bytes in + // specified length, it signals the end of stream. + if (FXSYS_memcmp(m_WordBuffer, kEndStreamStr.raw_str(), + kEndStreamStr.GetLength()) == 0) { + bSearchForKeyword = FALSE; + } + } + + if (bSearchForKeyword) { + // If len is not available, len needs to be calculated + // by searching the keywords "endstream" or "endobj". + m_Pos = streamStartPos; + FX_FILESIZE endStreamOffset = 0; + while (endStreamOffset >= 0) { + endStreamOffset = FindTag(kEndStreamStr, 0); + + // Can't find "endstream". + if (endStreamOffset < 0) + break; + + // Stop searching when "endstream" is found. + if (IsWholeWord(m_Pos - kEndStreamStr.GetLength(), m_FileLen, + kEndStreamStr, TRUE)) { + endStreamOffset = m_Pos - streamStartPos - kEndStreamStr.GetLength(); + break; + } + } + + m_Pos = streamStartPos; + FX_FILESIZE endObjOffset = 0; + while (endObjOffset >= 0) { + endObjOffset = FindTag(kEndObjStr, 0); + + // Can't find "endobj". + if (endObjOffset < 0) + break; + + // Stop searching when "endobj" is found. + if (IsWholeWord(m_Pos - kEndObjStr.GetLength(), m_FileLen, kEndObjStr, + TRUE)) { + endObjOffset = m_Pos - streamStartPos - kEndObjStr.GetLength(); + break; + } + } + + // Can't find "endstream" or "endobj". + if (endStreamOffset < 0 && endObjOffset < 0) { + pDict->Release(); + return nullptr; + } + + if (endStreamOffset < 0 && endObjOffset >= 0) { + // Correct the position of end stream. + endStreamOffset = endObjOffset; + } else if (endStreamOffset >= 0 && endObjOffset < 0) { + // Correct the position of end obj. + endObjOffset = endStreamOffset; + } else if (endStreamOffset > endObjOffset) { + endStreamOffset = endObjOffset; + } + + len = endStreamOffset; + int numMarkers = ReadEOLMarkers(streamStartPos + endStreamOffset - 2); + if (numMarkers == 2) { + len -= 2; + } else { + numMarkers = ReadEOLMarkers(streamStartPos + endStreamOffset - 1); + if (numMarkers == 1) { + len -= 1; + } + } + + if (len < 0) { + pDict->Release(); + return nullptr; + } + pDict->SetIntegerFor("Length", len); + } + m_Pos = streamStartPos; + } + + if (len < 0) { + pDict->Release(); + return nullptr; + } + + uint8_t* pData = nullptr; + if (len > 0) { + pData = FX_Alloc(uint8_t, len); + ReadBlock(pData, len); + if (pCryptoHandler) { + CFX_BinaryBuf dest_buf; + dest_buf.EstimateSize(pCryptoHandler->DecryptGetSize(len)); + + void* context = pCryptoHandler->DecryptStart(objnum, gennum); + pCryptoHandler->DecryptStream(context, pData, len, dest_buf); + pCryptoHandler->DecryptFinish(context, dest_buf); + + FX_Free(pData); + pData = dest_buf.GetBuffer(); + len = dest_buf.GetSize(); + dest_buf.DetachBuffer(); + } + } + + CPDF_Stream* pStream = new CPDF_Stream(pData, len, pDict); + streamStartPos = m_Pos; + FXSYS_memset(m_WordBuffer, 0, kEndObjStr.GetLength() + 1); + + GetNextWordInternal(nullptr); + + int numMarkers = ReadEOLMarkers(m_Pos); + if (m_WordSize == static_cast(kEndObjStr.GetLength()) && + numMarkers != 0 && + FXSYS_memcmp(m_WordBuffer, kEndObjStr.raw_str(), + kEndObjStr.GetLength()) == 0) { + m_Pos = streamStartPos; + } + return pStream; +} + +void CPDF_SyntaxParser::InitParser(IFX_FileRead* pFileAccess, + uint32_t HeaderOffset) { + FX_Free(m_pFileBuf); + + m_pFileBuf = FX_Alloc(uint8_t, m_BufSize); + m_HeaderOffset = HeaderOffset; + m_FileLen = pFileAccess->GetSize(); + m_Pos = 0; + m_pFileAccess = pFileAccess; + m_BufOffset = 0; + pFileAccess->ReadBlock( + m_pFileBuf, 0, + (size_t)((FX_FILESIZE)m_BufSize > m_FileLen ? m_FileLen : m_BufSize)); +} + +uint32_t CPDF_SyntaxParser::GetDirectNum() { + bool bIsNumber; + GetNextWordInternal(&bIsNumber); + if (!bIsNumber) + return 0; + + m_WordBuffer[m_WordSize] = 0; + return FXSYS_atoui(reinterpret_cast(m_WordBuffer)); +} + +bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos, + FX_FILESIZE limit, + const CFX_ByteStringC& tag, + FX_BOOL checkKeyword) { + const uint32_t taglen = tag.GetLength(); + + bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]); + bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) && + !PDFCharIsWhitespace(tag[taglen - 1]); + + uint8_t ch; + if (bCheckRight && startpos + (int32_t)taglen <= limit && + GetCharAt(startpos + (int32_t)taglen, ch)) { + if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) || + (checkKeyword && PDFCharIsDelimiter(ch))) { + return false; + } + } + + if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) { + if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) || + (checkKeyword && PDFCharIsDelimiter(ch))) { + return false; + } + } + return true; +} + +// TODO(dsinclair): Split into a SearchWordForward and SearchWordBackwards +// and drop the bool. +FX_BOOL CPDF_SyntaxParser::SearchWord(const CFX_ByteStringC& tag, + FX_BOOL bWholeWord, + FX_BOOL bForward, + FX_FILESIZE limit) { + int32_t taglen = tag.GetLength(); + if (taglen == 0) + return FALSE; + + FX_FILESIZE pos = m_Pos; + int32_t offset = 0; + if (!bForward) + offset = taglen - 1; + + const uint8_t* tag_data = tag.raw_str(); + uint8_t byte; + while (1) { + if (bForward) { + if (limit && pos >= m_Pos + limit) + return FALSE; + + if (!GetCharAt(pos, byte)) + return FALSE; + + } else { + if (limit && pos <= m_Pos - limit) + return FALSE; + + if (!GetCharAtBackward(pos, byte)) + return FALSE; + } + + if (byte == tag_data[offset]) { + if (bForward) { + offset++; + if (offset < taglen) { + pos++; + continue; + } + } else { + offset--; + if (offset >= 0) { + pos--; + continue; + } + } + + FX_FILESIZE startpos = bForward ? pos - taglen + 1 : pos; + if (!bWholeWord || IsWholeWord(startpos, limit, tag, FALSE)) { + m_Pos = startpos; + return TRUE; + } + } + + if (bForward) { + offset = byte == tag_data[0] ? 1 : 0; + pos++; + } else { + offset = byte == tag_data[taglen - 1] ? taglen - 2 : taglen - 1; + pos--; + } + + if (pos < 0) + return FALSE; + } + + return FALSE; +} + +int32_t CPDF_SyntaxParser::SearchMultiWord(const CFX_ByteStringC& tags, + FX_BOOL bWholeWord, + FX_FILESIZE limit) { + int32_t ntags = 1; + for (int i = 0; i < tags.GetLength(); ++i) { + if (tags[i] == 0) + ++ntags; + } + + // Ensure that the input byte string happens to be nul-terminated. This + // need not be the case, but the loop below uses this guarantee to put + // the last pattern into the vector. + ASSERT(tags[tags.GetLength()] == 0); + std::vector patterns(ntags); + uint32_t start = 0; + uint32_t itag = 0; + uint32_t max_len = 0; + for (int i = 0; i <= tags.GetLength(); ++i) { + if (tags[i] == 0) { + uint32_t len = i - start; + max_len = std::max(len, max_len); + patterns[itag].m_bsTag = tags.Mid(start, len); + patterns[itag].m_Offset = 0; + start = i + 1; + ++itag; + } + } + + const FX_FILESIZE pos_limit = m_Pos + limit; + for (FX_FILESIZE pos = m_Pos; !limit || pos < pos_limit; ++pos) { + uint8_t byte; + if (!GetCharAt(pos, byte)) + break; + + for (int i = 0; i < ntags; ++i) { + SearchTagRecord& pat = patterns[i]; + if (pat.m_bsTag[pat.m_Offset] != byte) { + pat.m_Offset = (pat.m_bsTag[0] == byte) ? 1 : 0; + continue; + } + + ++pat.m_Offset; + if (pat.m_Offset != pat.m_bsTag.GetLength()) + continue; + + if (!bWholeWord || IsWholeWord(pos - pat.m_bsTag.GetLength(), limit, + pat.m_bsTag, FALSE)) { + return i; + } + + pat.m_Offset = (pat.m_bsTag[0] == byte) ? 1 : 0; + } + } + return -1; +} + +FX_FILESIZE CPDF_SyntaxParser::FindTag(const CFX_ByteStringC& tag, + FX_FILESIZE limit) { + int32_t taglen = tag.GetLength(); + int32_t match = 0; + limit += m_Pos; + FX_FILESIZE startpos = m_Pos; + + while (1) { + uint8_t ch; + if (!GetNextChar(ch)) + return -1; + + if (ch == tag[match]) { + match++; + if (match == taglen) + return m_Pos - startpos - taglen; + } else { + match = ch == tag[0] ? 1 : 0; + } + + if (limit && m_Pos == limit) + return -1; + } + return -1; +} + +void CPDF_SyntaxParser::SetEncrypt( + std::unique_ptr pCryptoHandler) { + m_pCryptoHandler = std::move(pCryptoHandler); +} + +CFX_ByteString CPDF_SyntaxParser::MaybeIntern(const CFX_ByteString& str) { + return m_pPool ? m_pPool->Intern(str) : str; +} -- cgit v1.2.3