diff options
author | npm <npm@chromium.org> | 2016-11-07 08:42:11 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-11-07 08:42:11 -0800 |
commit | 014b012278b7438ef8d4b66730b8598c7eb4623a (patch) | |
tree | 75ea0ea37d0b239412133290d7b24cc975bfcc66 /core/fpdfapi/page/cpdf_streamparser.cpp | |
parent | 240dec52b2e6502e7deb27a3535af3b1a3e23428 (diff) | |
download | pdfium-014b012278b7438ef8d4b66730b8598c7eb4623a.tar.xz |
Clean up fpdf_page_parsers
- The code in fpdf_page_parser is only called by CPDF_StreamContentParser, so it was moved there.
- Split fpdf_page_parser_old into its two classes
- Renamed the corresponding unittests accordingly.
- Moved PDF_ReplaceAbbr into a namespace
- Fixed a few nits
- Added a TODO because CPDF_StreamParser has a lot of code similar to CPDF_SyntaxParser
Review-Url: https://codereview.chromium.org/2474303003
Diffstat (limited to 'core/fpdfapi/page/cpdf_streamparser.cpp')
-rw-r--r-- | core/fpdfapi/page/cpdf_streamparser.cpp | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/core/fpdfapi/page/cpdf_streamparser.cpp b/core/fpdfapi/page/cpdf_streamparser.cpp new file mode 100644 index 0000000000..9d36d0a38b --- /dev/null +++ b/core/fpdfapi/page/cpdf_streamparser.cpp @@ -0,0 +1,627 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fpdfapi/page/pageint.h" + +#include <limits.h> + +#include "core/fpdfapi/cpdf_modulemgr.h" +#include "core/fpdfapi/page/cpdf_docpagedata.h" +#include "core/fpdfapi/parser/cpdf_array.h" +#include "core/fpdfapi/parser/cpdf_boolean.h" +#include "core/fpdfapi/parser/cpdf_dictionary.h" +#include "core/fpdfapi/parser/cpdf_document.h" +#include "core/fpdfapi/parser/cpdf_name.h" +#include "core/fpdfapi/parser/cpdf_null.h" +#include "core/fpdfapi/parser/cpdf_number.h" +#include "core/fpdfapi/parser/cpdf_stream.h" +#include "core/fpdfapi/parser/cpdf_string.h" +#include "core/fpdfapi/parser/fpdf_parser_decode.h" +#include "core/fpdfapi/parser/fpdf_parser_utility.h" +#include "core/fxcodec/fx_codec.h" +#include "core/fxcrt/fx_ext.h" + +CCodec_ScanlineDecoder* FPDFAPI_CreateFaxDecoder( + const uint8_t* src_buf, + uint32_t src_size, + int width, + int height, + const CPDF_Dictionary* pParams); + +namespace { + +const uint32_t kMaxNestedArrayLevel = 512; +const uint32_t kMaxWordBuffer = 256; +const FX_STRSIZE kMaxStringLength = 32767; + +uint32_t DecodeAllScanlines(CCodec_ScanlineDecoder* pDecoder, + uint8_t*& dest_buf, + uint32_t& dest_size) { + if (!pDecoder) + return FX_INVALID_OFFSET; + int ncomps = pDecoder->CountComps(); + int bpc = pDecoder->GetBPC(); + int width = pDecoder->GetWidth(); + int height = pDecoder->GetHeight(); + int pitch = (width * ncomps * bpc + 7) / 8; + if (height == 0 || pitch > (1 << 30) / height) { + delete pDecoder; + return FX_INVALID_OFFSET; + } + dest_buf = 
FX_Alloc2D(uint8_t, pitch, height); + dest_size = pitch * height; // Safe since checked alloc returned. + for (int row = 0; row < height; row++) { + const uint8_t* pLine = pDecoder->GetScanline(row); + if (!pLine) + break; + + FXSYS_memcpy(dest_buf + row * pitch, pLine, pitch); + } + uint32_t srcoff = pDecoder->GetSrcOffset(); + delete pDecoder; + return srcoff; +} + +uint32_t PDF_DecodeInlineStream(const uint8_t* src_buf, + uint32_t limit, + int width, + int height, + CFX_ByteString& decoder, + CPDF_Dictionary* pParam, + uint8_t*& dest_buf, + uint32_t& dest_size) { + if (decoder == "CCITTFaxDecode" || decoder == "CCF") { + CCodec_ScanlineDecoder* pDecoder = + FPDFAPI_CreateFaxDecoder(src_buf, limit, width, height, pParam); + return DecodeAllScanlines(pDecoder, dest_buf, dest_size); + } + if (decoder == "ASCII85Decode" || decoder == "A85") + return A85Decode(src_buf, limit, dest_buf, dest_size); + if (decoder == "ASCIIHexDecode" || decoder == "AHx") + return HexDecode(src_buf, limit, dest_buf, dest_size); + if (decoder == "FlateDecode" || decoder == "Fl") { + return FPDFAPI_FlateOrLZWDecode(false, src_buf, limit, pParam, dest_size, + dest_buf, dest_size); + } + if (decoder == "LZWDecode" || decoder == "LZW") { + return FPDFAPI_FlateOrLZWDecode(true, src_buf, limit, pParam, 0, dest_buf, + dest_size); + } + if (decoder == "DCTDecode" || decoder == "DCT") { + CCodec_ScanlineDecoder* pDecoder = + CPDF_ModuleMgr::Get()->GetJpegModule()->CreateDecoder( + src_buf, limit, width, height, 0, + !pParam || pParam->GetIntegerFor("ColorTransform", 1)); + return DecodeAllScanlines(pDecoder, dest_buf, dest_size); + } + if (decoder == "RunLengthDecode" || decoder == "RL") + return RunLengthDecode(src_buf, limit, dest_buf, dest_size); + dest_size = 0; + dest_buf = 0; + return (uint32_t)-1; +} + +} // namespace + +CPDF_StreamParser::CPDF_StreamParser(const uint8_t* pData, uint32_t dwSize) + : m_pBuf(pData), + m_Size(dwSize), + m_Pos(0), + m_pLastObj(nullptr), + m_pPool(nullptr) {} + 
+CPDF_StreamParser::CPDF_StreamParser( + const uint8_t* pData, + uint32_t dwSize, + const CFX_WeakPtr<CFX_ByteStringPool>& pPool) + : m_pBuf(pData), + m_Size(dwSize), + m_Pos(0), + m_pLastObj(nullptr), + m_pPool(pPool) {} + +CPDF_StreamParser::~CPDF_StreamParser() { + delete m_pLastObj; +} + +CPDF_Stream* CPDF_StreamParser::ReadInlineStream(CPDF_Document* pDoc, + CPDF_Dictionary* pDict, + CPDF_Object* pCSObj) { + if (m_Pos == m_Size) + return nullptr; + + if (PDFCharIsWhitespace(m_pBuf[m_Pos])) + m_Pos++; + + CFX_ByteString Decoder; + CPDF_Dictionary* pParam = nullptr; + CPDF_Object* pFilter = pDict->GetDirectObjectFor("Filter"); + if (pFilter) { + if (CPDF_Array* pArray = pFilter->AsArray()) { + Decoder = pArray->GetStringAt(0); + CPDF_Array* pParams = pDict->GetArrayFor("DecodeParms"); + if (pParams) + pParam = pParams->GetDictAt(0); + } else { + Decoder = pFilter->GetString(); + pParam = pDict->GetDictFor("DecodeParms"); + } + } + uint32_t width = pDict->GetIntegerFor("Width"); + uint32_t height = pDict->GetIntegerFor("Height"); + uint32_t OrigSize = 0; + if (pCSObj) { + uint32_t bpc = pDict->GetIntegerFor("BitsPerComponent"); + uint32_t nComponents = 1; + CPDF_ColorSpace* pCS = pDoc->LoadColorSpace(pCSObj); + if (pCS) { + nComponents = pCS->CountComponents(); + pDoc->GetPageData()->ReleaseColorSpace(pCSObj); + } else { + nComponents = 3; + } + uint32_t pitch = width; + if (bpc && pitch > INT_MAX / bpc) + return nullptr; + + pitch *= bpc; + if (nComponents && pitch > INT_MAX / nComponents) + return nullptr; + + pitch *= nComponents; + if (pitch > INT_MAX - 7) + return nullptr; + + pitch += 7; + pitch /= 8; + OrigSize = pitch; + } else { + if (width > INT_MAX - 7) + return nullptr; + + OrigSize = ((width + 7) / 8); + } + if (height && OrigSize > INT_MAX / height) + return nullptr; + + OrigSize *= height; + uint8_t* pData = nullptr; + uint32_t dwStreamSize; + if (Decoder.IsEmpty()) { + if (OrigSize > m_Size - m_Pos) + OrigSize = m_Size - m_Pos; + pData = 
FX_Alloc(uint8_t, OrigSize); + FXSYS_memcpy(pData, m_pBuf + m_Pos, OrigSize); + dwStreamSize = OrigSize; + m_Pos += OrigSize; + } else { + uint32_t dwDestSize = OrigSize; + dwStreamSize = + PDF_DecodeInlineStream(m_pBuf + m_Pos, m_Size - m_Pos, width, height, + Decoder, pParam, pData, dwDestSize); + FX_Free(pData); + if (static_cast<int>(dwStreamSize) < 0) + return nullptr; + + uint32_t dwSavePos = m_Pos; + m_Pos += dwStreamSize; + while (1) { + uint32_t dwPrevPos = m_Pos; + CPDF_StreamParser::SyntaxType type = ParseNextElement(); + if (type == CPDF_StreamParser::EndOfData) + break; + + if (type != CPDF_StreamParser::Keyword) { + dwStreamSize += m_Pos - dwPrevPos; + continue; + } + if (GetWordSize() == 2 && GetWordBuf()[0] == 'E' && + GetWordBuf()[1] == 'I') { + m_Pos = dwPrevPos; + break; + } + dwStreamSize += m_Pos - dwPrevPos; + } + m_Pos = dwSavePos; + pData = FX_Alloc(uint8_t, dwStreamSize); + FXSYS_memcpy(pData, m_pBuf + m_Pos, dwStreamSize); + m_Pos += dwStreamSize; + } + pDict->SetIntegerFor("Length", (int)dwStreamSize); + return new CPDF_Stream(pData, dwStreamSize, pDict); +} + +CPDF_StreamParser::SyntaxType CPDF_StreamParser::ParseNextElement() { + delete m_pLastObj; + m_pLastObj = nullptr; + + m_WordSize = 0; + bool bIsNumber = true; + if (!PositionIsInBounds()) + return EndOfData; + + int ch = m_pBuf[m_Pos++]; + while (1) { + while (PDFCharIsWhitespace(ch)) { + if (!PositionIsInBounds()) + return EndOfData; + + ch = m_pBuf[m_Pos++]; + } + + if (ch != '%') + break; + + while (1) { + if (!PositionIsInBounds()) + return EndOfData; + + ch = m_pBuf[m_Pos++]; + if (PDFCharIsLineEnding(ch)) + break; + } + } + + if (PDFCharIsDelimiter(ch) && ch != '/') { + m_Pos--; + m_pLastObj = ReadNextObject(false, 0); + return Others; + } + + while (1) { + if (m_WordSize < kMaxWordBuffer) + m_WordBuffer[m_WordSize++] = ch; + + if (!PDFCharIsNumeric(ch)) + bIsNumber = false; + + if (!PositionIsInBounds()) + break; + + ch = m_pBuf[m_Pos++]; + + if (PDFCharIsDelimiter(ch) || 
PDFCharIsWhitespace(ch)) { + m_Pos--; + break; + } + } + + m_WordBuffer[m_WordSize] = 0; + if (bIsNumber) + return Number; + + if (m_WordBuffer[0] == '/') + return Name; + + if (m_WordSize == 4) { + if (memcmp(m_WordBuffer, "true", 4) == 0) { + m_pLastObj = new CPDF_Boolean(true); + return Others; + } + if (memcmp(m_WordBuffer, "null", 4) == 0) { + m_pLastObj = new CPDF_Null; + return Others; + } + } else if (m_WordSize == 5) { + if (memcmp(m_WordBuffer, "false", 5) == 0) { + m_pLastObj = new CPDF_Boolean(false); + return Others; + } + } + return Keyword; +} + +CPDF_Object* CPDF_StreamParser::GetObject() { + CPDF_Object* pObj = m_pLastObj; + m_pLastObj = nullptr; + return pObj; +} + +CPDF_Object* CPDF_StreamParser::ReadNextObject(bool bAllowNestedArray, + uint32_t dwInArrayLevel) { + bool bIsNumber; + GetNextWord(bIsNumber); + if (!m_WordSize) + return nullptr; + + if (bIsNumber) { + m_WordBuffer[m_WordSize] = 0; + return new CPDF_Number(CFX_ByteStringC(m_WordBuffer, m_WordSize)); + } + + int first_char = m_WordBuffer[0]; + if (first_char == '/') { + CFX_ByteString name = + PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1)); + return new CPDF_Name(m_pPool ? m_pPool->Intern(name) : name); + } + + if (first_char == '(') { + CFX_ByteString str = ReadString(); + return new CPDF_String(m_pPool ? 
m_pPool->Intern(str) : str, false); + } + + if (first_char == '<') { + if (m_WordSize == 1) + return new CPDF_String(ReadHexString(), true); + + CPDF_Dictionary* pDict = new CPDF_Dictionary(m_pPool); + while (1) { + GetNextWord(bIsNumber); + if (m_WordSize == 2 && m_WordBuffer[0] == '>') + break; + + if (!m_WordSize || m_WordBuffer[0] != '/') { + delete pDict; + return nullptr; + } + + CFX_ByteString key = + PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1)); + CPDF_Object* pObj = ReadNextObject(true, 0); + if (!pObj) { + delete pDict; + return nullptr; + } + + if (key.IsEmpty()) + delete pObj; + else + pDict->SetFor(key, pObj); + } + return pDict; + } + + if (first_char == '[') { + if ((!bAllowNestedArray && dwInArrayLevel) || + dwInArrayLevel > kMaxNestedArrayLevel) { + return nullptr; + } + + CPDF_Array* pArray = new CPDF_Array; + while (1) { + CPDF_Object* pObj = ReadNextObject(bAllowNestedArray, dwInArrayLevel + 1); + if (pObj) { + pArray->Add(pObj); + continue; + } + + if (!m_WordSize || m_WordBuffer[0] == ']') + break; + } + return pArray; + } + + if (m_WordSize == 5 && !memcmp(m_WordBuffer, "false", 5)) + return new CPDF_Boolean(false); + + if (m_WordSize == 4) { + if (memcmp(m_WordBuffer, "true", 4) == 0) + return new CPDF_Boolean(true); + + if (memcmp(m_WordBuffer, "null", 4) == 0) + return new CPDF_Null; + } + + return nullptr; +} + +// TODO(npm): the following methods are almost identical in cpdf_syntaxparser +void CPDF_StreamParser::GetNextWord(bool& bIsNumber) { + m_WordSize = 0; + bIsNumber = true; + if (!PositionIsInBounds()) + return; + + int ch = m_pBuf[m_Pos++]; + while (1) { + while (PDFCharIsWhitespace(ch)) { + if (!PositionIsInBounds()) { + return; + } + ch = m_pBuf[m_Pos++]; + } + + if (ch != '%') + break; + + while (1) { + if (!PositionIsInBounds()) + return; + ch = m_pBuf[m_Pos++]; + if (PDFCharIsLineEnding(ch)) + break; + } + } + + if (PDFCharIsDelimiter(ch)) { + bIsNumber = false; + m_WordBuffer[m_WordSize++] = ch; + if (ch 
== '/') { + while (1) { + if (!PositionIsInBounds()) + return; + ch = m_pBuf[m_Pos++]; + if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) { + m_Pos--; + return; + } + + if (m_WordSize < kMaxWordBuffer) + m_WordBuffer[m_WordSize++] = ch; + } + } else if (ch == '<') { + if (!PositionIsInBounds()) + return; + ch = m_pBuf[m_Pos++]; + if (ch == '<') + m_WordBuffer[m_WordSize++] = ch; + else + m_Pos--; + } else if (ch == '>') { + if (!PositionIsInBounds()) + return; + ch = m_pBuf[m_Pos++]; + if (ch == '>') + m_WordBuffer[m_WordSize++] = ch; + else + m_Pos--; + } + return; + } + + while (1) { + if (m_WordSize < kMaxWordBuffer) + m_WordBuffer[m_WordSize++] = ch; + if (!PDFCharIsNumeric(ch)) + bIsNumber = false; + + if (!PositionIsInBounds()) + return; + ch = m_pBuf[m_Pos++]; + if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) { + m_Pos--; + break; + } + } +} + +CFX_ByteString CPDF_StreamParser::ReadString() { + if (!PositionIsInBounds()) + return CFX_ByteString(); + + uint8_t ch = m_pBuf[m_Pos++]; + CFX_ByteTextBuf buf; + int parlevel = 0; + int status = 0; + int iEscCode = 0; + while (1) { + switch (status) { + case 0: + if (ch == ')') { + if (parlevel == 0) { + if (buf.GetLength() > kMaxStringLength) { + return CFX_ByteString(buf.GetBuffer(), kMaxStringLength); + } + return buf.MakeString(); + } + parlevel--; + buf.AppendChar(')'); + } else if (ch == '(') { + parlevel++; + buf.AppendChar('('); + } else if (ch == '\\') { + status = 1; + } else { + buf.AppendChar((char)ch); + } + break; + case 1: + if (ch >= '0' && ch <= '7') { + iEscCode = FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch)); + status = 2; + break; + } + if (ch == 'n') { + buf.AppendChar('\n'); + } else if (ch == 'r') { + buf.AppendChar('\r'); + } else if (ch == 't') { + buf.AppendChar('\t'); + } else if (ch == 'b') { + buf.AppendChar('\b'); + } else if (ch == 'f') { + buf.AppendChar('\f'); + } else if (ch == '\r') { + status = 4; + break; + } else if (ch == '\n') { + } else { + buf.AppendChar(ch); + } + 
status = 0; + break; + case 2: + if (ch >= '0' && ch <= '7') { + iEscCode = + iEscCode * 8 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch)); + status = 3; + } else { + buf.AppendChar(iEscCode); + status = 0; + continue; + } + break; + case 3: + if (ch >= '0' && ch <= '7') { + iEscCode = + iEscCode * 8 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch)); + buf.AppendChar(iEscCode); + status = 0; + } else { + buf.AppendChar(iEscCode); + status = 0; + continue; + } + break; + case 4: + status = 0; + if (ch != '\n') { + continue; + } + break; + } + if (!PositionIsInBounds()) + break; + + ch = m_pBuf[m_Pos++]; + } + if (PositionIsInBounds()) + ++m_Pos; + + if (buf.GetLength() > kMaxStringLength) { + return CFX_ByteString(buf.GetBuffer(), kMaxStringLength); + } + return buf.MakeString(); +} + +CFX_ByteString CPDF_StreamParser::ReadHexString() { + if (!PositionIsInBounds()) + return CFX_ByteString(); + + CFX_ByteTextBuf buf; + bool bFirst = true; + int code = 0; + while (PositionIsInBounds()) { + int ch = m_pBuf[m_Pos++]; + + if (ch == '>') + break; + + if (!std::isxdigit(ch)) + continue; + + int val = FXSYS_toHexDigit(ch); + if (bFirst) { + code = val * 16; + } else { + code += val; + buf.AppendByte((uint8_t)code); + } + bFirst = !bFirst; + } + if (!bFirst) + buf.AppendChar((char)code); + + if (buf.GetLength() > kMaxStringLength) + return CFX_ByteString(buf.GetBuffer(), kMaxStringLength); + + return buf.MakeString(); +} + +bool CPDF_StreamParser::PositionIsInBounds() const { + return m_Pos < m_Size; +} |