Clean up fpdf_page_parsers

- The code in fpdf_page_parser is only called by CPDF_StreamContentParser, so moved there. - Split fpdf_page_parser_old into its two classes - Renamed the corresponding unittests accordingly. - Moved PDF_ReplaceAbbr to namespace - Fixed few nits - Added TODO because CPDF_StreamParser has a lot of code similar to CPDF_SyntaxParser Review-Url: https://codereview.chromium.org/2474303003
author: npm <npm@chromium.org> 2016-11-07 08:42:11 -0800
committer: Commit bot <commit-bot@chromium.org> 2016-11-07 08:42:11 -0800
commit: 014b012278b7438ef8d4b66730b8598c7eb4623a (patch)
tree: 75ea0ea37d0b239412133290d7b24cc975bfcc66 /core/fpdfapi/page/cpdf_streamparser.cpp
parent: 240dec52b2e6502e7deb27a3535af3b1a3e23428 (diff)
download: pdfium-014b012278b7438ef8d4b66730b8598c7eb4623a.tar.xz
1 files changed, 627 insertions, 0 deletions
diff --git a/core/fpdfapi/page/cpdf_streamparser.cpp b/core/fpdfapi/page/cpdf_streamparser.cpp
new file mode 100644
index 0000000000..9d36d0a38b
--- /dev/null
+++ b/core/fpdfapi/page/cpdf_streamparser.cpp
@@ -0,0 +1,627 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fpdfapi/page/pageint.h"
+
+#include <limits.h>
+
+#include "core/fpdfapi/cpdf_modulemgr.h"
+#include "core/fpdfapi/page/cpdf_docpagedata.h"
+#include "core/fpdfapi/parser/cpdf_array.h"
+#include "core/fpdfapi/parser/cpdf_boolean.h"
+#include "core/fpdfapi/parser/cpdf_dictionary.h"
+#include "core/fpdfapi/parser/cpdf_document.h"
+#include "core/fpdfapi/parser/cpdf_name.h"
+#include "core/fpdfapi/parser/cpdf_null.h"
+#include "core/fpdfapi/parser/cpdf_number.h"
+#include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdfapi/parser/cpdf_string.h"
+#include "core/fpdfapi/parser/fpdf_parser_decode.h"
+#include "core/fpdfapi/parser/fpdf_parser_utility.h"
+#include "core/fxcodec/fx_codec.h"
+#include "core/fxcrt/fx_ext.h"
+
+CCodec_ScanlineDecoder* FPDFAPI_CreateFaxDecoder(
+    const uint8_t* src_buf,
+    uint32_t src_size,
+    int width,
+    int height,
+    const CPDF_Dictionary* pParams);
+
+namespace {
+
+const uint32_t kMaxNestedArrayLevel = 512;
+const uint32_t kMaxWordBuffer = 256;
+const FX_STRSIZE kMaxStringLength = 32767;
+
+uint32_t DecodeAllScanlines(CCodec_ScanlineDecoder* pDecoder,
+                            uint8_t*& dest_buf,
+                            uint32_t& dest_size) {
+  if (!pDecoder)
+    return FX_INVALID_OFFSET;
+  int ncomps = pDecoder->CountComps();
+  int bpc = pDecoder->GetBPC();
+  int width = pDecoder->GetWidth();
+  int height = pDecoder->GetHeight();
+  int pitch = (width * ncomps * bpc + 7) / 8;
+  if (height == 0 || pitch > (1 << 30) / height) {
+    delete pDecoder;
+    return FX_INVALID_OFFSET;
+  }
+  dest_buf = FX_Alloc2D(uint8_t, pitch, height);
+  dest_size = pitch * height;  // Safe since checked alloc returned.
+  for (int row = 0; row < height; row++) {
+    const uint8_t* pLine = pDecoder->GetScanline(row);
+    if (!pLine)
+      break;
+
+    FXSYS_memcpy(dest_buf + row * pitch, pLine, pitch);
+  }
+  uint32_t srcoff = pDecoder->GetSrcOffset();
+  delete pDecoder;
+  return srcoff;
+}
+
+uint32_t PDF_DecodeInlineStream(const uint8_t* src_buf,
+                                uint32_t limit,
+                                int width,
+                                int height,
+                                CFX_ByteString& decoder,
+                                CPDF_Dictionary* pParam,
+                                uint8_t*& dest_buf,
+                                uint32_t& dest_size) {
+  if (decoder == "CCITTFaxDecode" || decoder == "CCF") {
+    CCodec_ScanlineDecoder* pDecoder =
+        FPDFAPI_CreateFaxDecoder(src_buf, limit, width, height, pParam);
+    return DecodeAllScanlines(pDecoder, dest_buf, dest_size);
+  }
+  if (decoder == "ASCII85Decode" || decoder == "A85")
+    return A85Decode(src_buf, limit, dest_buf, dest_size);
+  if (decoder == "ASCIIHexDecode" || decoder == "AHx")
+    return HexDecode(src_buf, limit, dest_buf, dest_size);
+  if (decoder == "FlateDecode" || decoder == "Fl") {
+    return FPDFAPI_FlateOrLZWDecode(false, src_buf, limit, pParam, dest_size,
+                                    dest_buf, dest_size);
+  }
+  if (decoder == "LZWDecode" || decoder == "LZW") {
+    return FPDFAPI_FlateOrLZWDecode(true, src_buf, limit, pParam, 0, dest_buf,
+                                    dest_size);
+  }
+  if (decoder == "DCTDecode" || decoder == "DCT") {
+    CCodec_ScanlineDecoder* pDecoder =
+        CPDF_ModuleMgr::Get()->GetJpegModule()->CreateDecoder(
+            src_buf, limit, width, height, 0,
+            !pParam || pParam->GetIntegerFor("ColorTransform", 1));
+    return DecodeAllScanlines(pDecoder, dest_buf, dest_size);
+  }
+  if (decoder == "RunLengthDecode" || decoder == "RL")
+    return RunLengthDecode(src_buf, limit, dest_buf, dest_size);
+  dest_size = 0;
+  dest_buf = 0;
+  return (uint32_t)-1;
+}
+
+}  // namespace
+
+CPDF_StreamParser::CPDF_StreamParser(const uint8_t* pData, uint32_t dwSize)
+    : m_pBuf(pData),
+      m_Size(dwSize),
+      m_Pos(0),
+      m_pLastObj(nullptr),
+      m_pPool(nullptr) {}
+
+CPDF_StreamParser::CPDF_StreamParser(
+    const uint8_t* pData,
+    uint32_t dwSize,
+    const CFX_WeakPtr<CFX_ByteStringPool>& pPool)
+    : m_pBuf(pData),
+      m_Size(dwSize),
+      m_Pos(0),
+      m_pLastObj(nullptr),
+      m_pPool(pPool) {}
+
+CPDF_StreamParser::~CPDF_StreamParser() {
+  delete m_pLastObj;
+}
+
+CPDF_Stream* CPDF_StreamParser::ReadInlineStream(CPDF_Document* pDoc,
+                                                 CPDF_Dictionary* pDict,
+                                                 CPDF_Object* pCSObj) {
+  if (m_Pos == m_Size)
+    return nullptr;
+
+  if (PDFCharIsWhitespace(m_pBuf[m_Pos]))
+    m_Pos++;
+
+  CFX_ByteString Decoder;
+  CPDF_Dictionary* pParam = nullptr;
+  CPDF_Object* pFilter = pDict->GetDirectObjectFor("Filter");
+  if (pFilter) {
+    if (CPDF_Array* pArray = pFilter->AsArray()) {
+      Decoder = pArray->GetStringAt(0);
+      CPDF_Array* pParams = pDict->GetArrayFor("DecodeParms");
+      if (pParams)
+        pParam = pParams->GetDictAt(0);
+    } else {
+      Decoder = pFilter->GetString();
+      pParam = pDict->GetDictFor("DecodeParms");
+    }
+  }
+  uint32_t width = pDict->GetIntegerFor("Width");
+  uint32_t height = pDict->GetIntegerFor("Height");
+  uint32_t OrigSize = 0;
+  if (pCSObj) {
+    uint32_t bpc = pDict->GetIntegerFor("BitsPerComponent");
+    uint32_t nComponents = 1;
+    CPDF_ColorSpace* pCS = pDoc->LoadColorSpace(pCSObj);
+    if (pCS) {
+      nComponents = pCS->CountComponents();
+      pDoc->GetPageData()->ReleaseColorSpace(pCSObj);
+    } else {
+      nComponents = 3;
+    }
+    uint32_t pitch = width;
+    if (bpc && pitch > INT_MAX / bpc)
+      return nullptr;
+
+    pitch *= bpc;
+    if (nComponents && pitch > INT_MAX / nComponents)
+      return nullptr;
+
+    pitch *= nComponents;
+    if (pitch > INT_MAX - 7)
+      return nullptr;
+
+    pitch += 7;
+    pitch /= 8;
+    OrigSize = pitch;
+  } else {
+    if (width > INT_MAX - 7)
+      return nullptr;
+
+    OrigSize = ((width + 7) / 8);
+  }
+  if (height && OrigSize > INT_MAX / height)
+    return nullptr;
+
+  OrigSize *= height;
+  uint8_t* pData = nullptr;
+  uint32_t dwStreamSize;
+  if (Decoder.IsEmpty()) {
+    if (OrigSize > m_Size - m_Pos)
+      OrigSize = m_Size - m_Pos;
+    pData = FX_Alloc(uint8_t, OrigSize);
+    FXSYS_memcpy(pData, m_pBuf + m_Pos, OrigSize);
+    dwStreamSize = OrigSize;
+    m_Pos += OrigSize;
+  } else {
+    uint32_t dwDestSize = OrigSize;
+    dwStreamSize =
+        PDF_DecodeInlineStream(m_pBuf + m_Pos, m_Size - m_Pos, width, height,
+                               Decoder, pParam, pData, dwDestSize);
+    FX_Free(pData);
+    if (static_cast<int>(dwStreamSize) < 0)
+      return nullptr;
+
+    uint32_t dwSavePos = m_Pos;
+    m_Pos += dwStreamSize;
+    while (1) {
+      uint32_t dwPrevPos = m_Pos;
+      CPDF_StreamParser::SyntaxType type = ParseNextElement();
+      if (type == CPDF_StreamParser::EndOfData)
+        break;
+
+      if (type != CPDF_StreamParser::Keyword) {
+        dwStreamSize += m_Pos - dwPrevPos;
+        continue;
+      }
+      if (GetWordSize() == 2 && GetWordBuf()[0] == 'E' &&
+          GetWordBuf()[1] == 'I') {
+        m_Pos = dwPrevPos;
+        break;
+      }
+      dwStreamSize += m_Pos - dwPrevPos;
+    }
+    m_Pos = dwSavePos;
+    pData = FX_Alloc(uint8_t, dwStreamSize);
+    FXSYS_memcpy(pData, m_pBuf + m_Pos, dwStreamSize);
+    m_Pos += dwStreamSize;
+  }
+  pDict->SetIntegerFor("Length", (int)dwStreamSize);
+  return new CPDF_Stream(pData, dwStreamSize, pDict);
+}
+
+CPDF_StreamParser::SyntaxType CPDF_StreamParser::ParseNextElement() {
+  delete m_pLastObj;
+  m_pLastObj = nullptr;
+
+  m_WordSize = 0;
+  bool bIsNumber = true;
+  if (!PositionIsInBounds())
+    return EndOfData;
+
+  int ch = m_pBuf[m_Pos++];
+  while (1) {
+    while (PDFCharIsWhitespace(ch)) {
+      if (!PositionIsInBounds())
+        return EndOfData;
+
+      ch = m_pBuf[m_Pos++];
+    }
+
+    if (ch != '%')
+      break;
+
+    while (1) {
+      if (!PositionIsInBounds())
+        return EndOfData;
+
+      ch = m_pBuf[m_Pos++];
+      if (PDFCharIsLineEnding(ch))
+        break;
+    }
+  }
+
+  if (PDFCharIsDelimiter(ch) && ch != '/') {
+    m_Pos--;
+    m_pLastObj = ReadNextObject(false, 0);
+    return Others;
+  }
+
+  while (1) {
+    if (m_WordSize < kMaxWordBuffer)
+      m_WordBuffer[m_WordSize++] = ch;
+
+    if (!PDFCharIsNumeric(ch))
+      bIsNumber = false;
+
+    if (!PositionIsInBounds())
+      break;
+
+    ch = m_pBuf[m_Pos++];
+
+    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
+      m_Pos--;
+      break;
+    }
+  }
+
+  m_WordBuffer[m_WordSize] = 0;
+  if (bIsNumber)
+    return Number;
+
+  if (m_WordBuffer[0] == '/')
+    return Name;
+
+  if (m_WordSize == 4) {
+    if (memcmp(m_WordBuffer, "true", 4) == 0) {
+      m_pLastObj = new CPDF_Boolean(true);
+      return Others;
+    }
+    if (memcmp(m_WordBuffer, "null", 4) == 0) {
+      m_pLastObj = new CPDF_Null;
+      return Others;
+    }
+  } else if (m_WordSize == 5) {
+    if (memcmp(m_WordBuffer, "false", 5) == 0) {
+      m_pLastObj = new CPDF_Boolean(false);
+      return Others;
+    }
+  }
+  return Keyword;
+}
+
+CPDF_Object* CPDF_StreamParser::GetObject() {
+  CPDF_Object* pObj = m_pLastObj;
+  m_pLastObj = nullptr;
+  return pObj;
+}
+
+CPDF_Object* CPDF_StreamParser::ReadNextObject(bool bAllowNestedArray,
+                                               uint32_t dwInArrayLevel) {
+  bool bIsNumber;
+  GetNextWord(bIsNumber);
+  if (!m_WordSize)
+    return nullptr;
+
+  if (bIsNumber) {
+    m_WordBuffer[m_WordSize] = 0;
+    return new CPDF_Number(CFX_ByteStringC(m_WordBuffer, m_WordSize));
+  }
+
+  int first_char = m_WordBuffer[0];
+  if (first_char == '/') {
+    CFX_ByteString name =
+        PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1));
+    return new CPDF_Name(m_pPool ? m_pPool->Intern(name) : name);
+  }
+
+  if (first_char == '(') {
+    CFX_ByteString str = ReadString();
+    return new CPDF_String(m_pPool ? m_pPool->Intern(str) : str, false);
+  }
+
+  if (first_char == '<') {
+    if (m_WordSize == 1)
+      return new CPDF_String(ReadHexString(), true);
+
+    CPDF_Dictionary* pDict = new CPDF_Dictionary(m_pPool);
+    while (1) {
+      GetNextWord(bIsNumber);
+      if (m_WordSize == 2 && m_WordBuffer[0] == '>')
+        break;
+
+      if (!m_WordSize || m_WordBuffer[0] != '/') {
+        delete pDict;
+        return nullptr;
+      }
+
+      CFX_ByteString key =
+          PDF_NameDecode(CFX_ByteStringC(m_WordBuffer + 1, m_WordSize - 1));
+      CPDF_Object* pObj = ReadNextObject(true, 0);
+      if (!pObj) {
+        delete pDict;
+        return nullptr;
+      }
+
+      if (key.IsEmpty())
+        delete pObj;
+      else
+        pDict->SetFor(key, pObj);
+    }
+    return pDict;
+  }
+
+  if (first_char == '[') {
+    if ((!bAllowNestedArray && dwInArrayLevel) ||
+        dwInArrayLevel > kMaxNestedArrayLevel) {
+      return nullptr;
+    }
+
+    CPDF_Array* pArray = new CPDF_Array;
+    while (1) {
+      CPDF_Object* pObj = ReadNextObject(bAllowNestedArray, dwInArrayLevel + 1);
+      if (pObj) {
+        pArray->Add(pObj);
+        continue;
+      }
+
+      if (!m_WordSize || m_WordBuffer[0] == ']')
+        break;
+    }
+    return pArray;
+  }
+
+  if (m_WordSize == 5 && !memcmp(m_WordBuffer, "false", 5))
+    return new CPDF_Boolean(false);
+
+  if (m_WordSize == 4) {
+    if (memcmp(m_WordBuffer, "true", 4) == 0)
+      return new CPDF_Boolean(true);
+
+    if (memcmp(m_WordBuffer, "null", 4) == 0)
+      return new CPDF_Null;
+  }
+
+  return nullptr;
+}
+
+// TODO(npm): the following methods are almost identical in cpdf_syntaxparser
+void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
+  m_WordSize = 0;
+  bIsNumber = true;
+  if (!PositionIsInBounds())
+    return;
+
+  int ch = m_pBuf[m_Pos++];
+  while (1) {
+    while (PDFCharIsWhitespace(ch)) {
+      if (!PositionIsInBounds()) {
+        return;
+      }
+      ch = m_pBuf[m_Pos++];
+    }
+
+    if (ch != '%')
+      break;
+
+    while (1) {
+      if (!PositionIsInBounds())
+        return;
+      ch = m_pBuf[m_Pos++];
+      if (PDFCharIsLineEnding(ch))
+        break;
+    }
+  }
+
+  if (PDFCharIsDelimiter(ch)) {
+    bIsNumber = false;
+    m_WordBuffer[m_WordSize++] = ch;
+    if (ch == '/') {
+      while (1) {
+        if (!PositionIsInBounds())
+          return;
+        ch = m_pBuf[m_Pos++];
+        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
+          m_Pos--;
+          return;
+        }
+
+        if (m_WordSize < kMaxWordBuffer)
+          m_WordBuffer[m_WordSize++] = ch;
+      }
+    } else if (ch == '<') {
+      if (!PositionIsInBounds())
+        return;
+      ch = m_pBuf[m_Pos++];
+      if (ch == '<')
+        m_WordBuffer[m_WordSize++] = ch;
+      else
+        m_Pos--;
+    } else if (ch == '>') {
+      if (!PositionIsInBounds())
+        return;
+      ch = m_pBuf[m_Pos++];
+      if (ch == '>')
+        m_WordBuffer[m_WordSize++] = ch;
+      else
+        m_Pos--;
+    }
+    return;
+  }
+
+  while (1) {
+    if (m_WordSize < kMaxWordBuffer)
+      m_WordBuffer[m_WordSize++] = ch;
+    if (!PDFCharIsNumeric(ch))
+      bIsNumber = false;
+
+    if (!PositionIsInBounds())
+      return;
+    ch = m_pBuf[m_Pos++];
+    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
+      m_Pos--;
+      break;
+    }
+  }
+}
+
+CFX_ByteString CPDF_StreamParser::ReadString() {
+  if (!PositionIsInBounds())
+    return CFX_ByteString();
+
+  uint8_t ch = m_pBuf[m_Pos++];
+  CFX_ByteTextBuf buf;
+  int parlevel = 0;
+  int status = 0;
+  int iEscCode = 0;
+  while (1) {
+    switch (status) {
+      case 0:
+        if (ch == ')') {
+          if (parlevel == 0) {
+            if (buf.GetLength() > kMaxStringLength) {
+              return CFX_ByteString(buf.GetBuffer(), kMaxStringLength);
+            }
+            return buf.MakeString();
+          }
+          parlevel--;
+          buf.AppendChar(')');
+        } else if (ch == '(') {
+          parlevel++;
+          buf.AppendChar('(');
+        } else if (ch == '\\') {
+          status = 1;
+        } else {
+          buf.AppendChar((char)ch);
+        }
+        break;
+      case 1:
+        if (ch >= '0' && ch <= '7') {
+          iEscCode = FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
+          status = 2;
+          break;
+        }
+        if (ch == 'n') {
+          buf.AppendChar('\n');
+        } else if (ch == 'r') {
+          buf.AppendChar('\r');
+        } else if (ch == 't') {
+          buf.AppendChar('\t');
+        } else if (ch == 'b') {
+          buf.AppendChar('\b');
+        } else if (ch == 'f') {
+          buf.AppendChar('\f');
+        } else if (ch == '\r') {
+          status = 4;
+          break;
+        } else if (ch == '\n') {
+        } else {
+          buf.AppendChar(ch);
+        }
+        status = 0;
+        break;
+      case 2:
+        if (ch >= '0' && ch <= '7') {
+          iEscCode =
+              iEscCode * 8 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
+          status = 3;
+        } else {
+          buf.AppendChar(iEscCode);
+          status = 0;
+          continue;
+        }
+        break;
+      case 3:
+        if (ch >= '0' && ch <= '7') {
+          iEscCode =
+              iEscCode * 8 + FXSYS_toDecimalDigit(static_cast<FX_WCHAR>(ch));
+          buf.AppendChar(iEscCode);
+          status = 0;
+        } else {
+          buf.AppendChar(iEscCode);
+          status = 0;
+          continue;
+        }
+        break;
+      case 4:
+        status = 0;
+        if (ch != '\n') {
+          continue;
+        }
+        break;
+    }
+    if (!PositionIsInBounds())
+      break;
+
+    ch = m_pBuf[m_Pos++];
+  }
+  if (PositionIsInBounds())
+    ++m_Pos;
+
+  if (buf.GetLength() > kMaxStringLength) {
+    return CFX_ByteString(buf.GetBuffer(), kMaxStringLength);
+  }
+  return buf.MakeString();
+}
+
+CFX_ByteString CPDF_StreamParser::ReadHexString() {
+  if (!PositionIsInBounds())
+    return CFX_ByteString();
+
+  CFX_ByteTextBuf buf;
+  bool bFirst = true;
+  int code = 0;
+  while (PositionIsInBounds()) {
+    int ch = m_pBuf[m_Pos++];
+
+    if (ch == '>')
+      break;
+
+    if (!std::isxdigit(ch))
+      continue;
+
+    int val = FXSYS_toHexDigit(ch);
+    if (bFirst) {
+      code = val * 16;
+    } else {
+      code += val;
+      buf.AppendByte((uint8_t)code);
+    }
+    bFirst = !bFirst;
+  }
+  if (!bFirst)
+    buf.AppendChar((char)code);
+
+  if (buf.GetLength() > kMaxStringLength)
+    return CFX_ByteString(buf.GetBuffer(), kMaxStringLength);
+
+  return buf.MakeString();
+}
+
+bool CPDF_StreamParser::PositionIsInBounds() const {
+  return m_Pos < m_Size;
+}
author	npm <npm@chromium.org>	2016-11-07 08:42:11 -0800
committer	Commit bot <commit-bot@chromium.org>	2016-11-07 08:42:11 -0800
commit	014b012278b7438ef8d4b66730b8598c7eb4623a (patch)
tree	75ea0ea37d0b239412133290d7b24cc975bfcc66 /core/fpdfapi/page/cpdf_streamparser.cpp
parent	240dec52b2e6502e7deb27a3535af3b1a3e23428 (diff)
download	pdfium-014b012278b7438ef8d4b66730b8598c7eb4623a.tar.xz