// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

#include <algorithm>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "core/fxcrt/cfx_utf8decoder.h"
#include "core/fxcrt/cfx_widetextbuf.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_fallthrough.h"
#include "core/fxcrt/xml/cxml_content.h"
#include "core/fxcrt/xml/cxml_element.h"
#include "core/fxcrt/xml/cxml_parser.h"
#include "third_party/base/ptr_util.h"
#include "third_party/base/stl_util.h"

namespace {

#define FXCRTM_XML_CHARTYPE_Normal 0x00
#define FXCRTM_XML_CHARTYPE_SpaceChar 0x01
#define FXCRTM_XML_CHARTYPE_Letter 0x02
#define FXCRTM_XML_CHARTYPE_Digital 0x04
#define FXCRTM_XML_CHARTYPE_NameIntro 0x08
#define FXCRTM_XML_CHARTYPE_NameChar 0x10
#define FXCRTM_XML_CHARTYPE_HexDigital 0x20
#define FXCRTM_XML_CHARTYPE_HexLowerLetter 0x40
#define FXCRTM_XML_CHARTYPE_HexUpperLetter 0x60
#define FXCRTM_XML_CHARTYPE_HexChar 0x60

const uint8_t g_FXCRT_XML_ByteTypes[256] = {
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
    0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x08, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
    0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x01, 0x01,
};

constexpr int kMaxDepth = 1024;

bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
  return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
}

bool g_FXCRT_XML_IsDigital(uint8_t ch) {
  return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
}

bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
  return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
}

bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
  return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
}

}  // namespace

CXML_Parser::CXML_Parser()
    : m_nOffset(0),
      m_pBuffer(nullptr),
      m_dwBufferSize(0),
      m_nBufferOffset(0),
      m_dwIndex(0) {}

CXML_Parser::~CXML_Parser() {}

bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
  m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size);
  m_nOffset = 0;
  return ReadNextBlock();
}

bool CXML_Parser::ReadNextBlock() {
  if (!m_pDataAcc->ReadNextBlock())
    return false;

  m_pBuffer = m_pDataAcc->GetBlockBuffer();
  m_dwBufferSize = m_pDataAcc->GetBlockSize();
  m_nBufferOffset = 0;
  m_dwIndex = 0;
  return m_dwBufferSize > 0;
}

bool CXML_Parser::IsEOF() {
  return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
}

void CXML_Parser::SkipWhiteSpaces() {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return;

  do {
    while (m_dwIndex < m_dwBufferSize &&
           g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
      m_dwIndex++;
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
}

void CXML_Parser::GetName(ByteString* space, ByteString* name) {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return;

  std::ostringstream buf;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      uint8_t ch = m_pBuffer[m_dwIndex];
      if (ch == ':') {
        *space = ByteString(buf);
        buf.str("");
      } else if (g_FXCRT_XML_IsNameChar(ch)) {
        buf << static_cast<char>(ch);
      } else {
        break;
      }
      m_dwIndex++;
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  *name = ByteString(buf);
}

void CXML_Parser::SkipLiterals(const ByteStringView& str) {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF()) {
    return;
  }
  int32_t i = 0, iLen = str.GetLength();
  do {
    while (m_dwIndex < m_dwBufferSize) {
      if (str[i] != m_pBuffer[m_dwIndex++]) {
        i = 0;
        continue;
      }
      i++;
      if (i == iLen)
        break;
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (i == iLen)
      return;

    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  while (!m_pDataAcc->IsEOF()) {
    ReadNextBlock();
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
  }
  m_dwIndex = m_dwBufferSize;
}

uint32_t CXML_Parser::GetCharRef() {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return 0;

  uint8_t ch;
  int32_t iState = 0;
  std::ostringstream buf;
  uint32_t code = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex];
      switch (iState) {
        case 0:
          if (ch == '#') {
            m_dwIndex++;
            iState = 2;
            break;
          }
          iState = 1;
          FX_FALLTHROUGH;
        case 1:
          m_dwIndex++;
          if (ch == ';') {
            std::string ref = buf.str();
            if (ref == "gt")
              code = '>';
            else if (ref == "lt")
              code = '<';
            else if (ref == "amp")
              code = '&';
            else if (ref == "apos")
              code = '\'';
            else if (ref == "quot")
              code = '"';
            iState = 10;
            break;
          }
          buf << static_cast<char>(ch);
          break;
        case 2:
          if (ch == 'x') {
            m_dwIndex++;
            iState = 4;
            break;
          }
          iState = 3;
          FX_FALLTHROUGH;
        case 3:
          m_dwIndex++;
          if (ch == ';') {
            iState = 10;
            break;
          }
          if (g_FXCRT_XML_IsDigital(ch))
            code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          break;
        case 4:
          m_dwIndex++;
          if (ch == ';') {
            iState = 10;
            break;
          }
          uint8_t nHex =
              g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
          if (nHex) {
            if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
              code = (code << 4) +
                     FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
            } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
              code = (code << 4) + ch - 87;
            } else {
              code = (code << 4) + ch - 55;
            }
          }
          break;
      }
      if (iState == 10)
        break;
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
      break;
    }
  } while (ReadNextBlock());
  return code;
}

WideString CXML_Parser::GetAttrValue() {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return WideString();

  CFX_UTF8Decoder decoder;
  uint8_t mark = 0;
  uint8_t ch = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex];
      if (mark == 0) {
        if (ch != '\'' && ch != '"')
          return WideString();

        mark = ch;
        m_dwIndex++;
        ch = 0;
        continue;
      }
      m_dwIndex++;
      if (ch == mark)
        break;

      if (ch == '&') {
        decoder.AppendCodePoint(GetCharRef());
        if (IsEOF())
          return WideString(decoder.GetResult());
      } else {
        decoder.Input(ch);
      }
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  return WideString(decoder.GetResult());
}

void CXML_Parser::GetTagName(bool bStartTag,
                             bool* bEndTag,
                             ByteString* space,
                             ByteString* name) {
  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return;

  *bEndTag = false;
  uint8_t ch;
  int32_t iState = bStartTag ? 1 : 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex];
      switch (iState) {
        case 0:
          m_dwIndex++;
          if (ch != '<')
            break;

          iState = 1;
          break;
        case 1:
          if (ch == '?') {
            m_dwIndex++;
            SkipLiterals("?>");
            iState = 0;
            break;
          }
          if (ch == '!') {
            m_dwIndex++;
            SkipLiterals("-->");
            iState = 0;
            break;
          }
          if (ch == '/') {
            m_dwIndex++;
            GetName(space, name);
            *bEndTag = true;
          } else {
            GetName(space, name);
            *bEndTag = false;
          }
          return;
      }
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
}

std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
                                                        bool bStartTag) {
  return ParseElementInternal(pParent, bStartTag, 0);
}

std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal(
    CXML_Element* pParent,
    bool bStartTag,
    int nDepth) {
  if (nDepth > kMaxDepth)
    return nullptr;

  m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
  if (IsEOF())
    return nullptr;

  ByteString tag_name;
  ByteString tag_space;
  bool bEndTag;
  GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
  if (tag_name.IsEmpty() || bEndTag)
    return nullptr;

  auto pElement = pdfium::MakeUnique<CXML_Element>(
      pParent, tag_space.AsStringView(), tag_name.AsStringView());
  do {
    ByteString attr_space;
    ByteString attr_name;
    while (m_dwIndex < m_dwBufferSize) {
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
        break;

      GetName(&attr_space, &attr_name);
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      if (m_pBuffer[m_dwIndex] != '=')
        break;

      m_dwIndex++;
      SkipWhiteSpaces();
      if (IsEOF())
        break;

      WideString attr_value = GetAttrValue();
      pElement->SetAttribute(attr_space, attr_name, attr_value);
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  SkipWhiteSpaces();
  if (IsEOF())
    return pElement;

  uint8_t ch = m_pBuffer[m_dwIndex++];
  if (ch == '/') {
    m_dwIndex++;
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    return pElement;
  }
  if (ch != '>') {
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    return nullptr;
  }
  SkipWhiteSpaces();
  if (IsEOF())
    return pElement;

  CFX_UTF8Decoder decoder;
  CFX_WideTextBuf content;
  bool bCDATA = false;
  int32_t iState = 0;
  do {
    while (m_dwIndex < m_dwBufferSize) {
      ch = m_pBuffer[m_dwIndex++];
      switch (iState) {
        case 0:
          if (ch == '<') {
            iState = 1;
          } else if (ch == '&') {
            decoder.ClearStatus();
            decoder.AppendCodePoint(GetCharRef());
          } else {
            decoder.Input(ch);
          }
          break;
        case 1:
          if (ch == '!') {
            iState = 2;
          } else if (ch == '?') {
            SkipLiterals("?>");
            SkipWhiteSpaces();
            iState = 0;
          } else if (ch == '/') {
            ByteString space;
            ByteString name;
            GetName(&space, &name);
            SkipWhiteSpaces();
            m_dwIndex++;
            iState = 10;
          } else {
            content << decoder.GetResult();
            WideString dataStr = content.MakeString();
            if (!bCDATA)
              dataStr.TrimRight(L" \t\r\n");

            InsertContentSegment(bCDATA, dataStr.AsStringView(),
                                 pElement.get());
            content.Clear();
            decoder.Clear();
            bCDATA = false;
            iState = 0;
            m_dwIndex--;
            std::unique_ptr<CXML_Element> pSubElement =
                ParseElementInternal(pElement.get(), true, nDepth + 1);
            if (!pSubElement)
              break;

            pElement->AppendChild(std::move(pSubElement));
            SkipWhiteSpaces();
          }
          break;
        case 2:
          if (ch == '[') {
            SkipLiterals("]]>");
          } else if (ch == '-') {
            m_dwIndex++;
            SkipLiterals("-->");
          } else {
            SkipLiterals(">");
          }
          decoder.Clear();
          SkipWhiteSpaces();
          iState = 0;
          break;
      }
      if (iState == 10) {
        break;
      }
    }
    m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
    if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
      break;
  } while (ReadNextBlock());
  content << decoder.GetResult();
  WideString dataStr = content.MakeString();
  dataStr.TrimRight(L" \t\r\n");

  InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get());
  content.Clear();
  decoder.Clear();
  bCDATA = false;
  return pElement;
}

void CXML_Parser::InsertContentSegment(bool bCDATA,
                                       const WideStringView& content,
                                       CXML_Element* pElement) {
  if (content.IsEmpty())
    return;

  pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content));
}