diff options
Diffstat (limited to 'core/fxcrt/xml')
-rw-r--r-- | core/fxcrt/xml/cfx_saxcontext.h | 23 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_saxreader.cpp | 730 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_saxreader.h | 144 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_saxreaderhandler.cpp | 128 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_saxreaderhandler.h | 49 |
5 files changed, 1074 insertions, 0 deletions
diff --git a/core/fxcrt/xml/cfx_saxcontext.h b/core/fxcrt/xml/cfx_saxcontext.h
new file mode 100644
index 0000000000..7afebed98d
--- /dev/null
+++ b/core/fxcrt/xml/cfx_saxcontext.h
@@ -0,0 +1,23 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_SAXCONTEXT_H_
+#define CORE_FXCRT_XML_CFX_SAXCONTEXT_H_
+
+#include "core/fxcrt/fx_basic.h"
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_saxreader.h"
+
+// Per-node scratch state: the text accumulated so far, the name of the tag
+// being processed, and the kind of node it is. The node kind starts out as
+// Unknown.
+class CFX_SAXContext {
+ public:
+  CFX_SAXContext() = default;
+
+  CFX_ByteTextBuf m_TextBuf;
+  CFX_ByteString m_bsTagName;
+  CFX_SAXItem::Type m_eNode = CFX_SAXItem::Type::Unknown;
+};
+
+#endif  // CORE_FXCRT_XML_CFX_SAXCONTEXT_H_
diff --git a/core/fxcrt/xml/cfx_saxreader.cpp b/core/fxcrt/xml/cfx_saxreader.cpp
new file mode 100644
index 0000000000..287eaaa5b8
--- /dev/null
+++ b/core/fxcrt/xml/cfx_saxreader.cpp
@@ -0,0 +1,730 @@
+// Copyright 2014 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_saxreader.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "core/fxcrt/xml/cfx_saxreaderhandler.h"
+#include "third_party/base/ptr_util.h"
+#include "third_party/base/stl_util.h"
+
+// Parser states. The numeric value of each state is its index into
+// g_FX_SAXReader_LPFParse below, so the two lists must stay in the same order.
+enum class CFX_SaxMode {
+  Text = 0,
+  NodeStart,
+  DeclOrComment,
+  DeclNode,
+  Comment,
+  CommentContent,
+  TagName,
+  TagAttributeName,
+  TagAttributeEqual,
+  TagAttributeValue,
+  TagMaybeClose,
+  TagClose,
+  TagEnd,
+  TargetData,
+  MAX
+};
+
+// Counts the '-' characters seen at the start and at the end of a comment.
+class CFX_SAXCommentContext {
+ public:
+  CFX_SAXCommentContext() : m_iHeaderCount(0), m_iTailCount(0) {}
+  int32_t m_iHeaderCount;
+  int32_t m_iTailCount;
+};
+
+namespace {
+
+// Upper bound on the number of bytes read from the stream in one block.
+const uint32_t kSaxFileBufSize = 32768;
+
+// One handler per CFX_SaxMode value; ContinueParse() invokes the handler of
+// the current mode once for every input byte.
+typedef void (CFX_SAXReader::*FX_SAXReader_LPFParse)();
+static const FX_SAXReader_LPFParse
+    g_FX_SAXReader_LPFParse[static_cast<int>(CFX_SaxMode::MAX)] = {
+        &CFX_SAXReader::ParseText,
+        &CFX_SAXReader::ParseNodeStart,
+        &CFX_SAXReader::ParseDeclOrComment,
+        &CFX_SAXReader::ParseDeclNode,
+        &CFX_SAXReader::ParseComment,
+        &CFX_SAXReader::ParseCommentContent,
+        &CFX_SAXReader::ParseTagName,
+        &CFX_SAXReader::ParseTagAttributeName,
+        &CFX_SAXReader::ParseTagAttributeEqual,
+        &CFX_SAXReader::ParseTagAttributeValue,
+        &CFX_SAXReader::ParseMaybeClose,
+        &CFX_SAXReader::ParseTagClose,
+        &CFX_SAXReader::ParseTagEnd,
+        &CFX_SAXReader::ParseTargetData,
+};
+
+}  // namespace
+
+CFX_SAXFile::CFX_SAXFile()
+    : m_dwStart(0),
+      m_dwEnd(0),
+      m_dwCur(0),
+      m_pBuf(nullptr),
+      m_dwBufSize(0),
+      m_dwBufIndex(0) {}
+
+// Releases the block buffer so that a CFX_SAXFile used on its own does not
+// leak it. Reset() nulls the pointer, so an earlier explicit Reset() is fine.
+CFX_SAXFile::~CFX_SAXFile() {
+  Reset();
+}
+
+// Binds |pFile| and reads the first block of the range
+// [dwStart, dwStart + dwLen). A |dwLen| of uint32_t(-1), or one that runs past
+// the end of the stream, is clamped to the bytes available after |dwStart|.
+// Returns false if the range is empty or the first read fails.
+bool CFX_SAXFile::StartFile(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
+                            uint32_t dwStart,
+                            uint32_t dwLen) {
+  ASSERT(!m_pFile && pFile);
+  uint32_t dwSize = pFile->GetSize();
+  if (dwStart >= dwSize)
+    return false;
+
+  // Compare against the remaining size instead of forming dwStart + dwLen,
+  // which can wrap around in 32 bits and defeat the clamp.
+  const uint32_t dwAvailable = dwSize - dwStart;
+  if (dwLen == static_cast<uint32_t>(-1) || dwLen > dwAvailable)
+    dwLen = dwAvailable;
+
+  if (dwLen == 0)
+    return false;
+
+  m_dwBufSize = std::min(dwLen, kSaxFileBufSize);
+  m_pBuf = FX_Alloc(uint8_t, m_dwBufSize);
+  if (!pFile->ReadBlock(m_pBuf, dwStart, m_dwBufSize)) {
+    // Do not keep a buffer around for a file that never got attached.
+    FX_Free(m_pBuf);
+    m_pBuf = nullptr;
+    return false;
+  }
+
+  m_dwStart = dwStart;
+  m_dwEnd = dwStart + dwLen;
+  m_dwCur = dwStart;
+  m_pFile = pFile;
+  m_dwBufIndex = 0;
+  return true;
+}
+
+// Reads the block that starts at m_dwCur into the existing buffer. Returns
+// false when nothing is left to read or the read fails.
+bool CFX_SAXFile::ReadNextBlock() {
+  ASSERT(m_pFile);
+  uint32_t dwSize = m_dwEnd - m_dwCur;
+  if (dwSize == 0) {
+    return false;
+  }
+  m_dwBufSize = std::min(dwSize, kSaxFileBufSize);
+  if (!m_pFile->ReadBlock(m_pBuf, m_dwCur, m_dwBufSize)) {
+    return false;
+  }
+  m_dwBufIndex = 0;
+  return true;
+}
+
+// Frees the block buffer and drops the reference to the stream.
+void CFX_SAXFile::Reset() {
+  if (m_pBuf) {
+    FX_Free(m_pBuf);
+    m_pBuf = nullptr;
+  }
+  m_pFile = nullptr;
+}
+
+// Every scalar member is given a defined value here: the Append*/Parse*
+// methods are public and read the cursors, so they must not observe
+// indeterminate values before the first StartParse().
+CFX_SAXReader::CFX_SAXReader()
+    : m_File(),
+      m_pHandler(nullptr),
+      m_iState(-1),
+      m_dwItemID(0),
+      m_eMode(CFX_SaxMode::Text),
+      m_ePrevMode(CFX_SaxMode::Text),
+      m_bCharData(false),
+      m_CurByte(0),
+      m_dwDataOffset(0),
+      m_SkipChar(0),
+      m_dwNodePos(0),
+      m_iDataSize(256),
+      m_iDataLength(0),
+      m_iEntityStart(-1),
+      m_iDataPos(0),
+      m_iNameSize(256),
+      m_iNameLength(0),
+      m_dwParseMode(0),
+      m_pCommentContext(nullptr) {
+  m_pszData = FX_Alloc(uint8_t, m_iDataSize);
+  m_pszName = FX_Alloc(uint8_t, m_iNameSize);
+}
+
+CFX_SAXReader::~CFX_SAXReader() {
+  Reset();
+  if (m_pszData) {
+    FX_Free(m_pszData);
+    m_pszData = nullptr;
+  }
+  if (m_pszName) {
+    FX_Free(m_pszName);
+    m_pszName = nullptr;
+  }
+}
+
+// Detaches the input file and clears all per-parse state. The data and name
+// buffers themselves are kept for reuse.
+void CFX_SAXReader::Reset() {
+  m_File.Reset();
+  m_Stack = std::stack<std::unique_ptr<CFX_SAXItem>>();
+  m_dwItemID = 0;
+  m_SkipStack = std::stack<char>();
+  m_SkipChar = 0;
+  m_iDataLength = 0;
+  m_iEntityStart = -1;
+  m_iNameLength = 0;
+  m_iDataPos = 0;
+  m_pCommentContext.reset();
+}
+
+// Pushes a new item with the next id; it inherits the skip flag of the item
+// currently on top of the stack, if there is one.
+void CFX_SAXReader::Push() {
+  std::unique_ptr<CFX_SAXItem> pNew =
+      pdfium::MakeUnique<CFX_SAXItem>(++m_dwItemID);
+  if (!m_Stack.empty())
+    pNew->m_bSkip = m_Stack.top()->m_bSkip;
+  m_Stack.push(std::move(pNew));
+}
+
+// Pops the top item; does nothing on an empty stack.
+void CFX_SAXReader::Pop() {
+  if (!m_Stack.empty())
+    m_Stack.pop();
+}
+
+// Returns the item on top of the stack, or nullptr if the stack is empty.
+CFX_SAXItem* CFX_SAXReader::GetCurrentItem() const {
+  return m_Stack.empty() ? nullptr : m_Stack.top().get();
+}
+
+// Appends |ch| to the data buffer, growing it first if it is full.
+void CFX_SAXReader::AppendData(uint8_t ch) {
+  ReallocDataBuffer();
+  m_pszData[m_iDataPos++] = ch;
+}
+
+// Appends |ch| to the name buffer. The name buffer is indexed with the same
+// cursor (m_iDataPos) as the data buffer.
+void CFX_SAXReader::AppendName(uint8_t ch) {
+  ReallocNameBuffer();
+  m_pszName[m_iDataPos++] = ch;
+}
+
+// Grows the data buffer once the cursor has reached its end: the size doubles
+// while it is at most 1 MiB and grows by 1 MiB per step after that.
+void CFX_SAXReader::ReallocDataBuffer() {
+  if (m_iDataPos < m_iDataSize) {
+    return;
+  }
+  if (m_iDataSize <= 1024 * 1024) {
+    m_iDataSize *= 2;
+  } else {
+    m_iDataSize += 1024 * 1024;
+  }
+  m_pszData =
+      static_cast<uint8_t*>(FX_Realloc(uint8_t, m_pszData, m_iDataSize));
+}
+
+// Same growth policy as ReallocDataBuffer(), applied to the name buffer.
+void CFX_SAXReader::ReallocNameBuffer() {
+  if (m_iDataPos < m_iNameSize) {
+    return;
+  }
+  if (m_iNameSize <= 1024 * 1024) {
+    m_iNameSize *= 2;
+  } else {
+    m_iNameSize += 1024 * 1024;
+  }
+  m_pszName =
+      static_cast<uint8_t*>(FX_Realloc(uint8_t, m_pszName, m_iNameSize));
+}
+
+// True if |ch| is a byte below 0x21 and CFX_SaxParseMode_NotSkipSpace is not
+// set in the parse mode.
+bool CFX_SAXReader::SkipSpace(uint8_t ch) {
+  return (m_dwParseMode & CFX_SaxParseMode_NotSkipSpace) == 0 && ch < 0x21;
+}
+
+// Resets the reader and attaches it to [dwStart, dwStart + dwLen) of |pFile|.
+// Returns 0 on success and -1 if the file range could not be opened.
+int32_t CFX_SAXReader::StartParse(
+    const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
+    uint32_t dwStart,
+    uint32_t dwLen,
+    uint32_t dwParseMode) {
+  m_iState = -1;
+  Reset();
+  if (!m_File.StartFile(pFile, dwStart, dwLen))
+    return -1;
+
+  m_iState = 0;
+  m_eMode = CFX_SaxMode::Text;
+  m_ePrevMode = CFX_SaxMode::Text;
+  m_bCharData = false;
+  m_dwDataOffset = 0;
+  m_dwParseMode = dwParseMode;
+  // The stack is empty after Reset(), so this creates the root item.
+  Push();
+  return 0;
+}
+
+// Parses buffered input block by block. Returns the progress as a percentage
+// (100 once the whole range has been consumed) or a negative value on error
+// (-2 if reading the next block fails). When |pPause| asks for a pause the
+// function returns after the current block and can be called again.
+int32_t CFX_SAXReader::ContinueParse(IFX_Pause* pPause) {
+  if (m_iState < 0 || m_iState > 99) {
+    return m_iState;
+  }
+  while (m_File.m_dwCur < m_File.m_dwEnd) {
+    uint32_t& index = m_File.m_dwBufIndex;
+    uint32_t size = m_File.m_dwBufSize;
+    const uint8_t* pBuf = m_File.m_pBuf;
+    while (index < size) {
+      m_CurByte = pBuf[index];
+      (this->*g_FX_SAXReader_LPFParse[static_cast<int>(m_eMode)])();
+      index++;
+    }
+    m_File.m_dwCur += index;
+    // Widen before multiplying: consumed * 100 wraps in 32 bits for ranges
+    // above roughly 42 MB, which would keep the state from ever reaching 100.
+    m_iState = static_cast<int32_t>(
+        static_cast<uint64_t>(m_File.m_dwCur - m_File.m_dwStart) * 100 /
+        (m_File.m_dwEnd - m_File.m_dwStart));
+    if (m_File.m_dwCur >= m_File.m_dwEnd) {
+      break;
+    }
+    if (!m_File.ReadNextBlock()) {
+      m_iState = -2;
+      break;
+    }
+    m_dwDataOffset = 0;
+    if (pPause && pPause->NeedToPauseNow()) {
+      break;
+    }
+  }
+  return m_iState;
+}
+
+// Stores |ch| in the data buffer and, when |ch| is the ';' that ends an entity
+// opened by an earlier '&', replaces the entity text in place with the byte it
+// stands for (unless the matching NotConvert flag is set).
+// NOTE: numeric references are accumulated in a uint8_t, so values above 0xFF
+// wrap modulo 256.
+void CFX_SAXReader::ParseChar(uint8_t ch) {
+  // The buffer cursor is rewound by the callers without touching
+  // m_iEntityStart. A remembered '&' position at or beyond the cursor belongs
+  // to an earlier buffer; using it would produce a negative entity length.
+  if (m_iDataPos == 0 || m_iEntityStart >= m_iDataPos)
+    m_iEntityStart = -1;
+
+  ReallocDataBuffer();
+  m_pszData[m_iDataPos] = ch;
+  if (m_iEntityStart > -1 && ch == ';') {
+    int32_t iSaveEntityStart = m_iEntityStart;
+    CFX_ByteString csEntity(m_pszData + m_iEntityStart + 1,
+                            m_iDataPos - m_iEntityStart - 1);
+    int32_t iLen = csEntity.GetLength();
+    if (iLen > 0) {
+      if (csEntity[0] == '#') {
+        if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_sharp) == 0) {
+          ch = 0;
+          uint8_t w;
+          if (iLen > 1 && csEntity[1] == 'x') {
+            // Hexadecimal reference: "&#x..;".
+            for (int32_t i = 2; i < iLen; i++) {
+              w = csEntity[i];
+              if (w >= '0' && w <= '9') {
+                ch = (ch << 4) + w - '0';
+              } else if (w >= 'A' && w <= 'F') {
+                ch = (ch << 4) + w - 55;
+              } else if (w >= 'a' && w <= 'f') {
+                ch = (ch << 4) + w - 87;
+              } else {
+                break;
+              }
+            }
+          } else {
+            // Decimal reference: "&#..;".
+            for (int32_t i = 1; i < iLen; i++) {
+              w = csEntity[i];
+              if (w < '0' || w > '9') {
+                break;
+              }
+              ch = ch * 10 + w - '0';
+            }
+          }
+          if (ch != 0) {
+            m_pszData[m_iEntityStart++] = ch;
+          }
+        }
+      } else {
+        if (csEntity.Compare("amp") == 0) {
+          if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_amp) == 0) {
+            m_pszData[m_iEntityStart++] = '&';
+          }
+        } else if (csEntity.Compare("lt") == 0) {
+          if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_lt) == 0) {
+            m_pszData[m_iEntityStart++] = '<';
+          }
+        } else if (csEntity.Compare("gt") == 0) {
+          if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_gt) == 0) {
+            m_pszData[m_iEntityStart++] = '>';
+          }
+        } else if (csEntity.Compare("apos") == 0) {
+          if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_apos) == 0) {
+            m_pszData[m_iEntityStart++] = '\'';
+          }
+        } else if (csEntity.Compare("quot") == 0) {
+          if ((m_dwParseMode & CFX_SaxParseMode_NotConvert_quot) == 0) {
+            m_pszData[m_iEntityStart++] = '\"';
+          }
+        }
+      }
+    }
+    if (iSaveEntityStart != m_iEntityStart) {
+      // A replacement byte was written; continue right after it.
+      m_iDataPos = m_iEntityStart;
+      m_iEntityStart = -1;
+    } else {
+      // Nothing was converted; keep the entity text as it is.
+      m_iDataPos++;
+      m_iEntityStart = -1;
+    }
+  } else {
+    if (m_iEntityStart < 0 && ch == '&') {
+      m_iEntityStart = m_iDataPos;
+    }
+    m_iDataPos++;
+  }
+}
+
+// Text mode: collects character data until '<', then reports the collected
+// data and opens a new node.
+void CFX_SAXReader::ParseText() {
+  if (m_CurByte == '<') {
+    if (m_iDataPos > 0) {
+      m_iDataLength = m_iDataPos;
+      m_iDataPos = 0;
+      if (m_pHandler) {
+        NotifyData();
+      }
+    }
+    Push();
+    m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
+    m_eMode = CFX_SaxMode::NodeStart;
+    return;
+  }
+  if (m_iDataPos < 1 && SkipSpace(m_CurByte)) {
+    return;
+  }
+  ParseChar(m_CurByte);
+}
+
+// First byte after '<': decides between instruction ("<?"), declaration or
+// comment ("<!"), end tag ("</"), empty node ("<>") and a regular tag.
+void CFX_SAXReader::ParseNodeStart() {
+  if (m_CurByte == '?') {
+    GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Instruction;
+    m_eMode = CFX_SaxMode::TagName;
+    return;
+  }
+  if (m_CurByte == '!') {
+    m_eMode = CFX_SaxMode::DeclOrComment;
+    return;
+  }
+  if (m_CurByte == '/') {
+    m_eMode = CFX_SaxMode::TagEnd;
+    return;
+  }
+  if (m_CurByte == '>') {
+    Pop();
+    m_eMode = CFX_SaxMode::Text;
+    return;
+  }
+  if (m_CurByte > 0x20) {
+    m_dwDataOffset = m_File.m_dwBufIndex;
+    GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Tag;
+    m_eMode = CFX_SaxMode::TagName;
+    AppendData(m_CurByte);
+  }
+}
+
+// First byte after "<!": '-' starts a comment, anything else starts a
+// declaration node that is consumed by SkipNode().
+void CFX_SAXReader::ParseDeclOrComment() {
+  if (m_CurByte == '-') {
+    m_eMode = CFX_SaxMode::Comment;
+    GetCurrentItem()->m_eNode = CFX_SAXItem::Type::Comment;
+    if (!m_pCommentContext)
+      m_pCommentContext = pdfium::MakeUnique<CFX_SAXCommentContext>();
+    m_pCommentContext->m_iHeaderCount = 1;
+    m_pCommentContext->m_iTailCount = 0;
+    return;
+  }
+  m_eMode = CFX_SaxMode::DeclNode;
+  m_dwDataOffset = m_File.m_dwBufIndex;
+  m_SkipChar = '>';
+  m_SkipStack.push('>');
+  SkipNode();
+}
+
+// Byte after "<!-": recorded as the second header dash without being
+// inspected; comment content follows.
+void CFX_SAXReader::ParseComment() {
+  m_pCommentContext->m_iHeaderCount = 2;
+  m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
+  m_eMode = CFX_SaxMode::CommentContent;
+}
+
+// Collects comment text. Dashes are held back in m_iTailCount until it is
+// known whether they belong to the closing "-->".
+void CFX_SAXReader::ParseCommentContent() {
+  if (m_CurByte == '-') {
+    m_pCommentContext->m_iTailCount++;
+  } else if (m_CurByte == '>' && m_pCommentContext->m_iTailCount == 2) {
+    m_iDataLength = m_iDataPos;
+    m_iDataPos = 0;
+    if (m_pHandler) {
+      NotifyTargetData();
+    }
+    Pop();
+    m_eMode = CFX_SaxMode::Text;
+  } else {
+    // The held-back dashes were ordinary content after all.
+    while (m_pCommentContext->m_iTailCount > 0) {
+      AppendData('-');
+      m_pCommentContext->m_iTailCount--;
+    }
+    AppendData(m_CurByte);
+  }
+}
+
+void CFX_SAXReader::ParseDeclNode() {
+  SkipNode();
+}
+
+// Collects the tag name; whitespace, '/', '?' or '>' ends it and reports the
+// tag to the handler.
+void CFX_SAXReader::ParseTagName() {
+  if (m_CurByte < 0x21 || m_CurByte == '/' || m_CurByte == '>' ||
+      m_CurByte == '?') {
+    m_iDataLength = m_iDataPos;
+    m_iDataPos = 0;
+    if (m_pHandler) {
+      NotifyEnter();
+    }
+    if (m_CurByte < 0x21) {
+      m_eMode = CFX_SaxMode::TagAttributeName;
+    } else if (m_CurByte == '/' || m_CurByte == '?') {
+      m_ePrevMode = m_eMode;
+      m_eMode = CFX_SaxMode::TagMaybeClose;
+    } else {
+      if (m_pHandler) {
+        NotifyBreak();
+      }
+      m_eMode = CFX_SaxMode::Text;
+    }
+  } else {
+    AppendData(m_CurByte);
+  }
+}
+
+// Collects an attribute name into the name buffer.
+void CFX_SAXReader::ParseTagAttributeName() {
+  if (m_CurByte < 0x21 || m_CurByte == '=') {
+    if (m_iDataPos < 1 && m_CurByte < 0x21) {
+      // Leading whitespace before the name.
+      return;
+    }
+    m_iNameLength = m_iDataPos;
+    m_iDataPos = 0;
+    m_SkipChar = 0;
+    m_eMode = m_CurByte == '=' ? CFX_SaxMode::TagAttributeValue
+                               : CFX_SaxMode::TagAttributeEqual;
+    return;
+  }
+  if (m_CurByte == '/' || m_CurByte == '>' || m_CurByte == '?') {
+    if (m_CurByte == '/' || m_CurByte == '?') {
+      m_ePrevMode = m_eMode;
+      m_eMode = CFX_SaxMode::TagMaybeClose;
+    } else {
+      if (m_pHandler) {
+        NotifyBreak();
+      }
+      m_eMode = CFX_SaxMode::Text;
+    }
+    return;
+  }
+  if (m_iDataPos < 1) {
+    m_dwDataOffset = m_File.m_dwBufIndex;
+  }
+  AppendName(m_CurByte);
+}
+
+// After an attribute name followed by whitespace: waits for '='. Inside a
+// processing instruction any other byte turns the name collected so far into
+// the beginning of the target data.
+void CFX_SAXReader::ParseTagAttributeEqual() {
+  if (m_CurByte == '=') {
+    m_SkipChar = 0;
+    m_eMode = CFX_SaxMode::TagAttributeValue;
+    return;
+  }
+  if (GetCurrentItem()->m_eNode == CFX_SAXItem::Type::Instruction) {
+    m_iDataPos = m_iNameLength;
+    AppendName(0x20);
+    m_eMode = CFX_SaxMode::TargetData;
+    ParseTargetData();
+  }
+}
+
+// Collects a quoted attribute value. m_SkipChar holds the opening quote while
+// inside the value and is 0 before it.
+void CFX_SAXReader::ParseTagAttributeValue() {
+  if (m_SkipChar) {
+    if (m_SkipChar == m_CurByte) {
+      // Closing quote: the attribute is complete.
+      m_iDataLength = m_iDataPos;
+      m_iDataPos = 0;
+      if (m_pHandler) {
+        NotifyAttribute();
+      }
+      m_SkipChar = 0;
+      m_eMode = CFX_SaxMode::TagAttributeName;
+      return;
+    }
+    ParseChar(m_CurByte);
+    return;
+  }
+  if (m_CurByte < 0x21) {
+    return;
+  }
+  if (m_iDataPos < 1) {
+    if (m_CurByte == '\'' || m_CurByte == '\"') {
+      m_SkipChar = m_CurByte;
+    }
+  }
+}
+
+// Byte after a '/' or '?' inside a tag: '>' closes the node; otherwise the
+// held-back character is put back and the previous mode handles this byte.
+void CFX_SAXReader::ParseMaybeClose() {
+  if (m_CurByte == '>') {
+    if (GetCurrentItem()->m_eNode == CFX_SAXItem::Type::Instruction) {
+      m_iNameLength = m_iDataPos;
+      m_iDataPos = 0;
+      if (m_pHandler) {
+        NotifyTargetData();
+      }
+    }
+    ParseTagClose();
+    m_eMode = CFX_SaxMode::Text;
+  } else if (m_ePrevMode == CFX_SaxMode::TagName) {
+    AppendData('/');
+    m_eMode = CFX_SaxMode::TagName;
+    m_ePrevMode = CFX_SaxMode::Text;
+    ParseTagName();
+  } else if (m_ePrevMode == CFX_SaxMode::TagAttributeName) {
+    AppendName('/');
+    m_eMode = CFX_SaxMode::TagAttributeName;
+    m_ePrevMode = CFX_SaxMode::Text;
+    ParseTagAttributeName();
+  } else if (m_ePrevMode == CFX_SaxMode::TargetData) {
+    AppendName('?');
+    m_eMode = CFX_SaxMode::TargetData;
+    m_ePrevMode = CFX_SaxMode::Text;
+    ParseTargetData();
+  }
+}
+
+// Reports the close of the current node and pops it.
+void CFX_SAXReader::ParseTagClose() {
+  m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
+  if (m_pHandler) {
+    NotifyClose();
+  }
+  Pop();
+}
+
+// Collects the name of an end tag ("</name>").
+void CFX_SAXReader::ParseTagEnd() {
+  if (m_CurByte < 0x21) {
+    return;
+  }
+  if (m_CurByte == '>') {
+    // The first Pop() discards the item pushed for the "</" itself; the
+    // second one, after the notification, removes the element being closed.
+    Pop();
+    m_dwNodePos = m_File.m_dwCur + m_File.m_dwBufIndex;
+    m_iDataLength = m_iDataPos;
+    m_iDataPos = 0;
+    if (m_pHandler) {
+      NotifyEnd();
+    }
+    Pop();
+    m_eMode = CFX_SaxMode::Text;
+  } else {
+    ParseChar(m_CurByte);
+  }
+}
+
+// Collects processing-instruction data into the name buffer until a '?'.
+void CFX_SAXReader::ParseTargetData() {
+  if (m_CurByte == '?') {
+    m_ePrevMode = m_eMode;
+    m_eMode = CFX_SaxMode::TagMaybeClose;
+  } else {
+    AppendName(m_CurByte);
+  }
+}
+
+// Consumes a "<!...>" node. m_SkipStack holds the closing delimiters that are
+// still expected and m_SkipChar mirrors its top. When the outermost '>' is
+// reached, a "[CDATA[...]]" payload is reported as character data; any other
+// declaration is dropped.
+void CFX_SAXReader::SkipNode() {
+  if (m_SkipChar == '\'' || m_SkipChar == '\"') {
+    // Inside a quoted string only the matching quote is significant.
+    if (m_CurByte != m_SkipChar)
+      return;
+
+    ASSERT(!m_SkipStack.empty());
+    m_SkipStack.pop();
+    m_SkipChar = !m_SkipStack.empty() ? m_SkipStack.top() : 0;
+    return;
+  }
+  switch (m_CurByte) {
+    case '<':
+      m_SkipChar = '>';
+      m_SkipStack.push('>');
+      break;
+    case '[':
+      m_SkipChar = ']';
+      m_SkipStack.push(']');
+      break;
+    case '(':
+      m_SkipChar = ')';
+      m_SkipStack.push(')');
+      break;
+    case '\'':
+      m_SkipChar = '\'';
+      m_SkipStack.push('\'');
+      break;
+    case '\"':
+      m_SkipChar = '\"';
+      m_SkipStack.push('\"');
+      break;
+    default:
+      if (m_CurByte == m_SkipChar) {
+        m_SkipStack.pop();
+        m_SkipChar = !m_SkipStack.empty() ? m_SkipStack.top() : 0;
+        if (m_SkipStack.empty() && m_CurByte == '>') {
+          m_iDataLength = m_iDataPos;
+          m_iDataPos = 0;
+          // 9 == strlen("[CDATA[") + strlen("]]").
+          if (m_iDataLength >= 9 &&
+              FXSYS_memcmp(m_pszData, "[CDATA[", 7 * sizeof(uint8_t)) == 0 &&
+              FXSYS_memcmp(m_pszData + m_iDataLength - 2, "]]",
+                           2 * sizeof(uint8_t)) == 0) {
+            Pop();
+            m_iDataLength -= 9;
+            m_dwDataOffset += 7;
+            FXSYS_memmove(m_pszData, m_pszData + 7,
+                          m_iDataLength * sizeof(uint8_t));
+            m_bCharData = true;
+            if (m_pHandler) {
+              NotifyData();
+            }
+            m_bCharData = false;
+          } else {
+            Pop();
+          }
+          m_eMode = CFX_SaxMode::Text;
+        }
+      }
+      break;
+  }
+  if (!m_SkipStack.empty())
+    ParseChar(m_CurByte);
+}
+
+// Reports collected text (or CDATA) to the handler if the current item is a
+// tag. Callers check m_pHandler before calling any Notify*() method.
+void CFX_SAXReader::NotifyData() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (!pItem)
+    return;
+
+  if (pItem->m_eNode == CFX_SAXItem::Type::Tag)
+    m_pHandler->OnTagData(
+        pItem->m_pNode,
+        m_bCharData ? CFX_SAXItem::Type::CharData : CFX_SAXItem::Type::Text,
+        CFX_ByteStringC(m_pszData, m_iDataLength),
+        m_File.m_dwCur + m_dwDataOffset);
+}
+
+// Reports the start of a tag or instruction and stores the context returned
+// by the handler in the current item.
+void CFX_SAXReader::NotifyEnter() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (pItem->m_eNode == CFX_SAXItem::Type::Tag ||
+      pItem->m_eNode == CFX_SAXItem::Type::Instruction) {
+    pItem->m_pNode = m_pHandler->OnTagEnter(
+        CFX_ByteStringC(m_pszData, m_iDataLength), pItem->m_eNode, m_dwNodePos);
+  }
+}
+
+// Reports one name/value attribute pair of the current tag or instruction.
+void CFX_SAXReader::NotifyAttribute() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (pItem->m_eNode == CFX_SAXItem::Type::Tag ||
+      pItem->m_eNode == CFX_SAXItem::Type::Instruction) {
+    m_pHandler->OnTagAttribute(pItem->m_pNode,
+                               CFX_ByteStringC(m_pszName, m_iNameLength),
+                               CFX_ByteStringC(m_pszData, m_iDataLength));
+  }
+}
+
+// Reports the '>' that ends the start tag of the current tag.
+void CFX_SAXReader::NotifyBreak() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (pItem->m_eNode == CFX_SAXItem::Type::Tag)
+    m_pHandler->OnTagBreak(pItem->m_pNode);
+}
+
+// Reports a node closed by "/>" or "?>".
+void CFX_SAXReader::NotifyClose() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (pItem->m_eNode == CFX_SAXItem::Type::Tag ||
+      pItem->m_eNode == CFX_SAXItem::Type::Instruction) {
+    m_pHandler->OnTagClose(pItem->m_pNode, m_dwNodePos);
+  }
+}
+
+// Reports an end tag; the data buffer holds the name found in "</name>".
+void CFX_SAXReader::NotifyEnd() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (!pItem || pItem->m_eNode != CFX_SAXItem::Type::Tag)
+    return;
+
+  m_pHandler->OnTagEnd(pItem->m_pNode,
+                       CFX_ByteStringC(m_pszData, m_iDataLength), m_dwNodePos);
+}
+
+// Reports instruction data (taken from the name buffer) or comment text
+// (taken from the data buffer).
+void CFX_SAXReader::NotifyTargetData() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (pItem->m_eNode == CFX_SAXItem::Type::Instruction) {
+    m_pHandler->OnTargetData(pItem->m_pNode, pItem->m_eNode,
+                             CFX_ByteStringC(m_pszName, m_iNameLength),
+                             m_dwNodePos);
+  } else if (pItem->m_eNode == CFX_SAXItem::Type::Comment) {
+    m_pHandler->OnTargetData(pItem->m_pNode, pItem->m_eNode,
+                             CFX_ByteStringC(m_pszData, m_iDataLength),
+                             m_dwNodePos);
+  }
+}
+
+// Sets the skip flag on the current item; items pushed afterwards inherit it.
+void CFX_SAXReader::SkipCurrentNode() {
+  CFX_SAXItem* pItem = GetCurrentItem();
+  if (!pItem)
+    return;
+
+  pItem->m_bSkip = true;
+}
diff --git a/core/fxcrt/xml/cfx_saxreader.h b/core/fxcrt/xml/cfx_saxreader.h
new file mode 100644
index 0000000000..af3c26f94a
--- /dev/null
+++ b/core/fxcrt/xml/cfx_saxreader.h
@@ -0,0 +1,144 @@
+// Copyright 2014 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_SAXREADER_H_
+#define CORE_FXCRT_XML_CFX_SAXREADER_H_
+
+#include <memory>
+#include <stack>
+
+#include "core/fxcrt/fx_basic.h"
+
+class CFX_SAXCommentContext;
+class CFX_SAXContext;
+class CFX_SAXReaderHandler;
+enum class CFX_SaxMode;
+
+// One entry of the reader's node stack.
+class CFX_SAXItem {
+ public:
+  enum class Type {
+    Unknown = 0,
+    Instruction,
+    Declaration,
+    Comment,
+    Tag,
+    Text,
+    CharData,
+  };
+
+  explicit CFX_SAXItem(uint32_t id)
+      : m_pNode(nullptr), m_eNode(Type::Unknown), m_dwID(id), m_bSkip(false) {}
+
+  // Context returned by the handler's OnTagEnter(); not owned by the item.
+  CFX_SAXContext* m_pNode;
+  Type m_eNode;
+  const uint32_t m_dwID;
+  // Set by CFX_SAXReader::SkipCurrentNode() and copied to items pushed later.
+  bool m_bSkip;
+};
+
+// Block-wise reader over a byte range of a seekable stream.
+class CFX_SAXFile {
+ public:
+  CFX_SAXFile();
+  ~CFX_SAXFile();
+
+  // Attaches |pFile| and reads the first block. A |dwLen| of uint32_t(-1)
+  // means "up to the end of the stream". Returns false on an empty range or
+  // on a read failure.
+  bool StartFile(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
+                 uint32_t dwStart,
+                 uint32_t dwLen);
+  // Reads the block that starts at m_dwCur; false when nothing is left or the
+  // read fails.
+  bool ReadNextBlock();
+  // Frees the block buffer and releases the stream.
+  void Reset();
+
+  CFX_RetainPtr<IFX_SeekableReadStream> m_pFile;
+  uint32_t m_dwStart;     // First byte of the range.
+  uint32_t m_dwEnd;       // One past the last byte of the range.
+  uint32_t m_dwCur;       // Stream offset of the current block.
+  uint8_t* m_pBuf;        // Current block.
+  uint32_t m_dwBufSize;   // Number of valid bytes in m_pBuf.
+  uint32_t m_dwBufIndex;  // Read position inside m_pBuf.
+};
+
+// Bit flags for the |dwParseMode| argument of CFX_SAXReader::StartParse().
+// Each NotConvert flag leaves the corresponding entity unconverted in the
+// reported data; "sharp" covers numeric references ("&#...;").
+// NotSkipSpace keeps the bytes below 0x21 that lead a text run.
+enum CFX_SaxParseMode {
+  CFX_SaxParseMode_NotConvert_amp = 1 << 0,
+  CFX_SaxParseMode_NotConvert_lt = 1 << 1,
+  CFX_SaxParseMode_NotConvert_gt = 1 << 2,
+  CFX_SaxParseMode_NotConvert_apos = 1 << 3,
+  CFX_SaxParseMode_NotConvert_quot = 1 << 4,
+  CFX_SaxParseMode_NotConvert_sharp = 1 << 5,
+  CFX_SaxParseMode_NotSkipSpace = 1 << 6
+};
+
+// Byte-oriented, incremental SAX-style XML reader that reports what it finds
+// to a CFX_SAXReaderHandler.
+class CFX_SAXReader {
+ public:
+  CFX_SAXReader();
+  ~CFX_SAXReader();
+
+  // Prepares parsing of [dwStart, dwStart + dwLen) of |pFile|; the default
+  // |dwLen| means "to the end of the stream". Returns 0 on success and -1 on
+  // failure.
+  int32_t StartParse(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
+                     uint32_t dwStart = 0,
+                     uint32_t dwLen = static_cast<uint32_t>(-1),
+                     uint32_t dwParseMode = 0);
+  // Parses more input. Returns the progress in percent (100 when done) or a
+  // negative value on error.
+  int32_t ContinueParse(IFX_Pause* pPause = nullptr);
+  void SkipCurrentNode();
+  void SetHandler(CFX_SAXReaderHandler* pHandler) { m_pHandler = pHandler; }
+  void AppendData(uint8_t ch);
+  void AppendName(uint8_t ch);
+
+  // Per-state byte handlers. They are public because the dispatch table in
+  // the implementation file takes their addresses from outside the class.
+  void ParseText();
+  void ParseNodeStart();
+  void ParseDeclOrComment();
+  void ParseDeclNode();
+  void ParseComment();
+  void ParseCommentContent();
+  void ParseTagName();
+  void ParseTagAttributeName();
+  void ParseTagAttributeEqual();
+  void ParseTagAttributeValue();
+  void ParseMaybeClose();
+  void ParseTagClose();
+  void ParseTagEnd();
+  void ParseTargetData();
+
+ private:
+  void Reset();
+  void Push();
+  void Pop();
+  CFX_SAXItem* GetCurrentItem() const;
+  bool SkipSpace(uint8_t ch);
+  void SkipNode();
+  void NotifyData();
+  void NotifyEnter();
+  void NotifyAttribute();
+  void NotifyBreak();
+  void NotifyClose();
+  void NotifyEnd();
+  void NotifyTargetData();
+  void ReallocDataBuffer();
+  void ReallocNameBuffer();
+  void ParseChar(uint8_t ch);
+
+  CFX_SAXFile m_File;
+  CFX_SAXReaderHandler* m_pHandler;
+  int32_t m_iState;
+  std::stack<std::unique_ptr<CFX_SAXItem>> m_Stack;
+  uint32_t m_dwItemID;
+  CFX_SaxMode m_eMode;
+  CFX_SaxMode m_ePrevMode;
+  bool m_bCharData;
+  uint8_t m_CurByte;
+  uint32_t m_dwDataOffset;
+  std::stack<char> m_SkipStack;
+  uint8_t m_SkipChar;
+  uint32_t m_dwNodePos;
+  uint8_t* m_pszData;
+  int32_t m_iDataSize;
+  int32_t m_iDataLength;
+  int32_t m_iEntityStart;
+  int32_t m_iDataPos;
+  uint8_t* m_pszName;
+  int32_t m_iNameSize;
+  int32_t m_iNameLength;
+  uint32_t m_dwParseMode;
+  std::unique_ptr<CFX_SAXCommentContext> m_pCommentContext;
+};
+
+#endif  // CORE_FXCRT_XML_CFX_SAXREADER_H_
diff --git a/core/fxcrt/xml/cfx_saxreaderhandler.cpp b/core/fxcrt/xml/cfx_saxreaderhandler.cpp
new file mode 100644
index 0000000000..e7b6cd186c
--- /dev/null
+++ b/core/fxcrt/xml/cfx_saxreaderhandler.cpp
@@ -0,0 +1,128 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_saxreaderhandler.h"
+
+#include "core/fxcrt/cfx_checksumcontext.h"
+
+CFX_SAXReaderHandler::CFX_SAXReaderHandler(CFX_ChecksumContext* pContext)
+    : m_pContext(pContext) {
+  ASSERT(m_pContext);
+}
+
+CFX_SAXReaderHandler::~CFX_SAXReaderHandler() {}
+
+// Flushes any pending text, then starts serialising a tag or instruction as
+// "<name" or "<?name". Returns the shared context, or nullptr for any other
+// node type.
+CFX_SAXContext* CFX_SAXReaderHandler::OnTagEnter(
+    const CFX_ByteStringC& bsTagName,
+    CFX_SAXItem::Type eType,
+    uint32_t dwStartPos) {
+  UpdateChecksum(true);
+
+  const bool bInstruction = eType == CFX_SAXItem::Type::Instruction;
+  if (!bInstruction && eType != CFX_SAXItem::Type::Tag)
+    return nullptr;
+
+  m_SAXContext.m_eNode = eType;
+  m_SAXContext.m_TextBuf << "<";
+  if (bInstruction)
+    m_SAXContext.m_TextBuf << "?";
+
+  m_SAXContext.m_TextBuf << bsTagName;
+  m_SAXContext.m_bsTagName = bsTagName;
+  return &m_SAXContext;
+}
+
+// Serialises one attribute as ` name="value"`.
+void CFX_SAXReaderHandler::OnTagAttribute(CFX_SAXContext* pTag,
+                                          const CFX_ByteStringC& bsAttri,
+                                          const CFX_ByteStringC& bsValue) {
+  if (pTag)
+    pTag->m_TextBuf << " " << bsAttri << "=\"" << bsValue << "\"";
+}
+
+// Ends the start tag with ">" and feeds it to the checksum.
+void CFX_SAXReaderHandler::OnTagBreak(CFX_SAXContext* pTag) {
+  if (pTag) {
+    pTag->m_TextBuf << ">";
+    UpdateChecksum(false);
+  }
+}
+
+// Buffers text; character data is wrapped in "<![CDATA[" ... "]]>" again.
+void CFX_SAXReaderHandler::OnTagData(CFX_SAXContext* pTag,
+                                     CFX_SAXItem::Type eType,
+                                     const CFX_ByteStringC& bsData,
+                                     uint32_t dwStartPos) {
+  if (!pTag)
+    return;
+
+  const bool bCharData = eType == CFX_SAXItem::Type::CharData;
+  CFX_ByteTextBuf& buf = pTag->m_TextBuf;
+  if (bCharData)
+    buf << "<![CDATA[";
+
+  buf << bsData;
+  if (bCharData)
+    buf << "]]>";
+}
+
+// Finishes a node that was closed in place: "?>" for an instruction,
+// "></name>" for a tag.
+void CFX_SAXReaderHandler::OnTagClose(CFX_SAXContext* pTag, uint32_t dwEndPos) {
+  if (!pTag)
+    return;
+
+  switch (pTag->m_eNode) {
+    case CFX_SAXItem::Type::Instruction:
+      pTag->m_TextBuf << "?>";
+      break;
+    case CFX_SAXItem::Type::Tag:
+      pTag->m_TextBuf << "></" << pTag->m_bsTagName.AsStringC() << ">";
+      break;
+    default:
+      break;
+  }
+  UpdateChecksum(false);
+}
+
+// Serialises an end tag as "</name>" and feeds it to the checksum.
+void CFX_SAXReaderHandler::OnTagEnd(CFX_SAXContext* pTag,
+                                    const CFX_ByteStringC& bsTagName,
+                                    uint32_t dwEndPos) {
+  if (pTag) {
+    pTag->m_TextBuf << "</" << bsTagName << ">";
+    UpdateChecksum(false);
+  }
+}
+
+// Comments are written as "<!--" ... "-->" and checksummed right away, with
+// or without a context; instruction data is appended to the pending tag text.
+void CFX_SAXReaderHandler::OnTargetData(CFX_SAXContext* pTag,
+                                        CFX_SAXItem::Type eType,
+                                        const CFX_ByteStringC& bsData,
+                                        uint32_t dwStartPos) {
+  if (eType == CFX_SAXItem::Type::Comment) {
+    m_SAXContext.m_TextBuf << "<!--" << bsData << "-->";
+    UpdateChecksum(false);
+    return;
+  }
+  if (!pTag)
+    return;
+
+  pTag->m_TextBuf << " " << bsData;
+}
+
+// Feeds the buffered text to the checksum context and clears the buffer. With
+// |bCheckSpace| set, text that holds no byte above 0x20 is dropped instead.
+void CFX_SAXReaderHandler::UpdateChecksum(bool bCheckSpace) {
+  const int32_t iLength = m_SAXContext.m_TextBuf.GetLength();
+  if (iLength < 1)
+    return;
+
+  uint8_t* pBuffer = m_SAXContext.m_TextBuf.GetBuffer();
+  bool bHasContent = !bCheckSpace;
+  for (int32_t i = 0; !bHasContent && i < iLength; ++i)
+    bHasContent = pBuffer[i] > 0x20;
+
+  if (bHasContent)
+    m_pContext->Update(CFX_ByteStringC(pBuffer, iLength));
+
+  m_SAXContext.m_TextBuf.Clear();
+}
diff --git a/core/fxcrt/xml/cfx_saxreaderhandler.h b/core/fxcrt/xml/cfx_saxreaderhandler.h
new file mode 100644
index 0000000000..bfedf03846
--- /dev/null
+++ b/core/fxcrt/xml/cfx_saxreaderhandler.h
@@ -0,0 +1,49 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_SAXREADERHANDLER_H_
+#define CORE_FXCRT_XML_CFX_SAXREADERHANDLER_H_
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_saxcontext.h"
+#include "core/fxcrt/xml/cfx_saxreader.h"
+
+class CFX_ChecksumContext;
+
+// Receives the callbacks of CFX_SAXReader, re-serialises the reported nodes
+// into a text buffer and feeds that text to a CFX_ChecksumContext.
+class CFX_SAXReaderHandler {
+ public:
+  // |pContext| must not be null (asserted in the constructor); it is stored
+  // as a raw pointer.
+  explicit CFX_SAXReaderHandler(CFX_ChecksumContext* pContext);
+  ~CFX_SAXReaderHandler();
+
+  // Returns the context to pass back in the other callbacks, or nullptr when
+  // |eType| is neither Tag nor Instruction.
+  CFX_SAXContext* OnTagEnter(const CFX_ByteStringC& bsTagName,
+                             CFX_SAXItem::Type eType,
+                             uint32_t dwStartPos);
+  // The callbacks below do nothing when |pTag| is null, except OnTargetData()
+  // for comments.
+  void OnTagAttribute(CFX_SAXContext* pTag,
+                      const CFX_ByteStringC& bsAttri,
+                      const CFX_ByteStringC& bsValue);
+  void OnTagBreak(CFX_SAXContext* pTag);
+  void OnTagData(CFX_SAXContext* pTag,
+                 CFX_SAXItem::Type eType,
+                 const CFX_ByteStringC& bsData,
+                 uint32_t dwStartPos);
+  void OnTagClose(CFX_SAXContext* pTag, uint32_t dwEndPos);
+  void OnTagEnd(CFX_SAXContext* pTag,
+                const CFX_ByteStringC& bsTagName,
+                uint32_t dwEndPos);
+
+  void OnTargetData(CFX_SAXContext* pTag,
+                    CFX_SAXItem::Type eType,
+                    const CFX_ByteStringC& bsData,
+                    uint32_t dwStartPos);
+
+ private:
+  // Sends the buffered text to the checksum context and clears the buffer;
+  // with |bCheckSpace| set, text without any byte above 0x20 is discarded.
+  void UpdateChecksum(bool bCheckSpace);
+
+  CFX_ChecksumContext* m_pContext;
+  // Single context instance handed out by OnTagEnter() for every node.
+  CFX_SAXContext m_SAXContext;
+};
+
+#endif  // CORE_FXCRT_XML_CFX_SAXREADERHANDLER_H_
|