summaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorDan Sinclair <dsinclair@chromium.org>2018-04-12 13:15:39 +0000
committerChromium commit bot <commit-bot@chromium.org>2018-04-12 13:15:39 +0000
commit332139df2c3c0826069fa61bcd436309fcdf5a6f (patch)
tree404ec9e20810029fc132ea0c7cdcbd2eb39aa558 /core
parent6d503b875e6f75f0d8b5f29fcf811a89f12ad12d (diff)
downloadpdfium-332139df2c3c0826069fa61bcd436309fcdf5a6f.tar.xz
Merge CFX_XMLParser and CFX_XMLSyntaxParser
The CFX_XMLParser was a wrapper around the CFX_XMLSyntaxParser. This CL merges the SyntaxParser into protected/private methods of the XMLParser. Change-Id: If1519b5de55866ed14359dffd64dc12c36ee0244 Reviewed-on: https://pdfium-review.googlesource.com/30171 Reviewed-by: Ryan Harrison <rharrison@chromium.org> Commit-Queue: dsinclair <dsinclair@chromium.org>
Diffstat (limited to 'core')
-rw-r--r--core/fxcrt/xml/cfx_xmlparser.cpp709
-rw-r--r--core/fxcrt/xml/cfx_xmlparser.h112
-rw-r--r--core/fxcrt/xml/cfx_xmlparser_unittest.cpp (renamed from core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp)130
-rw-r--r--core/fxcrt/xml/cfx_xmlsyntaxparser.cpp694
-rw-r--r--core/fxcrt/xml/cfx_xmlsyntaxparser.h130
5 files changed, 887 insertions, 888 deletions
diff --git a/core/fxcrt/xml/cfx_xmlparser.cpp b/core/fxcrt/xml/cfx_xmlparser.cpp
index 76ea32df6c..39196d0264 100644
--- a/core/fxcrt/xml/cfx_xmlparser.cpp
+++ b/core/fxcrt/xml/cfx_xmlparser.cpp
@@ -6,6 +6,12 @@
#include "core/fxcrt/xml/cfx_xmlparser.h"
+#include <algorithm>
+#include <cwctype>
+#include <iterator>
+
+#include "core/fxcrt/fx_extension.h"
+#include "core/fxcrt/fx_safe_types.h"
#include "core/fxcrt/xml/cfx_xmlchardata.h"
#include "core/fxcrt/xml/cfx_xmlelement.h"
#include "core/fxcrt/xml/cfx_xmlinstruction.h"
@@ -13,13 +19,115 @@
#include "core/fxcrt/xml/cfx_xmltext.h"
#include "third_party/base/ptr_util.h"
+namespace {
+
+// Upper bound applied to numeric character references; the parser replaces
+// larger values with a space.
+constexpr uint32_t kMaxCharRange = 0x10ffff;
+
+// Returns true when |ch| is a space, line feed, carriage return or tab.
+bool IsXMLWhiteSpace(wchar_t ch) {
+  return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09;
+}
+
+// Inclusive character range [wStart, wEnd] that may appear in an XML name.
+// |bStartChar| is true when characters in the range may also begin a name.
+struct FX_XMLNAMECHAR {
+  uint16_t wStart;
+  uint16_t wEnd;
+  bool bStartChar;
+};
+
+// Ranges are listed in ascending order; IsXMLNameChar() binary-searches this
+// table on |wEnd|, so new entries must keep it sorted.
+const FX_XMLNAMECHAR g_XMLNameChars[] = {
+    {L'-', L'.', false},    {L'0', L'9', false},     {L':', L':', false},
+    {L'A', L'Z', true},     {L'_', L'_', true},      {L'a', L'z', true},
+    {0xB7, 0xB7, false},    {0xC0, 0xD6, true},      {0xD8, 0xF6, true},
+    {0xF8, 0x02FF, true},   {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
+    {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true},  {0x203F, 0x2040, false},
+    {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true},  {0x3001, 0xD7FF, true},
+    {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
+};
+
+// Returns how many bytes (1-6) the value |unicode| is counted as when sizing
+// a UTF-8 encoding. The 5 and 6 byte cases cover values of 0x200000 and up.
+int32_t UTF8EncodedSize(uint32_t unicode) {
+  if (unicode < 0x80)
+    return 1;
+  if (unicode < 0x800)
+    return 2;
+  if (unicode < 0x10000)
+    return 3;
+  if (unicode < 0x200000)
+    return 4;
+  if (unicode < 0x4000000)
+    return 5;
+  return 6;
+}
+
+// Returns the total UTF-8 byte count for the first |iSrcLen| elements of
+// |src|. A non-positive |iSrcLen| yields 0. |iSrcLen| is clamped to
+// src.size() so a stale or overshooting length can never read past the end
+// of the vector.
+int32_t GetUTF8EncodeLength(const std::vector<wchar_t>& src,
+                            FX_FILESIZE iSrcLen) {
+  if (iSrcLen <= 0)
+    return 0;
+
+  // Compare in 64 bits so a wide FX_FILESIZE is not truncated before the
+  // clamp is applied.
+  size_t count = src.size();
+  if (static_cast<uint64_t>(iSrcLen) < count)
+    count = static_cast<size_t>(iSrcLen);
+
+  int32_t iDstNum = 0;
+  for (size_t i = 0; i < count; ++i)
+    iDstNum += UTF8EncodedSize(static_cast<uint32_t>(src[i]));
+  return iDstNum;
+}
+
+}  // namespace
+
+// static
+// Reports whether |ch| falls inside one of the g_XMLNameChars ranges. When
+// |bFirstChar| is true, only ranges flagged as name-start ranges qualify.
+bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
+  const auto* const table_end = std::end(g_XMLNameChars);
+
+  // Locate the first range whose upper bound is not below |ch|.
+  const auto* candidate = std::lower_bound(
+      std::begin(g_XMLNameChars), table_end, ch,
+      [](const FX_XMLNAMECHAR& range, wchar_t value) {
+        return range.wEnd < value;
+      });
+
+  if (candidate == table_end)
+    return false;
+  if (ch < candidate->wStart)
+    return false;
+  return candidate->bStartChar || !bFirstChar;
+}
+
CFX_XMLParser::CFX_XMLParser(CFX_XMLNode* pParent,
const RetainPtr<CFX_SeekableStreamProxy>& pStream)
- : m_pParser(pdfium::MakeUnique<CFX_XMLSyntaxParser>(pStream)),
- m_pParent(pParent),
- m_pChild(nullptr) {
- ASSERT(m_pParent && pStream);
+ : m_pParent(pParent),
+ m_pChild(nullptr),
+ m_pStream(pStream),
+ m_iXMLPlaneSize(1024),
+ m_iCurrentPos(0),
+ m_iCurrentNodeNum(-1),
+ m_iLastNodeNum(-1),
+ m_iParsedBytes(0),
+ m_ParsedChars(0),
+ m_iBufferChars(0),
+ m_bEOS(false),
+ m_Start(0),
+ m_End(0),
+ m_iAllocStep(m_BlockBuffer.GetAllocStep()),
+ m_pCurrentBlock(nullptr),
+ m_iIndexInBlock(0),
+ m_iTextDataLength(0),
+ m_syntaxParserResult(FX_XmlSyntaxResult::None),
+ m_syntaxParserState(FDE_XmlSyntaxState::Text),
+ m_wQuotationMark(0),
+ m_iEntityStart(-1) {
+ ASSERT(m_pParent);
+ ASSERT(pStream);
+
m_NodeStack.push(m_pParent);
+
+ m_CurNode.iNodeNum = -1;
+ m_CurNode.eNodeType = FX_XMLNODE_Unknown;
+
+ m_iXMLPlaneSize =
+ std::min(m_iXMLPlaneSize,
+ pdfium::base::checked_cast<size_t>(m_pStream->GetLength()));
+ m_iCurrentPos = m_pStream->GetBOMLength();
+
+ FX_SAFE_SIZE_T alloc_size_safe = m_iXMLPlaneSize;
+ alloc_size_safe += 1; // For NUL.
+ if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return;
+ }
+
+ m_Buffer.resize(pdfium::base::ValueOrDieForType<size_t>(alloc_size_safe));
+
+ m_BlockBuffer.InitBuffer();
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
}
CFX_XMLParser::~CFX_XMLParser() {}
@@ -27,7 +135,7 @@ CFX_XMLParser::~CFX_XMLParser() {}
int32_t CFX_XMLParser::Parse() {
int32_t iCount = 0;
while (true) {
- FX_XmlSyntaxResult result = m_pParser->DoSyntaxParse();
+ FX_XmlSyntaxResult result = DoSyntaxParse();
if (result == FX_XmlSyntaxResult::Error)
return -1;
if (result == FX_XmlSyntaxResult::EndOfString)
@@ -44,7 +152,7 @@ int32_t CFX_XMLParser::Parse() {
if (m_pChild->GetType() != FX_XMLNODE_Element)
return -1;
- m_ws1 = m_pParser->GetTagName();
+ m_ws1 = GetTagName();
if (m_ws1.GetLength() > 0 &&
m_ws1 != static_cast<CFX_XMLElement*>(m_pChild)->GetName()) {
return -1;
@@ -60,7 +168,7 @@ int32_t CFX_XMLParser::Parse() {
iCount++;
break;
case FX_XmlSyntaxResult::TargetName:
- m_ws1 = m_pParser->GetTargetName();
+ m_ws1 = GetTargetName();
if (m_ws1 == L"originalXFAVersion" || m_ws1 == L"acrobat") {
m_pChild = new CFX_XMLInstruction(m_ws1);
m_pParent->AppendChild(m_pChild);
@@ -70,30 +178,30 @@ int32_t CFX_XMLParser::Parse() {
m_ws1.clear();
break;
case FX_XmlSyntaxResult::TagName:
- m_ws1 = m_pParser->GetTagName();
+ m_ws1 = GetTagName();
m_pChild = new CFX_XMLElement(m_ws1);
m_pParent->AppendChild(m_pChild);
m_NodeStack.push(m_pChild);
m_pParent = m_pChild;
break;
case FX_XmlSyntaxResult::AttriName:
- m_ws1 = m_pParser->GetAttributeName();
+ m_ws1 = GetAttributeName();
break;
case FX_XmlSyntaxResult::AttriValue:
if (m_pChild && m_pChild->GetType() == FX_XMLNODE_Element) {
- static_cast<CFX_XMLElement*>(m_pChild)->SetString(
- m_ws1, m_pParser->GetAttributeName());
+ static_cast<CFX_XMLElement*>(m_pChild)->SetString(m_ws1,
+ GetAttributeName());
}
m_ws1.clear();
break;
case FX_XmlSyntaxResult::Text:
- m_ws1 = m_pParser->GetTextData();
+ m_ws1 = GetTextData();
m_pChild = new CFX_XMLText(m_ws1);
m_pParent->AppendChild(m_pChild);
m_pChild = m_pParent;
break;
case FX_XmlSyntaxResult::CData:
- m_ws1 = m_pParser->GetTextData();
+ m_ws1 = GetTextData();
m_pChild = new CFX_XMLCharData(m_ws1);
m_pParent->AppendChild(m_pChild);
m_pChild = m_pParent;
@@ -107,7 +215,7 @@ int32_t CFX_XMLParser::Parse() {
if (!m_ws1.IsEmpty())
instruction->AppendData(m_ws1);
- instruction->AppendData(m_pParser->GetTargetData());
+ instruction->AppendData(GetTargetData());
}
m_ws1.clear();
break;
@@ -118,5 +226,576 @@ int32_t CFX_XMLParser::Parse() {
break;
}
}
- return m_NodeStack.size() != 1 ? -1 : m_pParser->GetStatus();
+ return m_NodeStack.size() != 1 ? -1 : GetStatus();
+}
+
+// Advances the tokenizer over the stream until the next syntax event is
+// available and returns it. Once Error or EndOfString has been latched in
+// |m_syntaxParserResult|, every later call returns that same value.
+//
+// The text belonging to an event (tag name, attribute value, ...) is left in
+// |m_BlockBuffer| with its length in |m_iTextDataLength|, which is what the
+// Get*() accessors read.
+//
+// NOTE(review): the FXSYS_wcsnicmp() look-aheads below ("--", "[CDATA[",
+// "]]>", "-->") compare at |m_Start| without checking |m_End|, so a marker
+// split across two buffer refills is compared against whatever the buffer
+// holds past |m_End| — confirm whether that needs handling.
+FX_XmlSyntaxResult CFX_XMLParser::DoSyntaxParse() {
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::Error ||
+      m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) {
+    return m_syntaxParserResult;
+  }
+
+  // Latches a fatal error so that GetStatus() and later calls observe it.
+  auto fail = [this]() {
+    m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+    return m_syntaxParserResult;
+  };
+
+  // Appends |c| to the token buffer, fetching the next block when the
+  // current one is full. Returns false when no block is available.
+  auto append = [this](wchar_t c) -> bool {
+    if (m_iIndexInBlock == m_iAllocStep) {
+      std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+          m_BlockBuffer.GetAvailableBlock();
+      if (!m_pCurrentBlock)
+        return false;
+    }
+    m_pCurrentBlock[m_iIndexInBlock++] = c;
+    m_BlockBuffer.IncrementDataLength();
+    return true;
+  };
+
+  // Resets the token buffer and re-fetches the write position.
+  auto reset_buffer = [this]() {
+    m_BlockBuffer.Reset(true);
+    std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+        m_BlockBuffer.GetAvailableBlock();
+  };
+
+  // Records the length of the token just completed, then resets the buffer.
+  auto finish_token = [this, &reset_buffer]() {
+    m_iTextDataLength = m_BlockBuffer.GetDataLength();
+    reset_buffer();
+  };
+
+  // Pops the innermost open node and restores |m_CurNode| from the new top
+  // of the stack (or to "unknown" when the stack becomes empty).
+  auto pop_node = [this]() {
+    m_XMLNodeStack.pop();
+    if (!m_XMLNodeStack.empty()) {
+      m_CurNode = m_XMLNodeStack.top();
+    } else {
+      m_CurNode.iNodeNum = -1;
+      m_CurNode.eNodeType = FX_XMLNODE_Unknown;
+    }
+    m_iCurrentNodeNum = m_CurNode.iNodeNum;
+  };
+
+  const FX_FILESIZE iStreamLength = m_pStream->GetLength();
+
+  FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None;
+  while (true) {
+    // Refill |m_Buffer| once every buffered character has been consumed.
+    if (m_Start >= m_End) {
+      if (m_bEOS || m_iCurrentPos >= iStreamLength) {
+        m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
+        return m_syntaxParserResult;
+      }
+      m_ParsedChars += m_End;
+      m_iParsedBytes = m_iCurrentPos;
+      if (m_pStream->GetPosition() != m_iCurrentPos)
+        m_pStream->Seek(CFX_SeekableStreamProxy::From::Begin, m_iCurrentPos);
+
+      m_iBufferChars =
+          m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS);
+      const FX_FILESIZE iPos = m_pStream->GetPosition();
+      if (m_iBufferChars < 1) {
+        m_iCurrentPos = iStreamLength;
+        m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
+        return m_syntaxParserResult;
+      }
+      m_iCurrentPos = iPos;
+      m_Start = 0;
+      m_End = m_iBufferChars;
+    }
+
+    while (m_Start < m_End) {
+      wchar_t ch = m_Buffer[m_Start];
+      switch (m_syntaxParserState) {
+        case FDE_XmlSyntaxState::Text:
+          if (ch == L'<') {
+            if (!m_BlockBuffer.IsEmpty()) {
+              // Emit the pending text first; '<' is not consumed, so it is
+              // seen again on the next call with an empty buffer.
+              finish_token();
+              m_iEntityStart = -1;
+              syntaxParserResult = FX_XmlSyntaxResult::Text;
+            } else {
+              m_Start++;
+              m_syntaxParserState = FDE_XmlSyntaxState::Node;
+            }
+          } else {
+            ParseTextChar(ch);
+            // ParseTextChar() cannot report failure; a null block pointer
+            // means the character could not be stored.
+            if (!m_pCurrentBlock)
+              return fail();
+          }
+          break;
+        case FDE_XmlSyntaxState::Node:
+          if (ch == L'!') {
+            m_Start++;
+            m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl;
+          } else if (ch == L'/') {
+            m_Start++;
+            m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
+          } else if (ch == L'?') {
+            m_iLastNodeNum++;
+            m_iCurrentNodeNum = m_iLastNodeNum;
+            m_CurNode.iNodeNum = m_iLastNodeNum;
+            m_CurNode.eNodeType = FX_XMLNODE_Instruction;
+            m_XMLNodeStack.push(m_CurNode);
+            m_Start++;
+            m_syntaxParserState = FDE_XmlSyntaxState::Target;
+            syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen;
+          } else {
+            // Start of an element; |ch| is left for the Tag state.
+            m_iLastNodeNum++;
+            m_iCurrentNodeNum = m_iLastNodeNum;
+            m_CurNode.iNodeNum = m_iLastNodeNum;
+            m_CurNode.eNodeType = FX_XMLNODE_Element;
+            m_XMLNodeStack.push(m_CurNode);
+            m_syntaxParserState = FDE_XmlSyntaxState::Tag;
+            syntaxParserResult = FX_XmlSyntaxResult::ElementOpen;
+          }
+          break;
+        case FDE_XmlSyntaxState::Target:
+        case FDE_XmlSyntaxState::Tag:
+          if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+            // A name must contain at least one character.
+            if (m_BlockBuffer.IsEmpty())
+              return fail();
+
+            finish_token();
+            if (m_syntaxParserState != FDE_XmlSyntaxState::Target)
+              syntaxParserResult = FX_XmlSyntaxResult::TagName;
+            else
+              syntaxParserResult = FX_XmlSyntaxResult::TargetName;
+
+            m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
+          } else {
+            if (!append(ch))
+              return fail();
+            m_Start++;
+          }
+          break;
+        case FDE_XmlSyntaxState::AttriName:
+          if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) {
+            m_Start++;
+            break;
+          }
+          if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+            if (m_BlockBuffer.IsEmpty()) {
+              if (m_CurNode.eNodeType == FX_XMLNODE_Element) {
+                if (ch == L'>' || ch == L'/') {
+                  m_syntaxParserState = FDE_XmlSyntaxState::BreakElement;
+                  break;
+                }
+              } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+                if (ch == L'?') {
+                  m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
+                  m_Start++;
+                } else {
+                  m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+                }
+                break;
+              }
+              return fail();
+            } else {
+              if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+                if (ch != '=' && !IsXMLWhiteSpace(ch)) {
+                  m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+                  break;
+                }
+              }
+              finish_token();
+              m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign;
+              syntaxParserResult = FX_XmlSyntaxResult::AttriName;
+            }
+          } else {
+            if (!append(ch))
+              return fail();
+            m_Start++;
+          }
+          break;
+        case FDE_XmlSyntaxState::AttriEqualSign:
+          if (IsXMLWhiteSpace(ch)) {
+            m_Start++;
+            break;
+          }
+          if (ch != L'=') {
+            if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+              m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+              break;
+            }
+            return fail();
+          } else {
+            m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation;
+            m_Start++;
+          }
+          break;
+        case FDE_XmlSyntaxState::AttriQuotation:
+          if (IsXMLWhiteSpace(ch)) {
+            m_Start++;
+            break;
+          }
+          if (ch != L'\"' && ch != L'\'') {
+            return fail();
+          } else {
+            m_wQuotationMark = ch;
+            m_syntaxParserState = FDE_XmlSyntaxState::AttriValue;
+            m_Start++;
+          }
+          break;
+        case FDE_XmlSyntaxState::AttriValue:
+          if (ch == m_wQuotationMark) {
+            // A closing quote while an entity ('&' without ';') is still
+            // open is an error.
+            if (m_iEntityStart > -1)
+              return fail();
+
+            finish_token();
+            m_wQuotationMark = 0;
+            m_Start++;
+            m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
+            syntaxParserResult = FX_XmlSyntaxResult::AttriValue;
+          } else {
+            ParseTextChar(ch);
+            if (!m_pCurrentBlock)
+              return fail();
+          }
+          break;
+        case FDE_XmlSyntaxState::CloseInstruction:
+          if (ch != L'>') {
+            // NOTE(review): |ch| is appended here but not consumed, so the
+            // TargetData state processes the same character again — confirm
+            // this is intended.
+            if (!append(ch))
+              return fail();
+            m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+          } else if (!m_BlockBuffer.IsEmpty()) {
+            // Flush pending target data; '>' is handled on the next call.
+            finish_token();
+            syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+          } else {
+            m_Start++;
+            if (m_XMLNodeStack.empty())
+              return fail();
+
+            pop_node();
+            reset_buffer();
+            m_syntaxParserState = FDE_XmlSyntaxState::Text;
+            syntaxParserResult = FX_XmlSyntaxResult::InstructionClose;
+          }
+          break;
+        case FDE_XmlSyntaxState::BreakElement:
+          if (ch == L'>') {
+            m_syntaxParserState = FDE_XmlSyntaxState::Text;
+            syntaxParserResult = FX_XmlSyntaxResult::ElementBreak;
+          } else if (ch == L'/') {
+            m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
+          } else {
+            return fail();
+          }
+          m_Start++;
+          break;
+        case FDE_XmlSyntaxState::CloseElement:
+          if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+            if (ch == L'>') {
+              if (m_XMLNodeStack.empty())
+                return fail();
+
+              pop_node();
+              finish_token();
+              m_syntaxParserState = FDE_XmlSyntaxState::Text;
+              syntaxParserResult = FX_XmlSyntaxResult::ElementClose;
+            } else if (!IsXMLWhiteSpace(ch)) {
+              return fail();
+            }
+          } else {
+            if (!append(ch))
+              return fail();
+          }
+          m_Start++;
+          break;
+        case FDE_XmlSyntaxState::SkipCommentOrDecl:
+          if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) {
+            m_Start += 2;
+            m_syntaxParserState = FDE_XmlSyntaxState::SkipComment;
+          } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) ==
+                     0) {
+            m_Start += 7;
+            m_syntaxParserState = FDE_XmlSyntaxState::SkipCData;
+          } else {
+            // Any other "<!" construct is skipped up to its matching '>'.
+            m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode;
+            m_SkipChar = L'>';
+            m_SkipStack.push(L'>');
+          }
+          break;
+        case FDE_XmlSyntaxState::SkipCData: {
+          if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) {
+            m_Start += 3;
+            syntaxParserResult = FX_XmlSyntaxResult::CData;
+            finish_token();
+            m_syntaxParserState = FDE_XmlSyntaxState::Text;
+          } else {
+            if (!append(ch))
+              return fail();
+            m_Start++;
+          }
+          break;
+        }
+        case FDE_XmlSyntaxState::SkipDeclNode:
+          if (m_SkipChar == L'\'' || m_SkipChar == L'\"') {
+            // Inside a quoted string: ignore everything up to the closing
+            // quote.
+            m_Start++;
+            if (ch != m_SkipChar)
+              break;
+
+            m_SkipStack.pop();
+            if (m_SkipStack.empty())
+              m_syntaxParserState = FDE_XmlSyntaxState::Text;
+            else
+              m_SkipChar = m_SkipStack.top();
+          } else {
+            // Track nested delimiters; |m_SkipChar| is the closer expected
+            // for the innermost open one.
+            switch (ch) {
+              case L'<':
+                m_SkipChar = L'>';
+                m_SkipStack.push(L'>');
+                break;
+              case L'[':
+                m_SkipChar = L']';
+                m_SkipStack.push(L']');
+                break;
+              case L'(':
+                m_SkipChar = L')';
+                m_SkipStack.push(L')');
+                break;
+              case L'\'':
+                m_SkipChar = L'\'';
+                m_SkipStack.push(L'\'');
+                break;
+              case L'\"':
+                m_SkipChar = L'\"';
+                m_SkipStack.push(L'\"');
+                break;
+              default:
+                if (ch == m_SkipChar) {
+                  m_SkipStack.pop();
+                  if (m_SkipStack.empty()) {
+                    finish_token();
+                    m_syntaxParserState = FDE_XmlSyntaxState::Text;
+                  } else {
+                    m_SkipChar = m_SkipStack.top();
+                  }
+                }
+                break;
+            }
+            if (!m_SkipStack.empty()) {
+              if (!append(ch))
+                return fail();
+            }
+            m_Start++;
+          }
+          break;
+        case FDE_XmlSyntaxState::SkipComment:
+          if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) {
+            // Two here plus the unconditional increment below skip "-->".
+            m_Start += 2;
+            m_syntaxParserState = FDE_XmlSyntaxState::Text;
+          }
+
+          m_Start++;
+          break;
+        case FDE_XmlSyntaxState::TargetData:
+          if (IsXMLWhiteSpace(ch)) {
+            if (m_BlockBuffer.IsEmpty()) {
+              m_Start++;
+              break;
+            }
+            // Outside quotes, whitespace terminates the current datum.
+            if (m_wQuotationMark == 0) {
+              finish_token();
+              m_wQuotationMark = 0;
+              m_Start++;
+              syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+              break;
+            }
+          }
+          if (ch == '?') {
+            m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
+            m_Start++;
+          } else if (ch == '\"') {
+            if (m_wQuotationMark == 0) {
+              m_wQuotationMark = ch;
+              m_Start++;
+            } else if (ch == m_wQuotationMark) {
+              finish_token();
+              m_wQuotationMark = 0;
+              m_Start++;
+              syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+            } else {
+              return fail();
+            }
+          } else {
+            if (!append(ch))
+              return fail();
+            m_Start++;
+          }
+          break;
+        default:
+          break;
+      }
+      if (syntaxParserResult != FX_XmlSyntaxResult::None)
+        return syntaxParserResult;
+    }
+  }
+  // Not reached: the loop above only exits through a return.
+  return FX_XmlSyntaxResult::Text;
+}
+
+// Reports parse progress.
+// Returns -1 when there is no stream or the tokenizer has failed, 100 when
+// the stream is empty or fully consumed, and otherwise the percentage of the
+// stream's bytes that had been consumed before the current buffer was read.
+int32_t CFX_XMLParser::GetStatus() const {
+  if (!m_pStream)
+    return -1;
+
+  // Keep the stream's own width; narrowing to int32_t would misreport large
+  // streams.
+  const FX_FILESIZE iStreamLength = m_pStream->GetLength();
+  if (iStreamLength < 1)
+    return 100;
+
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::Error)
+    return -1;
+
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString)
+    return 100;
+
+  // Widen before multiplying: |m_iParsedBytes| * 100 overflows int32_t once
+  // more than roughly 21 MB has been parsed.
+  const int64_t percent =
+      static_cast<int64_t>(m_iParsedBytes) * 100 / iStreamLength;
+  return static_cast<int32_t>(percent);
+}
+
+// Byte position reached so far: the bytes consumed before the current buffer
+// was read plus the UTF-8 encoded size of the |m_Start| characters already
+// taken from it. Yields 0 when there is no stream.
+FX_FILESIZE CFX_XMLParser::GetCurrentBinaryPos() const {
+  return m_pStream ? m_iParsedBytes + GetUTF8EncodeLength(m_Buffer, m_Start)
+                   : 0;
+}
+
+// Appends |character| to the token buffer and consumes it from the input
+// (|m_Start| is advanced). Also tracks entity references: '&' records where
+// an entity starts and the matching ';' replaces the whole "&...;" run in
+// the buffer with the decoded character.
+//
+// Numeric references ("&#NNN;" / "&#xHHH;") stop accumulating as soon as the
+// value exceeds kMaxCharRange, so an over-long reference can no longer wrap
+// around uint32_t and decode to an unrelated character; such references
+// decode to a space. Digits are matched against explicit ASCII ranges so the
+// result does not depend on the C library's wide-character classification.
+// Unrecognised named entities, and numeric references that decode to 0, are
+// removed from the buffer without a replacement.
+void CFX_XMLParser::ParseTextChar(wchar_t character) {
+  if (m_iIndexInBlock == m_iAllocStep) {
+    std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+        m_BlockBuffer.GetAvailableBlock();
+    // NOTE(review): this returns without consuming the character and has no
+    // way to signal the failure; callers would need to check
+    // |m_pCurrentBlock| themselves.
+    if (!m_pCurrentBlock)
+      return;
+  }
+
+  m_pCurrentBlock[m_iIndexInBlock++] = character;
+  m_BlockBuffer.IncrementDataLength();
+  if (m_iEntityStart > -1 && character == L';') {
+    // Text between '&' and ';', exclusive of both.
+    WideString csEntity = m_BlockBuffer.GetTextData(
+        m_iEntityStart + 1,
+        m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1);
+    int32_t iLen = csEntity.GetLength();
+    if (iLen > 0) {
+      if (csEntity[0] == L'#') {
+        uint32_t ch = 0;
+        if (iLen > 1 && csEntity[1] == L'x') {
+          // |ch| <= kMaxCharRange before each step, so the shift cannot
+          // overflow 32 bits.
+          for (int32_t i = 2; i < iLen && ch <= kMaxCharRange; i++) {
+            const wchar_t w = csEntity[i];
+            uint32_t digit = 0;
+            if (w >= L'0' && w <= L'9')
+              digit = w - L'0';
+            else if (w >= L'A' && w <= L'F')
+              digit = w - L'A' + 10;
+            else if (w >= L'a' && w <= L'f')
+              digit = w - L'a' + 10;
+            else
+              break;
+            ch = (ch << 4) + digit;
+          }
+        } else {
+          for (int32_t i = 1; i < iLen && ch <= kMaxCharRange; i++) {
+            const wchar_t w = csEntity[i];
+            if (w < L'0' || w > L'9')
+              break;
+            ch = ch * 10 + (w - L'0');
+          }
+        }
+        if (ch > kMaxCharRange)
+          ch = ' ';
+
+        character = static_cast<wchar_t>(ch);
+        if (character != 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, character);
+          m_iEntityStart++;
+        }
+      } else {
+        if (csEntity.Compare(L"amp") == 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, L'&');
+          m_iEntityStart++;
+        } else if (csEntity.Compare(L"lt") == 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, L'<');
+          m_iEntityStart++;
+        } else if (csEntity.Compare(L"gt") == 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, L'>');
+          m_iEntityStart++;
+        } else if (csEntity.Compare(L"apos") == 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, L'\'');
+          m_iEntityStart++;
+        } else if (csEntity.Compare(L"quot") == 0) {
+          m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"');
+          m_iEntityStart++;
+        }
+      }
+    }
+    // Drop what is left of the raw "&...;" text after the (optional)
+    // replacement character.
+    if (m_iEntityStart >= 0 &&
+        m_BlockBuffer.GetDataLength() > static_cast<size_t>(m_iEntityStart)) {
+      m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() -
+                                    m_iEntityStart);
+    }
+    // The buffer shrank, so the write position must be re-fetched.
+    std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+        m_BlockBuffer.GetAvailableBlock();
+    m_iEntityStart = -1;
+  } else if (m_iEntityStart < 0 && character == L'&') {
+    // Remember the buffer index of the '&' that was just appended.
+    m_iEntityStart = m_BlockBuffer.GetDataLength() - 1;
+  }
+  m_Start++;
}
diff --git a/core/fxcrt/xml/cfx_xmlparser.h b/core/fxcrt/xml/cfx_xmlparser.h
index 8c4c354699..c7587e9151 100644
--- a/core/fxcrt/xml/cfx_xmlparser.h
+++ b/core/fxcrt/xml/cfx_xmlparser.h
@@ -9,29 +9,135 @@
#include <memory>
#include <stack>
+#include <vector>
+#include "core/fxcrt/cfx_blockbuffer.h"
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/retain_ptr.h"
-#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
class CFX_XMLElement;
class CFX_XMLNode;
class CFX_SeekableStreamProxy;
+// Events reported by CFX_XMLParser::DoSyntaxParse(). Each call returns the
+// next event found in the input.
+enum class FX_XmlSyntaxResult {
+ None,  // No event yet; used internally while scanning.
+ InstructionOpen,  // "<?" was read.
+ InstructionClose,  // The '>' ending a processing instruction was read.
+ ElementOpen,  // '<' followed by the start of an element was read.
+ ElementBreak,  // The '>' ending an opening tag was read.
+ ElementClose,  // The '>' ending a closing ("</...") or "/>" tag was read.
+ TargetName,  // A processing-instruction target name is available.
+ TagName,  // An element tag name is available.
+ AttriName,  // An attribute name is available.
+ AttriValue,  // A quoted attribute value is available.
+ Text,  // Character data preceding a '<' is available.
+ CData,  // The content of a "<![CDATA[ ... ]]>" section is available.
+ TargetData,  // A piece of processing-instruction data is available.
+ Error,  // Malformed input or buffer failure; repeated on later calls.
+ EndOfString  // The stream is exhausted; repeated on later calls.
+};
+
class CFX_XMLParser {
public:
+ static bool IsXMLNameChar(wchar_t ch, bool bFirstChar);
+
CFX_XMLParser(CFX_XMLNode* pParent,
const RetainPtr<CFX_SeekableStreamProxy>& pStream);
- ~CFX_XMLParser();
+ virtual ~CFX_XMLParser();
int32_t Parse();
+ protected:
+ FX_XmlSyntaxResult DoSyntaxParse();
+
+ WideString GetTagName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ WideString GetAttributeName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ WideString GetAttributeValue() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ WideString GetTextData() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
private:
- std::unique_ptr<CFX_XMLSyntaxParser> m_pParser;
+ enum class FDE_XmlSyntaxState {
+ Text,
+ Node,
+ Target,
+ Tag,
+ AttriName,
+ AttriEqualSign,
+ AttriQuotation,
+ AttriValue,
+ Entity,
+ EntityDecimal,
+ EntityHex,
+ CloseInstruction,
+ BreakElement,
+ CloseElement,
+ SkipDeclNode,
+ DeclCharData,
+ SkipComment,
+ SkipCommentOrDecl,
+ SkipCData,
+ TargetData
+ };
+
+ void ParseTextChar(wchar_t ch);
+
+ int32_t GetStatus() const;
+ FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; }
+ FX_FILESIZE GetCurrentBinaryPos() const;
+ int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; }
+ int32_t GetLastNodeNumber() const { return m_iLastNodeNum; }
+
+ WideString GetTargetName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ WideString GetTargetData() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
CFX_XMLNode* m_pParent;
CFX_XMLNode* m_pChild;
std::stack<CFX_XMLNode*> m_NodeStack;
WideString m_ws1;
+
+ RetainPtr<CFX_SeekableStreamProxy> m_pStream;
+ size_t m_iXMLPlaneSize;
+ FX_FILESIZE m_iCurrentPos;
+ int32_t m_iCurrentNodeNum;
+ int32_t m_iLastNodeNum;
+ int32_t m_iParsedBytes;
+ FX_FILESIZE m_ParsedChars;
+ std::vector<wchar_t> m_Buffer;
+ size_t m_iBufferChars;
+ bool m_bEOS;
+ FX_FILESIZE m_Start; // Start position in m_Buffer
+ FX_FILESIZE m_End; // End position in m_Buffer
+ FX_XMLNODE m_CurNode;
+ std::stack<FX_XMLNODE> m_XMLNodeStack;
+ CFX_BlockBuffer m_BlockBuffer;
+ int32_t m_iAllocStep;
+ wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer
+ int32_t m_iIndexInBlock;
+ int32_t m_iTextDataLength;
+ FX_XmlSyntaxResult m_syntaxParserResult;
+ FDE_XmlSyntaxState m_syntaxParserState;
+ wchar_t m_wQuotationMark;
+ int32_t m_iEntityStart;
+ std::stack<wchar_t> m_SkipStack;
+ wchar_t m_SkipChar;
};
#endif // CORE_FXCRT_XML_CFX_XMLPARSER_H_
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp b/core/fxcrt/xml/cfx_xmlparser_unittest.cpp
index b7f7c416b1..d22925f797 100644
--- a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp
+++ b/core/fxcrt/xml/cfx_xmlparser_unittest.cpp
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
+#include "core/fxcrt/xml/cfx_xmlparser.h"
#include <memory>
@@ -11,7 +11,30 @@
#include "testing/gtest/include/gtest/gtest.h"
#include "testing/test_support.h"
-TEST(CFX_XMLSyntaxParserTest, CData) {
+// Test-only subclass that re-exports CFX_XMLParser's protected tokenizer
+// entry points as public so the tests can drive the parser one event at a
+// time.
+class CFX_XMLTestParser : public CFX_XMLParser {
+ public:
+  CFX_XMLTestParser(CFX_XMLNode* pParent,
+                    const RetainPtr<CFX_SeekableStreamProxy>& pStream)
+      : CFX_XMLParser(pParent, pStream) {}
+
+  ~CFX_XMLTestParser() override = default;
+
+  // Widen access to the inherited protected members.
+  using CFX_XMLParser::DoSyntaxParse;
+  using CFX_XMLParser::GetAttributeName;
+  using CFX_XMLParser::GetAttributeValue;
+  using CFX_XMLParser::GetTagName;
+  using CFX_XMLParser::GetTextData;
+};
+
+TEST(CFX_XMLParserTest, CData) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![CDATA[\n"
@@ -31,7 +54,8 @@ TEST(CFX_XMLSyntaxParserTest, CData) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -57,7 +81,7 @@ TEST(CFX_XMLSyntaxParserTest, CData) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) {
+TEST(CFX_XMLParserTest, CDataWithInnerScript) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![CDATA[\n"
@@ -79,7 +103,8 @@ TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -105,7 +130,7 @@ TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) {
+TEST(CFX_XMLParserTest, ArrowBangArrow) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!>\n"
@@ -116,7 +141,8 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
@@ -140,7 +166,7 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) {
+TEST(CFX_XMLParserTest, ArrowBangBracketArrow) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![>\n"
@@ -151,7 +177,8 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -170,7 +197,7 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, IncompleteCData) {
+TEST(CFX_XMLParserTest, IncompleteCData) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![CDATA>\n"
@@ -181,7 +208,8 @@ TEST(CFX_XMLSyntaxParserTest, IncompleteCData) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -200,7 +228,7 @@ TEST(CFX_XMLSyntaxParserTest, IncompleteCData) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, UnClosedCData) {
+TEST(CFX_XMLParserTest, UnClosedCData) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![CDATA[\n"
@@ -211,7 +239,8 @@ TEST(CFX_XMLSyntaxParserTest, UnClosedCData) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -230,7 +259,7 @@ TEST(CFX_XMLSyntaxParserTest, UnClosedCData) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, EmptyCData) {
+TEST(CFX_XMLParserTest, EmptyCData) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <![CDATA[]]>\n"
@@ -241,7 +270,8 @@ TEST(CFX_XMLSyntaxParserTest, EmptyCData) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -267,7 +297,7 @@ TEST(CFX_XMLSyntaxParserTest, EmptyCData) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, Comment) {
+TEST(CFX_XMLParserTest, Comment) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!-- A Comment -->\n"
@@ -278,7 +308,8 @@ TEST(CFX_XMLSyntaxParserTest, Comment) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -301,7 +332,7 @@ TEST(CFX_XMLSyntaxParserTest, Comment) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) {
+TEST(CFX_XMLParserTest, IncorrectCommentStart) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!- A Comment -->\n"
@@ -312,7 +343,8 @@ TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -335,7 +367,7 @@ TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, CommentEmpty) {
+TEST(CFX_XMLParserTest, CommentEmpty) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!---->\n"
@@ -346,7 +378,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentEmpty) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -369,7 +402,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentEmpty) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) {
+TEST(CFX_XMLParserTest, CommentThreeDash) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!--->\n"
@@ -380,7 +413,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -397,7 +431,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) {
+TEST(CFX_XMLParserTest, CommentTwoDash) {
const char* input =
"<script contentType=\"application/x-javascript\">\n"
" <!-->\n"
@@ -408,7 +442,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -425,7 +460,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, Entities) {
+TEST(CFX_XMLParserTest, Entities) {
const char* input =
"<script contentType=\"application/x-javascript\">"
"&#66;"
@@ -440,7 +475,8 @@ TEST(CFX_XMLSyntaxParserTest, Entities) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -460,7 +496,7 @@ TEST(CFX_XMLSyntaxParserTest, Entities) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) {
+TEST(CFX_XMLParserTest, EntityOverflowHex) {
const char* input =
"<script contentType=\"application/x-javascript\">"
"&#xaDBDFFFFF;"
@@ -472,7 +508,8 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -492,7 +529,7 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) {
+TEST(CFX_XMLParserTest, EntityOverflowDecimal) {
const char* input =
"<script contentType=\"application/x-javascript\">"
"&#2914910205;"
@@ -504,7 +541,8 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) {
reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
stream->SetCodePage(FX_CODEPAGE_UTF8);
- CFX_XMLSyntaxParser parser(stream);
+ auto root = pdfium::MakeUnique<CFX_XMLNode>();
+ CFX_XMLTestParser parser(root.get(), stream);
ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
ASSERT_EQ(L"script", parser.GetTagName());
@@ -524,19 +562,19 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) {
ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
}
-TEST(CFX_XMLSyntaxParserTest, IsXMLNameChar) {
- EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(L'-', true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(L'-', false));
-
- EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2069, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2070, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2073, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x218F, true));
- EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2190, true));
-
- EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDEF, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDF0, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDF1, true));
- EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFFFD, true));
- EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFFFE, true));
+TEST(CFX_XMLParserTest, IsXMLNameChar) {
+ EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(L'-', true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(L'-', false));
+
+ EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0x2069, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x2070, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x2073, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x218F, true));
+ EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0x2190, true));
+
+ EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0xFDEF, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFDF0, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFDF1, true));
+ EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFFFD, true));
+ EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0xFFFE, true));
}
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp
deleted file mode 100644
index 1fb51b7d52..0000000000
--- a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp
+++ /dev/null
@@ -1,694 +0,0 @@
-// Copyright 2017 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
-
-#include <algorithm>
-#include <cwctype>
-#include <iterator>
-
-#include "core/fxcrt/fx_extension.h"
-#include "core/fxcrt/fx_safe_types.h"
-
-namespace {
-
-const uint32_t kMaxCharRange = 0x10ffff;
-
-bool IsXMLWhiteSpace(wchar_t ch) {
- return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09;
-}
-
-struct FX_XMLNAMECHAR {
- uint16_t wStart;
- uint16_t wEnd;
- bool bStartChar;
-};
-
-const FX_XMLNAMECHAR g_XMLNameChars[] = {
- {L'-', L'.', false}, {L'0', L'9', false}, {L':', L':', false},
- {L'A', L'Z', true}, {L'_', L'_', true}, {L'a', L'z', true},
- {0xB7, 0xB7, false}, {0xC0, 0xD6, true}, {0xD8, 0xF6, true},
- {0xF8, 0x02FF, true}, {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
- {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true}, {0x203F, 0x2040, false},
- {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true}, {0x3001, 0xD7FF, true},
- {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
-};
-
-
-int32_t GetUTF8EncodeLength(const std::vector<wchar_t>& src,
- FX_FILESIZE iSrcLen) {
- uint32_t unicode = 0;
- int32_t iDstNum = 0;
- const wchar_t* pSrc = src.data();
- while (iSrcLen-- > 0) {
- unicode = *pSrc++;
- int nbytes = 0;
- if ((uint32_t)unicode < 0x80) {
- nbytes = 1;
- } else if ((uint32_t)unicode < 0x800) {
- nbytes = 2;
- } else if ((uint32_t)unicode < 0x10000) {
- nbytes = 3;
- } else if ((uint32_t)unicode < 0x200000) {
- nbytes = 4;
- } else if ((uint32_t)unicode < 0x4000000) {
- nbytes = 5;
- } else {
- nbytes = 6;
- }
- iDstNum += nbytes;
- }
- return iDstNum;
-}
-
-} // namespace
-
-// static
-bool CFX_XMLSyntaxParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
- auto* it = std::lower_bound(
- std::begin(g_XMLNameChars), std::end(g_XMLNameChars), ch,
- [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
- return it != std::end(g_XMLNameChars) && ch >= it->wStart &&
- (!bFirstChar || it->bStartChar);
-}
-
-CFX_XMLSyntaxParser::CFX_XMLSyntaxParser(
- const RetainPtr<CFX_SeekableStreamProxy>& pStream)
- : m_pStream(pStream),
- m_iXMLPlaneSize(1024),
- m_iCurrentPos(0),
- m_iCurrentNodeNum(-1),
- m_iLastNodeNum(-1),
- m_iParsedBytes(0),
- m_ParsedChars(0),
- m_iBufferChars(0),
- m_bEOS(false),
- m_Start(0),
- m_End(0),
- m_iAllocStep(m_BlockBuffer.GetAllocStep()),
- m_pCurrentBlock(nullptr),
- m_iIndexInBlock(0),
- m_iTextDataLength(0),
- m_syntaxParserResult(FX_XmlSyntaxResult::None),
- m_syntaxParserState(FDE_XmlSyntaxState::Text),
- m_wQuotationMark(0),
- m_iEntityStart(-1) {
- ASSERT(pStream);
-
- m_CurNode.iNodeNum = -1;
- m_CurNode.eNodeType = FX_XMLNODE_Unknown;
-
- m_iXMLPlaneSize =
- std::min(m_iXMLPlaneSize,
- pdfium::base::checked_cast<size_t>(m_pStream->GetLength()));
- m_iCurrentPos = m_pStream->GetBOMLength();
-
- FX_SAFE_SIZE_T alloc_size_safe = m_iXMLPlaneSize;
- alloc_size_safe += 1; // For NUL.
- if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return;
- }
-
- m_Buffer.resize(pdfium::base::ValueOrDieForType<size_t>(alloc_size_safe));
-
- m_BlockBuffer.InitBuffer();
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
-}
-
-CFX_XMLSyntaxParser::~CFX_XMLSyntaxParser() {}
-
-FX_XmlSyntaxResult CFX_XMLSyntaxParser::DoSyntaxParse() {
- if (m_syntaxParserResult == FX_XmlSyntaxResult::Error ||
- m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) {
- return m_syntaxParserResult;
- }
-
- FX_FILESIZE iStreamLength = m_pStream->GetLength();
- FX_FILESIZE iPos;
-
- FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None;
- while (true) {
- if (m_Start >= m_End) {
- if (m_bEOS || m_iCurrentPos >= iStreamLength) {
- m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
- return m_syntaxParserResult;
- }
- m_ParsedChars += m_End;
- m_iParsedBytes = m_iCurrentPos;
- if (m_pStream->GetPosition() != m_iCurrentPos)
- m_pStream->Seek(CFX_SeekableStreamProxy::From::Begin, m_iCurrentPos);
-
- m_iBufferChars =
- m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS);
- iPos = m_pStream->GetPosition();
- if (m_iBufferChars < 1) {
- m_iCurrentPos = iStreamLength;
- m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
- return m_syntaxParserResult;
- }
- m_iCurrentPos = iPos;
- m_Start = 0;
- m_End = m_iBufferChars;
- }
-
- while (m_Start < m_End) {
- wchar_t ch = m_Buffer[m_Start];
- switch (m_syntaxParserState) {
- case FDE_XmlSyntaxState::Text:
- if (ch == L'<') {
- if (!m_BlockBuffer.IsEmpty()) {
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_iEntityStart = -1;
- syntaxParserResult = FX_XmlSyntaxResult::Text;
- } else {
- m_Start++;
- m_syntaxParserState = FDE_XmlSyntaxState::Node;
- }
- } else {
- ParseTextChar(ch);
- }
- break;
- case FDE_XmlSyntaxState::Node:
- if (ch == L'!') {
- m_Start++;
- m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl;
- } else if (ch == L'/') {
- m_Start++;
- m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
- } else if (ch == L'?') {
- m_iLastNodeNum++;
- m_iCurrentNodeNum = m_iLastNodeNum;
- m_CurNode.iNodeNum = m_iLastNodeNum;
- m_CurNode.eNodeType = FX_XMLNODE_Instruction;
- m_XMLNodeStack.push(m_CurNode);
- m_Start++;
- m_syntaxParserState = FDE_XmlSyntaxState::Target;
- syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen;
- } else {
- m_iLastNodeNum++;
- m_iCurrentNodeNum = m_iLastNodeNum;
- m_CurNode.iNodeNum = m_iLastNodeNum;
- m_CurNode.eNodeType = FX_XMLNODE_Element;
- m_XMLNodeStack.push(m_CurNode);
- m_syntaxParserState = FDE_XmlSyntaxState::Tag;
- syntaxParserResult = FX_XmlSyntaxResult::ElementOpen;
- }
- break;
- case FDE_XmlSyntaxState::Target:
- case FDE_XmlSyntaxState::Tag:
- if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
- if (m_BlockBuffer.IsEmpty()) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
-
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (m_syntaxParserState != FDE_XmlSyntaxState::Target)
- syntaxParserResult = FX_XmlSyntaxResult::TagName;
- else
- syntaxParserResult = FX_XmlSyntaxResult::TargetName;
-
- m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
- } else {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- m_Start++;
- }
- break;
- case FDE_XmlSyntaxState::AttriName:
- if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) {
- m_Start++;
- break;
- }
- if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
- if (m_BlockBuffer.IsEmpty()) {
- if (m_CurNode.eNodeType == FX_XMLNODE_Element) {
- if (ch == L'>' || ch == L'/') {
- m_syntaxParserState = FDE_XmlSyntaxState::BreakElement;
- break;
- }
- } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
- if (ch == L'?') {
- m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
- m_Start++;
- } else {
- m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
- }
- break;
- }
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- } else {
- if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
- if (ch != '=' && !IsXMLWhiteSpace(ch)) {
- m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
- break;
- }
- }
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign;
- syntaxParserResult = FX_XmlSyntaxResult::AttriName;
- }
- } else {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- m_Start++;
- }
- break;
- case FDE_XmlSyntaxState::AttriEqualSign:
- if (IsXMLWhiteSpace(ch)) {
- m_Start++;
- break;
- }
- if (ch != L'=') {
- if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
- m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
- break;
- }
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- } else {
- m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation;
- m_Start++;
- }
- break;
- case FDE_XmlSyntaxState::AttriQuotation:
- if (IsXMLWhiteSpace(ch)) {
- m_Start++;
- break;
- }
- if (ch != L'\"' && ch != L'\'') {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- } else {
- m_wQuotationMark = ch;
- m_syntaxParserState = FDE_XmlSyntaxState::AttriValue;
- m_Start++;
- }
- break;
- case FDE_XmlSyntaxState::AttriValue:
- if (ch == m_wQuotationMark) {
- if (m_iEntityStart > -1) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_wQuotationMark = 0;
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_Start++;
- m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
- syntaxParserResult = FX_XmlSyntaxResult::AttriValue;
- } else {
- ParseTextChar(ch);
- }
- break;
- case FDE_XmlSyntaxState::CloseInstruction:
- if (ch != L'>') {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
- } else if (!m_BlockBuffer.IsEmpty()) {
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- syntaxParserResult = FX_XmlSyntaxResult::TargetData;
- } else {
- m_Start++;
- if (m_XMLNodeStack.empty()) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- m_XMLNodeStack.pop();
- if (!m_XMLNodeStack.empty()) {
- m_CurNode = m_XMLNodeStack.top();
- } else {
- m_CurNode.iNodeNum = -1;
- m_CurNode.eNodeType = FX_XMLNODE_Unknown;
- }
- m_iCurrentNodeNum = m_CurNode.iNodeNum;
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- syntaxParserResult = FX_XmlSyntaxResult::InstructionClose;
- }
- break;
- case FDE_XmlSyntaxState::BreakElement:
- if (ch == L'>') {
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- syntaxParserResult = FX_XmlSyntaxResult::ElementBreak;
- } else if (ch == L'/') {
- m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
- } else {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- m_Start++;
- break;
- case FDE_XmlSyntaxState::CloseElement:
- if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
- if (ch == L'>') {
- if (m_XMLNodeStack.empty()) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- m_XMLNodeStack.pop();
- if (!m_XMLNodeStack.empty()) {
- m_CurNode = m_XMLNodeStack.top();
- } else {
- m_CurNode.iNodeNum = -1;
- m_CurNode.eNodeType = FX_XMLNODE_Unknown;
- }
- m_iCurrentNodeNum = m_CurNode.iNodeNum;
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- syntaxParserResult = FX_XmlSyntaxResult::ElementClose;
- } else if (!IsXMLWhiteSpace(ch)) {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- } else {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- }
- m_Start++;
- break;
- case FDE_XmlSyntaxState::SkipCommentOrDecl:
- if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) {
- m_Start += 2;
- m_syntaxParserState = FDE_XmlSyntaxState::SkipComment;
- } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) ==
- 0) {
- m_Start += 7;
- m_syntaxParserState = FDE_XmlSyntaxState::SkipCData;
- } else {
- m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode;
- m_SkipChar = L'>';
- m_SkipStack.push(L'>');
- }
- break;
- case FDE_XmlSyntaxState::SkipCData: {
- if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) {
- m_Start += 3;
- syntaxParserResult = FX_XmlSyntaxResult::CData;
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- } else {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock)
- return FX_XmlSyntaxResult::Error;
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- m_Start++;
- }
- break;
- }
- case FDE_XmlSyntaxState::SkipDeclNode:
- if (m_SkipChar == L'\'' || m_SkipChar == L'\"') {
- m_Start++;
- if (ch != m_SkipChar)
- break;
-
- m_SkipStack.pop();
- if (m_SkipStack.empty())
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- else
- m_SkipChar = m_SkipStack.top();
- } else {
- switch (ch) {
- case L'<':
- m_SkipChar = L'>';
- m_SkipStack.push(L'>');
- break;
- case L'[':
- m_SkipChar = L']';
- m_SkipStack.push(L']');
- break;
- case L'(':
- m_SkipChar = L')';
- m_SkipStack.push(L')');
- break;
- case L'\'':
- m_SkipChar = L'\'';
- m_SkipStack.push(L'\'');
- break;
- case L'\"':
- m_SkipChar = L'\"';
- m_SkipStack.push(L'\"');
- break;
- default:
- if (ch == m_SkipChar) {
- m_SkipStack.pop();
- if (m_SkipStack.empty()) {
- if (m_BlockBuffer.GetDataLength() >= 9)
- (void)m_BlockBuffer.GetTextData(0, 7);
-
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- } else {
- m_SkipChar = m_SkipStack.top();
- }
- }
- break;
- }
- if (!m_SkipStack.empty()) {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- }
- m_Start++;
- }
- break;
- case FDE_XmlSyntaxState::SkipComment:
- if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) {
- m_Start += 2;
- m_syntaxParserState = FDE_XmlSyntaxState::Text;
- }
-
- m_Start++;
- break;
- case FDE_XmlSyntaxState::TargetData:
- if (IsXMLWhiteSpace(ch)) {
- if (m_BlockBuffer.IsEmpty()) {
- m_Start++;
- break;
- }
- if (m_wQuotationMark == 0) {
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_wQuotationMark = 0;
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_Start++;
- syntaxParserResult = FX_XmlSyntaxResult::TargetData;
- break;
- }
- }
- if (ch == '?') {
- m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
- m_Start++;
- } else if (ch == '\"') {
- if (m_wQuotationMark == 0) {
- m_wQuotationMark = ch;
- m_Start++;
- } else if (ch == m_wQuotationMark) {
- m_iTextDataLength = m_BlockBuffer.GetDataLength();
- m_wQuotationMark = 0;
- m_BlockBuffer.Reset(true);
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_Start++;
- syntaxParserResult = FX_XmlSyntaxResult::TargetData;
- } else {
- m_syntaxParserResult = FX_XmlSyntaxResult::Error;
- return m_syntaxParserResult;
- }
- } else {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock) {
- return FX_XmlSyntaxResult::Error;
- }
- }
- m_pCurrentBlock[m_iIndexInBlock++] = ch;
- m_BlockBuffer.IncrementDataLength();
- m_Start++;
- }
- break;
- default:
- break;
- }
- if (syntaxParserResult != FX_XmlSyntaxResult::None)
- return syntaxParserResult;
- }
- }
- return FX_XmlSyntaxResult::Text;
-}
-
-int32_t CFX_XMLSyntaxParser::GetStatus() const {
- if (!m_pStream)
- return -1;
-
- int32_t iStreamLength = m_pStream->GetLength();
- if (iStreamLength < 1)
- return 100;
-
- if (m_syntaxParserResult == FX_XmlSyntaxResult::Error)
- return -1;
-
- if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString)
- return 100;
- return m_iParsedBytes * 100 / iStreamLength;
-}
-
-FX_FILESIZE CFX_XMLSyntaxParser::GetCurrentBinaryPos() const {
- if (!m_pStream)
- return 0;
-
- int32_t nDstLen = GetUTF8EncodeLength(m_Buffer, m_Start);
- return m_iParsedBytes + nDstLen;
-}
-
-void CFX_XMLSyntaxParser::ParseTextChar(wchar_t character) {
- if (m_iIndexInBlock == m_iAllocStep) {
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- if (!m_pCurrentBlock)
- return;
- }
-
- m_pCurrentBlock[m_iIndexInBlock++] = character;
- m_BlockBuffer.IncrementDataLength();
- if (m_iEntityStart > -1 && character == L';') {
- WideString csEntity = m_BlockBuffer.GetTextData(
- m_iEntityStart + 1,
- m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1);
- int32_t iLen = csEntity.GetLength();
- if (iLen > 0) {
- if (csEntity[0] == L'#') {
- uint32_t ch = 0;
- wchar_t w;
- if (iLen > 1 && csEntity[1] == L'x') {
- for (int32_t i = 2; i < iLen; i++) {
- w = csEntity[i];
- if (std::iswdigit(w))
- ch = (ch << 4) + w - L'0';
- else if (w >= L'A' && w <= L'F')
- ch = (ch << 4) + w - 55;
- else if (w >= L'a' && w <= L'f')
- ch = (ch << 4) + w - 87;
- else
- break;
- }
- } else {
- for (int32_t i = 1; i < iLen; i++) {
- w = csEntity[i];
- if (!std::iswdigit(w))
- break;
- ch = ch * 10 + w - L'0';
- }
- }
- if (ch > kMaxCharRange)
- ch = ' ';
-
- character = static_cast<wchar_t>(ch);
- if (character != 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, character);
- m_iEntityStart++;
- }
- } else {
- if (csEntity.Compare(L"amp") == 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, L'&');
- m_iEntityStart++;
- } else if (csEntity.Compare(L"lt") == 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, L'<');
- m_iEntityStart++;
- } else if (csEntity.Compare(L"gt") == 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, L'>');
- m_iEntityStart++;
- } else if (csEntity.Compare(L"apos") == 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, L'\'');
- m_iEntityStart++;
- } else if (csEntity.Compare(L"quot") == 0) {
- m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"');
- m_iEntityStart++;
- }
- }
- }
- if (m_iEntityStart >= 0 &&
- m_BlockBuffer.GetDataLength() > static_cast<size_t>(m_iEntityStart)) {
- m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() -
- m_iEntityStart);
- }
- std::tie(m_pCurrentBlock, m_iIndexInBlock) =
- m_BlockBuffer.GetAvailableBlock();
- m_iEntityStart = -1;
- } else if (m_iEntityStart < 0 && character == L'&') {
- m_iEntityStart = m_BlockBuffer.GetDataLength() - 1;
- }
- m_Start++;
-}
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.h b/core/fxcrt/xml/cfx_xmlsyntaxparser.h
deleted file mode 100644
index b93bbb6801..0000000000
--- a/core/fxcrt/xml/cfx_xmlsyntaxparser.h
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright 2017 PDFium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_
-#define CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_
-
-#include <stack>
-#include <vector>
-
-#include "core/fxcrt/cfx_blockbuffer.h"
-#include "core/fxcrt/cfx_seekablestreamproxy.h"
-#include "core/fxcrt/fx_string.h"
-#include "core/fxcrt/retain_ptr.h"
-#include "core/fxcrt/xml/cfx_xmlnode.h"
-
-enum class FX_XmlSyntaxResult {
- None,
- InstructionOpen,
- InstructionClose,
- ElementOpen,
- ElementBreak,
- ElementClose,
- TargetName,
- TagName,
- AttriName,
- AttriValue,
- Text,
- CData,
- TargetData,
- Error,
- EndOfString
-};
-
-class CFX_XMLSyntaxParser {
- public:
- static bool IsXMLNameChar(wchar_t ch, bool bFirstChar);
-
- explicit CFX_XMLSyntaxParser(
- const RetainPtr<CFX_SeekableStreamProxy>& pStream);
- ~CFX_XMLSyntaxParser();
-
- FX_XmlSyntaxResult DoSyntaxParse();
-
- int32_t GetStatus() const;
- FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; }
- FX_FILESIZE GetCurrentBinaryPos() const;
- int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; }
- int32_t GetLastNodeNumber() const { return m_iLastNodeNum; }
-
- WideString GetTargetName() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- WideString GetTagName() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- WideString GetAttributeName() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- WideString GetAttributeValue() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- WideString GetTextData() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- WideString GetTargetData() const {
- return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
- }
-
- protected:
- enum class FDE_XmlSyntaxState {
- Text,
- Node,
- Target,
- Tag,
- AttriName,
- AttriEqualSign,
- AttriQuotation,
- AttriValue,
- Entity,
- EntityDecimal,
- EntityHex,
- CloseInstruction,
- BreakElement,
- CloseElement,
- SkipDeclNode,
- DeclCharData,
- SkipComment,
- SkipCommentOrDecl,
- SkipCData,
- TargetData
- };
-
- void ParseTextChar(wchar_t ch);
-
- RetainPtr<CFX_SeekableStreamProxy> m_pStream;
- size_t m_iXMLPlaneSize;
- FX_FILESIZE m_iCurrentPos;
- int32_t m_iCurrentNodeNum;
- int32_t m_iLastNodeNum;
- int32_t m_iParsedBytes;
- FX_FILESIZE m_ParsedChars;
- std::vector<wchar_t> m_Buffer;
- size_t m_iBufferChars;
- bool m_bEOS;
- FX_FILESIZE m_Start; // Start position in m_Buffer
- FX_FILESIZE m_End; // End position in m_Buffer
- FX_XMLNODE m_CurNode;
- std::stack<FX_XMLNODE> m_XMLNodeStack;
- CFX_BlockBuffer m_BlockBuffer;
- int32_t m_iAllocStep;
- wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer
- int32_t m_iIndexInBlock;
- int32_t m_iTextDataLength;
- FX_XmlSyntaxResult m_syntaxParserResult;
- FDE_XmlSyntaxState m_syntaxParserState;
- wchar_t m_wQuotationMark;
- int32_t m_iEntityStart;
- std::stack<wchar_t> m_SkipStack;
- wchar_t m_SkipChar;
-};
-
-#endif // CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_