summary | refs | log | tree | commit | diff
path: root/core/fxcrt/xml
diff options
context:
space:
mode:
Diffstat (limited to 'core/fxcrt/xml')
-rw-r--r-- core/fxcrt/xml/cfx_xmlattributenode.cpp 35
-rw-r--r-- core/fxcrt/xml/cfx_xmlattributenode.h 44
-rw-r--r-- core/fxcrt/xml/cfx_xmlchardata.cpp 22
-rw-r--r-- core/fxcrt/xml/cfx_xmlchardata.h 24
-rw-r--r-- core/fxcrt/xml/cfx_xmldoc.cpp 160
-rw-r--r-- core/fxcrt/xml/cfx_xmldoc.h 37
-rw-r--r-- core/fxcrt/xml/cfx_xmlelement.cpp 102
-rw-r--r-- core/fxcrt/xml/cfx_xmlelement.h 33
-rw-r--r-- core/fxcrt/xml/cfx_xmlinstruction.cpp 36
-rw-r--r-- core/fxcrt/xml/cfx_xmlinstruction.h 35
-rw-r--r-- core/fxcrt/xml/cfx_xmlnode.cpp 441
-rw-r--r-- core/fxcrt/xml/cfx_xmlnode.h 75
-rw-r--r-- core/fxcrt/xml/cfx_xmlparser.cpp 171
-rw-r--r-- core/fxcrt/xml/cfx_xmlparser.h 47
-rw-r--r-- core/fxcrt/xml/cfx_xmlsyntaxparser.cpp 698
-rw-r--r-- core/fxcrt/xml/cfx_xmlsyntaxparser.h 128
-rw-r--r-- core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp 527
-rw-r--r-- core/fxcrt/xml/cfx_xmltext.cpp 22
-rw-r--r-- core/fxcrt/xml/cfx_xmltext.h 31
19 files changed, 2668 insertions, 0 deletions
diff --git a/core/fxcrt/xml/cfx_xmlattributenode.cpp b/core/fxcrt/xml/cfx_xmlattributenode.cpp
new file mode 100644
index 0000000000..9c81efc109
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlattributenode.cpp
@@ -0,0 +1,35 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlattributenode.h"
+
+#include "core/fxcrt/fx_ext.h"
+
+// Creates a node called |name|; its attribute map starts out empty.
+CFX_XMLAttributeNode::CFX_XMLAttributeNode(const CFX_WideString& name)
+    : CFX_XMLNode(), name_(name) {
+  // An empty name is a caller error.
+  ASSERT(0 < name_.GetLength());
+}
+
+CFX_XMLAttributeNode::~CFX_XMLAttributeNode() = default;
+
+// Reports whether an attribute called |name| is present on this node.
+bool CFX_XMLAttributeNode::HasAttribute(const CFX_WideString& name) const {
+  return attrs_.count(name) != 0;
+}
+
+// Returns the value stored for attribute |name|, or an empty string when the
+// attribute is absent. The map is not modified by the lookup.
+CFX_WideString CFX_XMLAttributeNode::GetString(
+    const CFX_WideString& name) const {
+  const auto found = attrs_.find(name);
+  if (found == attrs_.end())
+    return CFX_WideString();
+  return found->second;
+}
+
+// Stores |value| under attribute |name|, creating the attribute if needed and
+// overwriting any previous value.
+void CFX_XMLAttributeNode::SetString(const CFX_WideString& name,
+                                     const CFX_WideString& value) {
+  CFX_WideString& slot = attrs_[name];
+  slot = value;
+}
+
+// Drops attribute |name|; does nothing when it is not present.
+void CFX_XMLAttributeNode::RemoveAttribute(const CFX_WideString& name) {
+  const auto found = attrs_.find(name);
+  if (found != attrs_.end())
+    attrs_.erase(found);
+}
diff --git a/core/fxcrt/xml/cfx_xmlattributenode.h b/core/fxcrt/xml/cfx_xmlattributenode.h
new file mode 100644
index 0000000000..e8f358eb59
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlattributenode.h
@@ -0,0 +1,44 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_
+#define CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_
+
+#include <map>
+#include <memory>
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+
+// Abstract base for XML nodes that have a name and a name -> value attribute
+// map. CFX_XMLElement and CFX_XMLInstruction derive from it.
+class CFX_XMLAttributeNode : public CFX_XMLNode {
+ public:
+ // |name| must not be empty (asserted in the constructor).
+ explicit CFX_XMLAttributeNode(const CFX_WideString& name);
+ ~CFX_XMLAttributeNode() override;
+
+ // CFX_XMLNode overrides, re-declared pure so that concrete subclasses must
+ // supply their own node type and cloning behaviour.
+ FX_XMLNODETYPE GetType() const override = 0;
+ std::unique_ptr<CFX_XMLNode> Clone() override = 0;
+
+ // Returns a copy of the node name given at construction.
+ CFX_WideString GetName() const { return name_; }
+ // Read-only view of all attributes, keyed by attribute name.
+ const std::map<CFX_WideString, CFX_WideString>& GetAttributes() const {
+ return attrs_;
+ }
+ // Replaces the whole attribute map with a copy of |attrs|.
+ void SetAttributes(const std::map<CFX_WideString, CFX_WideString>& attrs) {
+ attrs_ = attrs;
+ }
+ // True when an attribute called |name| exists.
+ bool HasAttribute(const CFX_WideString& name) const;
+
+ // SetString() inserts or overwrites attribute |name|; GetString() returns
+ // its value, or an empty string when the attribute does not exist.
+ void SetString(const CFX_WideString& name, const CFX_WideString& value);
+ CFX_WideString GetString(const CFX_WideString& name) const;
+
+ // Erases attribute |name| if present.
+ void RemoveAttribute(const CFX_WideString& name);
+
+ private:
+ // Node name (tag name for elements, target for processing instructions).
+ CFX_WideString name_;
+ // Attribute values keyed by attribute name.
+ std::map<CFX_WideString, CFX_WideString> attrs_;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_
diff --git a/core/fxcrt/xml/cfx_xmlchardata.cpp b/core/fxcrt/xml/cfx_xmlchardata.cpp
new file mode 100644
index 0000000000..185bd064df
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlchardata.cpp
@@ -0,0 +1,22 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlchardata.h"
+
+#include "third_party/base/ptr_util.h"
+
+// A character-data node is a text node whose payload is |wsCData|; all
+// storage is handled by the CFX_XMLText base.
+CFX_XMLCharData::CFX_XMLCharData(const CFX_WideString& wsCData)
+    : CFX_XMLText(wsCData) {}
+
+CFX_XMLCharData::~CFX_XMLCharData() = default;
+
+// Identifies this node as character data rather than plain text.
+FX_XMLNODETYPE CFX_XMLCharData::GetType() const {
+  const FX_XMLNODETYPE eKind = FX_XMLNODE_CharData;
+  return eKind;
+}
+
+// Returns a new, unattached character-data node holding the same text.
+std::unique_ptr<CFX_XMLNode> CFX_XMLCharData::Clone() {
+  auto pCopy = pdfium::MakeUnique<CFX_XMLCharData>(GetText());
+  return pCopy;
+}
diff --git a/core/fxcrt/xml/cfx_xmlchardata.h b/core/fxcrt/xml/cfx_xmlchardata.h
new file mode 100644
index 0000000000..c5c007be90
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlchardata.h
@@ -0,0 +1,24 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLCHARDATA_H_
+#define CORE_FXCRT_XML_CFX_XMLCHARDATA_H_
+
+#include <memory>
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmltext.h"
+
+// Text node that reports itself as FX_XMLNODE_CharData. The serializers in
+// this change write such nodes wrapped in "<![CDATA[" ... "]]>".
+class CFX_XMLCharData : public CFX_XMLText {
+ public:
+ // |wsCData| is the character data, passed on to the CFX_XMLText base.
+ explicit CFX_XMLCharData(const CFX_WideString& wsCData);
+ ~CFX_XMLCharData() override;
+
+ // Returns FX_XMLNODE_CharData.
+ FX_XMLNODETYPE GetType() const override;
+ // Returns a new CFX_XMLCharData holding the same text.
+ std::unique_ptr<CFX_XMLNode> Clone() override;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLCHARDATA_H_
diff --git a/core/fxcrt/xml/cfx_xmldoc.cpp b/core/fxcrt/xml/cfx_xmldoc.cpp
new file mode 100644
index 0000000000..4f58da91f2
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmldoc.cpp
@@ -0,0 +1,160 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmldoc.h"
+
+#include <utility>
+#include <vector>
+
+#include "core/fxcrt/fx_codepage.h"
+#include "core/fxcrt/xml/cfx_xmlchardata.h"
+#include "core/fxcrt/xml/cfx_xmlelement.h"
+#include "core/fxcrt/xml/cfx_xmlinstruction.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+#include "core/fxcrt/xml/cfx_xmltext.h"
+#include "third_party/base/ptr_util.h"
+#include "third_party/base/stl_util.h"
+
+// A new document consists of an untyped root node with a single child: an
+// "xml" processing instruction. The root owns that child from here on.
+CFX_XMLDoc::CFX_XMLDoc()
+    : m_iStatus(0), m_pRoot(pdfium::MakeUnique<CFX_XMLNode>()) {
+  CFX_XMLNode* pDeclaration = new CFX_XMLInstruction(L"xml");
+  m_pRoot->InsertChildNode(pDeclaration);
+}
+
+CFX_XMLDoc::~CFX_XMLDoc() = default;
+
+// Prepares the document for a fresh parse driven by |pXMLParser|.
+// On success the load status is reset to 0, the stream reference is dropped,
+// every child of the root is deleted and the document takes ownership of the
+// parser. Returns false, leaving the document untouched, when |pXMLParser| is
+// null.
+bool CFX_XMLDoc::LoadXML(std::unique_ptr<CFX_XMLParser> pXMLParser) {
+  const bool bHasParser = !!pXMLParser;
+  if (bHasParser) {
+    m_iStatus = 0;
+    m_pStream.Reset();
+    m_pRoot->DeleteChildren();
+    m_pXMLParser = std::move(pXMLParser);
+  }
+  return bHasParser;
+}
+
+// Advances the parse started by LoadXML() and returns the load status.
+//
+// Once the status has reached 100 or more it is returned unchanged and the
+// parser is not consulted again. Otherwise the parser's DoParser() result
+// becomes the new status; CFX_XMLParser::DoParser() reports -1 for a syntax
+// error and 100 at end of input.
+//
+// Returns -1 when no parser is installed, i.e. LoadXML() has not succeeded
+// yet or CloseXML() has already released the parser. Without this check the
+// call dereferenced a null pointer.
+int32_t CFX_XMLDoc::DoLoad(IFX_Pause* pPause) {
+  if (m_iStatus >= 100)
+    return m_iStatus;
+
+  if (!m_pXMLParser)
+    return -1;
+
+  m_iStatus = m_pXMLParser->DoParser(pPause);
+  return m_iStatus;
+}
+
+// Releases the parser installed by LoadXML(). The node tree under the root is
+// left as it is.
+void CFX_XMLDoc::CloseXML() {
+  m_pXMLParser = nullptr;
+}
+
+// Serializes |pINode| to |pXMLStream| as XML markup; for an element this
+// includes its attributes and, recursively, all of its children.
+//
+// The work is delegated to CFX_XMLNode::SaveXMLNode(), which emits exactly
+// the same markup, so that there is a single serializer to maintain instead
+// of two copies that must be kept in sync. A null |pINode| is ignored.
+void CFX_XMLDoc::SaveXMLNode(
+    const CFX_RetainPtr<CFX_SeekableStreamProxy>& pXMLStream,
+    CFX_XMLNode* pINode) {
+  if (!pINode)
+    return;
+
+  pINode->SaveXMLNode(pXMLStream);
+}
diff --git a/core/fxcrt/xml/cfx_xmldoc.h b/core/fxcrt/xml/cfx_xmldoc.h
new file mode 100644
index 0000000000..5966c096ba
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmldoc.h
@@ -0,0 +1,37 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLDOC_H_
+#define CORE_FXCRT_XML_CFX_XMLDOC_H_
+
+#include <memory>
+
+#include "core/fxcrt/cfx_retain_ptr.h"
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+#include "core/fxcrt/xml/cfx_xmlparser.h"
+
+// Owns the root of an XML node tree together with the parser that is used to
+// populate it.
+class CFX_XMLDoc {
+ public:
+ // Creates a root node that has one child, an "xml" processing instruction.
+ CFX_XMLDoc();
+ ~CFX_XMLDoc();
+
+ // Takes ownership of |pXMLParser|, resets the load status to 0 and deletes
+ // the root's current children. Returns false if |pXMLParser| is null.
+ bool LoadXML(std::unique_ptr<CFX_XMLParser> pXMLParser);
+ // Runs the parser while the status is below 100 and returns the status.
+ int32_t DoLoad(IFX_Pause* pPause);
+ // Destroys the parser; the node tree is kept.
+ void CloseXML();
+
+ // Non-owning pointer to the root node.
+ CFX_XMLNode* GetRoot() const { return m_pRoot.get(); }
+ // Writes |pNode| (and, for elements, its subtree) to |pXMLStream| as markup.
+ void SaveXMLNode(const CFX_RetainPtr<CFX_SeekableStreamProxy>& pXMLStream,
+ CFX_XMLNode* pNode);
+
+ private:
+ // Last value returned by the parser; 0 after LoadXML().
+ int32_t m_iStatus;
+ // Root of the tree; its children are the top-level nodes of the document.
+ std::unique_ptr<CFX_XMLNode> m_pRoot;
+ // Parser installed by LoadXML() and released by CloseXML().
+ std::unique_ptr<CFX_XMLParser> m_pXMLParser;
+ // Only reset (in LoadXML()) by the code in cfx_xmldoc.cpp.
+ CFX_RetainPtr<CFX_SeekableStreamProxy> m_pStream;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLDOC_H_
diff --git a/core/fxcrt/xml/cfx_xmlelement.cpp b/core/fxcrt/xml/cfx_xmlelement.cpp
new file mode 100644
index 0000000000..c6b70e1cc4
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlelement.cpp
@@ -0,0 +1,102 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlelement.h"
+
+#include "core/fxcrt/fx_ext.h"
+#include "core/fxcrt/xml/cfx_xmlchardata.h"
+#include "core/fxcrt/xml/cfx_xmltext.h"
+#include "third_party/base/ptr_util.h"
+#include "third_party/base/stl_util.h"
+
+// |wsTag| is the element's tag name as written, including any "prefix:".
+CFX_XMLElement::CFX_XMLElement(const CFX_WideString& wsTag)
+    : CFX_XMLAttributeNode(wsTag) {}
+
+CFX_XMLElement::~CFX_XMLElement() = default;
+
+// Identifies this node as an element.
+FX_XMLNODETYPE CFX_XMLElement::GetType() const {
+  const FX_XMLNODETYPE eKind = FX_XMLNODE_Element;
+  return eKind;
+}
+
+// Returns an unattached copy of this element with the same tag name and
+// attributes. The text of the direct FX_XMLNODE_Text children is joined and,
+// if non-empty, attached to the copy as a single text child. Children of any
+// other type (character data, nested elements, ...) are not copied.
+std::unique_ptr<CFX_XMLNode> CFX_XMLElement::Clone() {
+  auto pCopy = pdfium::MakeUnique<CFX_XMLElement>(GetName());
+  pCopy->SetAttributes(GetAttributes());
+
+  CFX_WideString wsJoined;
+  for (CFX_XMLNode* pKid = m_pChild; pKid; pKid = pKid->m_pNext) {
+    if (pKid->GetType() == FX_XMLNODE_Text)
+      wsJoined += static_cast<CFX_XMLText*>(pKid)->GetText();
+  }
+  pCopy->SetTextData(wsJoined);
+  return pCopy;
+}
+
+// Returns the part of the tag name after the first ':'; the whole name when
+// it contains no ':'.
+CFX_WideString CFX_XMLElement::GetLocalTagName() const {
+  CFX_WideString wsName = GetName();
+  const FX_STRSIZE iColon = wsName.Find(L':', 0);
+  if (iColon >= 0)
+    return wsName.Right(wsName.GetLength() - iColon - 1);
+  return wsName;
+}
+
+// Returns the part of the tag name before the first ':'; an empty string when
+// the name contains no ':'.
+CFX_WideString CFX_XMLElement::GetNamespacePrefix() const {
+  CFX_WideString wsName = GetName();
+  const FX_STRSIZE iColon = wsName.Find(L':', 0);
+  if (iColon >= 0)
+    return wsName.Left(iColon);
+  return CFX_WideString();
+}
+
+// Returns the namespace URI that applies to this element.
+// The attribute searched for is "xmlns", or "xmlns:<prefix>" when the tag
+// name carries a prefix. It is looked up on this element first and then on
+// each ancestor in turn; the walk stops at the first ancestor that is not an
+// element. An empty string is returned when no declaration is found.
+CFX_WideString CFX_XMLElement::GetNamespaceURI() const {
+  CFX_WideString wsKey(L"xmlns");
+  CFX_WideString wsPrefix = GetNamespacePrefix();
+  if (wsPrefix.GetLength() > 0) {
+    wsKey += L":";
+    wsKey += wsPrefix;
+  }
+
+  for (const CFX_XMLNode* pCur = this;
+       pCur && pCur->GetType() == FX_XMLNODE_Element;
+       pCur = pCur->GetNodeItem(CFX_XMLNode::Parent)) {
+    const auto* pElem = static_cast<const CFX_XMLElement*>(pCur);
+    if (pElem->HasAttribute(wsKey))
+      return pElem->GetString(wsKey);
+  }
+  return CFX_WideString();
+}
+
+// Returns the text of all direct text and character-data children, joined in
+// document order. Text inside nested elements is not included.
+CFX_WideString CFX_XMLElement::GetTextData() const {
+  CFX_WideTextBuf textBuf;
+  for (CFX_XMLNode* pKid = m_pChild; pKid; pKid = pKid->m_pNext) {
+    const FX_XMLNODETYPE eKind = pKid->GetType();
+    if (eKind == FX_XMLNODE_Text || eKind == FX_XMLNODE_CharData)
+      textBuf << static_cast<CFX_XMLText*>(pKid)->GetText();
+  }
+  return textBuf.MakeString();
+}
+
+// Adds a new text child holding |wsText|; an empty |wsText| adds nothing.
+// Existing children are kept, so despite the name this does not replace text
+// that is already present. The element owns the new node.
+void CFX_XMLElement::SetTextData(const CFX_WideString& wsText) {
+  if (wsText.GetLength() >= 1)
+    InsertChildNode(new CFX_XMLText(wsText));
+}
diff --git a/core/fxcrt/xml/cfx_xmlelement.h b/core/fxcrt/xml/cfx_xmlelement.h
new file mode 100644
index 0000000000..20780342af
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlelement.h
@@ -0,0 +1,33 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLELEMENT_H_
+#define CORE_FXCRT_XML_CFX_XMLELEMENT_H_
+
+#include <memory>
+#include <vector>
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlattributenode.h"
+
+// An XML element: a tag name, attributes (both from CFX_XMLAttributeNode) and
+// child nodes (from CFX_XMLNode).
+class CFX_XMLElement : public CFX_XMLAttributeNode {
+ public:
+ // |wsTag| is the full tag name, including any namespace prefix.
+ explicit CFX_XMLElement(const CFX_WideString& wsTag);
+ ~CFX_XMLElement() override;
+
+ // CFX_XMLNode overrides. GetType() returns FX_XMLNODE_Element. Clone()
+ // copies the name, the attributes and the joined text of the direct text
+ // children only.
+ FX_XMLNODETYPE GetType() const override;
+ std::unique_ptr<CFX_XMLNode> Clone() override;
+
+ // Tag name after / before the first ':'. Without a ':' the local name is
+ // the whole tag name and the prefix is empty.
+ CFX_WideString GetLocalTagName() const;
+ CFX_WideString GetNamespacePrefix() const;
+ // Value of the nearest "xmlns" / "xmlns:<prefix>" attribute on this element
+ // or its element ancestors; empty if there is none.
+ CFX_WideString GetNamespaceURI() const;
+
+ // GetTextData() joins the text of direct text and character-data children.
+ // SetTextData() adds a text child when |wsText| is not empty.
+ CFX_WideString GetTextData() const;
+ void SetTextData(const CFX_WideString& wsText);
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLELEMENT_H_
diff --git a/core/fxcrt/xml/cfx_xmlinstruction.cpp b/core/fxcrt/xml/cfx_xmlinstruction.cpp
new file mode 100644
index 0000000000..d07b92e3c1
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlinstruction.cpp
@@ -0,0 +1,36 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlinstruction.h"
+
+#include "core/fxcrt/fx_ext.h"
+#include "third_party/base/ptr_util.h"
+#include "third_party/base/stl_util.h"
+
+// |wsTarget| is the instruction target, stored as the node name.
+CFX_XMLInstruction::CFX_XMLInstruction(const CFX_WideString& wsTarget)
+    : CFX_XMLAttributeNode(wsTarget) {}
+
+CFX_XMLInstruction::~CFX_XMLInstruction() = default;
+
+// Identifies this node as a processing instruction.
+FX_XMLNODETYPE CFX_XMLInstruction::GetType() const {
+  const FX_XMLNODETYPE eKind = FX_XMLNODE_Instruction;
+  return eKind;
+}
+
+// Returns an unattached copy with the same target name, attributes and data
+// strings.
+std::unique_ptr<CFX_XMLNode> CFX_XMLInstruction::Clone() {
+  auto pCopy = pdfium::MakeUnique<CFX_XMLInstruction>(GetName());
+  pCopy->m_TargetData = m_TargetData;
+  pCopy->SetAttributes(GetAttributes());
+  return pCopy;
+}
+
+// Adds |wsData| to the end of the instruction's data list.
+void CFX_XMLInstruction::AppendData(const CFX_WideString& wsData) {
+  m_TargetData.emplace_back(wsData);
+}
+
+// Removes the data string at position |index|; an out-of-range |index| is
+// ignored.
+void CFX_XMLInstruction::RemoveData(int32_t index) {
+  if (!pdfium::IndexInBounds(m_TargetData, index))
+    return;
+
+  const auto doomed = m_TargetData.begin() + index;
+  m_TargetData.erase(doomed);
+}
diff --git a/core/fxcrt/xml/cfx_xmlinstruction.h b/core/fxcrt/xml/cfx_xmlinstruction.h
new file mode 100644
index 0000000000..99554fc239
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlinstruction.h
@@ -0,0 +1,35 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_
+#define CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_
+
+#include <memory>
+#include <vector>
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlattributenode.h"
+
+// An XML processing instruction ("<?target ...?>"): a target name and
+// attributes (from CFX_XMLAttributeNode) plus an ordered list of data strings.
+class CFX_XMLInstruction : public CFX_XMLAttributeNode {
+ public:
+ // |wsTarget| becomes the node name.
+ explicit CFX_XMLInstruction(const CFX_WideString& wsTarget);
+ ~CFX_XMLInstruction() override;
+
+ // CFX_XMLNode overrides. GetType() returns FX_XMLNODE_Instruction. Clone()
+ // copies the name, the attributes and the data strings.
+ FX_XMLNODETYPE GetType() const override;
+ std::unique_ptr<CFX_XMLNode> Clone() override;
+
+ // Read-only view of the data strings in insertion order.
+ const std::vector<CFX_WideString>& GetTargetData() const {
+ return m_TargetData;
+ }
+ // Appends |wsData| to the data list.
+ void AppendData(const CFX_WideString& wsData);
+ // Erases the entry at |index|; out-of-range indices are ignored.
+ void RemoveData(int32_t index);
+
+ private:
+ // Data strings; the serializers write each one as a quoted string.
+ std::vector<CFX_WideString> m_TargetData;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_
diff --git a/core/fxcrt/xml/cfx_xmlnode.cpp b/core/fxcrt/xml/cfx_xmlnode.cpp
new file mode 100644
index 0000000000..47b3105f10
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlnode.cpp
@@ -0,0 +1,441 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+
+#include <vector>
+
+#include "core/fxcrt/fx_codepage.h"
+#include "core/fxcrt/xml/cfx_xmlchardata.h"
+#include "core/fxcrt/xml/cfx_xmlelement.h"
+#include "core/fxcrt/xml/cfx_xmlinstruction.h"
+#include "core/fxcrt/xml/cfx_xmltext.h"
+#include "third_party/base/stl_util.h"
+
+// A new node starts detached: no parent, no children and no siblings.
+CFX_XMLNode::CFX_XMLNode()
+ : m_pParent(nullptr),
+ m_pChild(nullptr),
+ m_pPrior(nullptr),
+ m_pNext(nullptr) {}
+
+// The base class reports FX_XMLNODE_Unknown; subclasses override this with
+// their own node type.
+FX_XMLNODETYPE CFX_XMLNode::GetType() const {
+ return FX_XMLNODE_Unknown;
+}
+
+// Destroying a node deletes all of its children, and through their
+// destructors the whole subtree below it.
+CFX_XMLNode::~CFX_XMLNode() {
+ DeleteChildren();
+}
+
+// Deletes every child of this node and leaves the node childless. Each
+// child's destructor deletes that child's own children in turn.
+void CFX_XMLNode::DeleteChildren() {
+  for (CFX_XMLNode* pCur = m_pChild; pCur;) {
+    // Read the link before the node that holds it is destroyed.
+    CFX_XMLNode* const pFollowing = pCur->m_pNext;
+    delete pCur;
+    pCur = pFollowing;
+  }
+  m_pChild = nullptr;
+}
+
+// Returns the number of direct children by walking the sibling chain.
+int32_t CFX_XMLNode::CountChildNodes() const {
+  int32_t nKids = 0;
+  for (CFX_XMLNode* pCur = m_pChild; pCur; pCur = pCur->m_pNext)
+    ++nKids;
+  return nKids;
+}
+
+// Returns the direct child at zero-based position |index|, or nullptr when
+// there is no child at that position (this includes negative indices).
+CFX_XMLNode* CFX_XMLNode::GetChildNode(int32_t index) const {
+  CFX_XMLNode* pCur = m_pChild;
+  // Step forward until |index| steps have been taken or the chain ends.
+  for (; pCur && index != 0; --index)
+    pCur = pCur->m_pNext;
+  return pCur;
+}
+
+// Returns the zero-based position of |pNode| among the direct children, or -1
+// when |pNode| is not a direct child of this node.
+int32_t CFX_XMLNode::GetChildNodeIndex(CFX_XMLNode* pNode) const {
+  int32_t iPos = 0;
+  for (CFX_XMLNode* pCur = m_pChild; pCur; pCur = pCur->m_pNext, ++iPos) {
+    if (pCur == pNode)
+      return iPos;
+  }
+  return -1;
+}
+
+// Resolves a '/'-separated path relative to this node and returns the node it
+// names, or nullptr when nothing matches.
+//
+// |pPath| must not be null. |iLength| is the number of characters of |pPath|
+// to use; a negative value means "measure the string". A zero-length path
+// yields nullptr.
+//
+// The first segment is handled here and the remainder recursively:
+//  - an empty segment (the path starts with '/') selects the root of the tree;
+//  - ".." selects the parent and "." selects this node;
+//  - any other segment is matched against the child elements, using the full
+//    tag name when |bQualifiedName| is true and the local tag name otherwise.
+CFX_XMLNode* CFX_XMLNode::GetPath(const wchar_t* pPath,
+ int32_t iLength,
+ bool bQualifiedName) const {
+ ASSERT(pPath);
+ if (iLength < 0) {
+ iLength = FXSYS_wcslen(pPath);
+ }
+ if (iLength == 0) {
+ return nullptr;
+ }
+ // Copy characters up to, but not including, the first '/' into |csPath|.
+ // Afterwards |pStart| points just past that '/' (or at |pEnd|).
+ CFX_WideString csPath;
+ const wchar_t* pStart = pPath;
+ const wchar_t* pEnd = pPath + iLength;
+ wchar_t ch;
+ while (pStart < pEnd) {
+ ch = *pStart++;
+ if (ch == L'/') {
+ break;
+ } else {
+ csPath += ch;
+ }
+ }
+ // |iLength| now counts only the characters that remain after the segment.
+ iLength -= pStart - pPath;
+ CFX_XMLNode* pFind = nullptr;
+ if (csPath.GetLength() < 1) {
+ pFind = GetNodeItem(CFX_XMLNode::Root);
+ } else if (csPath.Compare(L"..") == 0) {
+ pFind = m_pParent;
+ } else if (csPath.Compare(L".") == 0) {
+ pFind = (CFX_XMLNode*)this;
+ } else {
+ // Named segment: try every child element with a matching name. If the
+ // rest of the path cannot be resolved under one match, the search moves
+ // on to the next sibling with the same name.
+ CFX_WideString wsTag;
+ CFX_XMLNode* pNode = m_pChild;
+ while (pNode) {
+ if (pNode->GetType() == FX_XMLNODE_Element) {
+ if (bQualifiedName)
+ wsTag = static_cast<CFX_XMLElement*>(pNode)->GetName();
+ else
+ wsTag = static_cast<CFX_XMLElement*>(pNode)->GetLocalTagName();
+
+ if (wsTag.Compare(csPath) == 0) {
+ // Last segment: this child is the answer. Otherwise resolve the
+ // remaining path below it.
+ if (iLength < 1)
+ pFind = pNode;
+ else
+ pFind = pNode->GetPath(pStart, iLength, bQualifiedName);
+
+ if (pFind)
+ return pFind;
+ }
+ }
+ pNode = pNode->m_pNext;
+ }
+ }
+ // Root, ".." and "." cases: stop here if nothing was selected or the path is
+ // used up, otherwise continue with the remaining path from |pFind|.
+ if (!pFind || iLength < 1)
+ return pFind;
+ return pFind->GetPath(pStart, iLength, bQualifiedName);
+}
+
+// Links |pNode| into this node's child list and returns the position it ends
+// up at. |index| 0 puts it first. Any other |index| puts it after the child
+// at position |index| - 1, or after the last child when the list is shorter
+// than that; the header's default of -1 therefore appends. |pNode| must not
+// be null. This node becomes its parent, and DeleteChildren() will delete it.
+int32_t CFX_XMLNode::InsertChildNode(CFX_XMLNode* pNode, int32_t index) {
+  pNode->m_pParent = this;
+
+  // No children yet, or an explicit request for the front: new list head.
+  if (!m_pChild || index == 0) {
+    pNode->m_pPrior = nullptr;
+    pNode->m_pNext = m_pChild;
+    if (m_pChild)
+      m_pChild->m_pPrior = pNode;
+    m_pChild = pNode;
+    return 0;
+  }
+
+  // |iPos| is always the position the new node would get if it were linked
+  // in right after |pAnchor|.
+  int32_t iPos = 1;
+  CFX_XMLNode* pAnchor = m_pChild;
+  for (; iPos != index && pAnchor->m_pNext; ++iPos)
+    pAnchor = pAnchor->m_pNext;
+
+  CFX_XMLNode* const pAfter = pAnchor->m_pNext;
+  pNode->m_pPrior = pAnchor;
+  pNode->m_pNext = pAfter;
+  if (pAfter)
+    pAfter->m_pPrior = pNode;
+  pAnchor->m_pNext = pNode;
+  return iPos;
+}
+
+// Unlinks |pNode| from this node's child list and clears its parent and
+// sibling links. The node is not deleted. |pNode| must be a direct child of
+// this node; this is not verified.
+void CFX_XMLNode::RemoveChildNode(CFX_XMLNode* pNode) {
+  ASSERT(m_pChild && pNode);
+
+  CFX_XMLNode* const pAfter = pNode->m_pNext;
+  if (pNode == m_pChild)
+    m_pChild = pAfter;
+  else
+    pNode->m_pPrior->m_pNext = pAfter;
+
+  if (pAfter)
+    pAfter->m_pPrior = pNode->m_pPrior;
+
+  pNode->m_pParent = nullptr;
+  pNode->m_pNext = nullptr;
+  pNode->m_pPrior = nullptr;
+}
+
+// Returns the node that stands in relation |eItem| to this node, or nullptr
+// when there is none. FirstSibling, LastSibling, FirstNeighbor and
+// LastNeighbor also return nullptr when the answer would be this node itself;
+// Root returns this node when it has no parent.
+CFX_XMLNode* CFX_XMLNode::GetNodeItem(CFX_XMLNode::NodeItem eItem) const {
+  CFX_XMLNode* const pSelf = const_cast<CFX_XMLNode*>(this);
+  switch (eItem) {
+    case CFX_XMLNode::Root: {
+      CFX_XMLNode* pTop = pSelf;
+      while (pTop->m_pParent)
+        pTop = pTop->m_pParent;
+      return pTop;
+    }
+    case CFX_XMLNode::Parent:
+      return m_pParent;
+    case CFX_XMLNode::FirstSibling: {
+      CFX_XMLNode* pHead = pSelf;
+      while (pHead->m_pPrior)
+        pHead = pHead->m_pPrior;
+      return pHead != pSelf ? pHead : nullptr;
+    }
+    case CFX_XMLNode::PriorSibling:
+      return m_pPrior;
+    case CFX_XMLNode::NextSibling:
+      return m_pNext;
+    case CFX_XMLNode::LastSibling: {
+      CFX_XMLNode* pTail = pSelf;
+      while (pTail->m_pNext)
+        pTail = pTail->m_pNext;
+      return pTail != pSelf ? pTail : nullptr;
+    }
+    case CFX_XMLNode::FirstNeighbor: {
+      CFX_XMLNode* pTop = pSelf;
+      while (pTop->m_pParent)
+        pTop = pTop->m_pParent;
+      return pTop != pSelf ? pTop : nullptr;
+    }
+    case CFX_XMLNode::PriorNeighbor: {
+      // Without an earlier sibling the previous node is the parent.
+      if (!m_pPrior)
+        return m_pParent;
+
+      // Otherwise descend from the earlier sibling, always taking the last
+      // child, until a node without children is reached.
+      CFX_XMLNode* pDeepest = m_pPrior;
+      while (pDeepest->m_pChild) {
+        pDeepest = pDeepest->m_pChild;
+        while (pDeepest->m_pNext)
+          pDeepest = pDeepest->m_pNext;
+      }
+      return pDeepest;
+    }
+    case CFX_XMLNode::NextNeighbor: {
+      if (m_pChild)
+        return m_pChild;
+      if (m_pNext)
+        return m_pNext;
+
+      // Climb until some ancestor has a later sibling.
+      for (CFX_XMLNode* pUp = m_pParent; pUp; pUp = pUp->m_pParent) {
+        if (pUp->m_pNext)
+          return pUp->m_pNext;
+      }
+      return nullptr;
+    }
+    case CFX_XMLNode::LastNeighbor: {
+      CFX_XMLNode* pCur = pSelf;
+      while (pCur->m_pParent)
+        pCur = pCur->m_pParent;
+
+      // From the top: go to the last sibling, step down to its first child,
+      // and repeat until the last sibling has no children.
+      for (;;) {
+        while (pCur->m_pNext)
+          pCur = pCur->m_pNext;
+        if (!pCur->m_pChild)
+          break;
+        pCur = pCur->m_pChild;
+      }
+      return pCur != pSelf ? pCur : nullptr;
+    }
+    case CFX_XMLNode::FirstChild:
+      return m_pChild;
+    case CFX_XMLNode::LastChild: {
+      CFX_XMLNode* pTail = m_pChild;
+      while (pTail && pTail->m_pNext)
+        pTail = pTail->m_pNext;
+      return pTail;
+    }
+    default:
+      break;
+  }
+  return nullptr;
+}
+
+// Returns the number of ancestors of this node; 0 for a node with no parent.
+int32_t CFX_XMLNode::GetNodeLevel() const {
+  int32_t nAncestors = 0;
+  for (const CFX_XMLNode* pUp = m_pParent; pUp; pUp = pUp->m_pParent)
+    ++nAncestors;
+  return nAncestors;
+}
+
+// Links |pNode| into the sibling chain next to this node and gives it this
+// node's parent. Only CFX_XMLNode::NextSibling (insert right after this node)
+// and CFX_XMLNode::PriorSibling (insert right before it) are supported; any
+// other |eItem| returns false and changes nothing. |pNode| must not be null.
+bool CFX_XMLNode::InsertNodeItem(CFX_XMLNode::NodeItem eItem,
+                                 CFX_XMLNode* pNode) {
+  if (eItem == CFX_XMLNode::NextSibling) {
+    pNode->m_pParent = m_pParent;
+    pNode->m_pNext = m_pNext;
+    pNode->m_pPrior = this;
+    if (m_pNext)
+      m_pNext->m_pPrior = pNode;
+    m_pNext = pNode;
+    return true;
+  }
+
+  if (eItem == CFX_XMLNode::PriorSibling) {
+    pNode->m_pParent = m_pParent;
+    pNode->m_pNext = this;
+    pNode->m_pPrior = m_pPrior;
+    if (m_pPrior) {
+      m_pPrior->m_pNext = pNode;
+    } else if (m_pParent) {
+      // This node was the first child, so the parent's head pointer moves.
+      m_pParent->m_pChild = pNode;
+    }
+    m_pPrior = pNode;
+    return true;
+  }
+
+  return false;
+}
+
+// Unlinks and returns the node that follows this one in the sibling chain.
+// Only CFX_XMLNode::NextSibling is supported; for any other |eItem|, or when
+// there is no next sibling, nullptr is returned and nothing changes. The
+// returned node has its parent and sibling links cleared and is not deleted.
+CFX_XMLNode* CFX_XMLNode::RemoveNodeItem(CFX_XMLNode::NodeItem eItem) {
+  if (eItem != CFX_XMLNode::NextSibling || !m_pNext)
+    return nullptr;
+
+  CFX_XMLNode* const pDetached = m_pNext;
+  m_pNext = pDetached->m_pNext;
+  if (m_pNext)
+    m_pNext->m_pPrior = this;
+
+  pDetached->m_pParent = nullptr;
+  pDetached->m_pNext = nullptr;
+  pDetached->m_pPrior = nullptr;
+  return pDetached;
+}
+
+// The base class cannot be cloned and returns a null pointer; subclasses
+// override this to return a copy of themselves.
+std::unique_ptr<CFX_XMLNode> CFX_XMLNode::Clone() {
+  return std::unique_ptr<CFX_XMLNode>();
+}
+
+namespace {
+
+// Returns a copy of |wsValue| in which the five characters XML reserves
+// (& < > ' ") are replaced by their predefined entity references. '&' is
+// replaced first so that the ampersands introduced by the later replacements
+// are not escaped a second time.
+CFX_WideString EscapeXMLReserved(CFX_WideString wsValue) {
+  wsValue.Replace(L"&", L"&amp;");
+  wsValue.Replace(L"<", L"&lt;");
+  wsValue.Replace(L">", L"&gt;");
+  wsValue.Replace(L"\'", L"&apos;");
+  wsValue.Replace(L"\"", L"&quot;");
+  return wsValue;
+}
+
+// Writes each entry of |attrs| to |pXMLStream| in the form
+// ` name="escaped value"`, using one WriteString() call per attribute.
+void WriteXMLAttributes(
+    const CFX_RetainPtr<CFX_SeekableStreamProxy>& pXMLStream,
+    const std::map<CFX_WideString, CFX_WideString>& attrs) {
+  // Bound by reference: iterating by value copies both strings of each pair.
+  for (const auto& attr : attrs) {
+    CFX_WideString ws = L" ";
+    ws += attr.first;
+    ws += L"=\"";
+    ws += EscapeXMLReserved(attr.second);
+    ws += L"\"";
+    pXMLStream->WriteString(ws.AsStringC());
+  }
+}
+
+}  // namespace
+
+// Serializes this node to |pXMLStream| as XML markup.
+//
+//  - Instruction named "xml" (case-insensitive): an XML declaration whose
+//    encoding is derived from the stream's code page (UTF-16, UTF-16be, or
+//    UTF-8 for everything else).
+//  - Any other instruction: "<?name", the attributes, each data string in
+//    double quotes, then "?>".
+//  - Element: start tag with attributes, then every child recursively and an
+//    end tag; an element without children is written as an empty-element tag.
+//  - Text: the text with reserved characters escaped.
+//  - Character data: the text wrapped in "<![CDATA[" and "]]>", unescaped.
+//  - Any other node type writes nothing.
+void CFX_XMLNode::SaveXMLNode(
+    const CFX_RetainPtr<CFX_SeekableStreamProxy>& pXMLStream) {
+  switch (GetType()) {
+    case FX_XMLNODE_Instruction: {
+      auto* pInstruction = static_cast<CFX_XMLInstruction*>(this);
+      CFX_WideString ws;
+      if (pInstruction->GetName().CompareNoCase(L"xml") == 0) {
+        ws = L"<?xml version=\"1.0\" encoding=\"";
+        const uint16_t wCodePage = pXMLStream->GetCodePage();
+        if (wCodePage == FX_CODEPAGE_UTF16LE) {
+          ws += L"UTF-16";
+        } else if (wCodePage == FX_CODEPAGE_UTF16BE) {
+          ws += L"UTF-16be";
+        } else {
+          ws += L"UTF-8";
+        }
+        ws += L"\"?>";
+        pXMLStream->WriteString(ws.AsStringC());
+        break;
+      }
+
+      // Built by concatenation rather than Format(L"<?%s", ...): in a wide
+      // format string the C standard defines %s as a narrow string while some
+      // runtimes treat it as a wide one, so the name could come out mangled
+      // depending on the platform.
+      ws = L"<?";
+      ws += pInstruction->GetName();
+      pXMLStream->WriteString(ws.AsStringC());
+
+      WriteXMLAttributes(pXMLStream, pInstruction->GetAttributes());
+
+      for (const auto& target : pInstruction->GetTargetData()) {
+        ws = L" \"";
+        ws += target;
+        ws += L"\"";
+        pXMLStream->WriteString(ws.AsStringC());
+      }
+      ws = L"?>";
+      pXMLStream->WriteString(ws.AsStringC());
+      break;
+    }
+    case FX_XMLNODE_Element: {
+      auto* pElement = static_cast<CFX_XMLElement*>(this);
+      CFX_WideString ws = L"<";
+      ws += pElement->GetName();
+      pXMLStream->WriteString(ws.AsStringC());
+
+      WriteXMLAttributes(pXMLStream, pElement->GetAttributes());
+
+      if (m_pChild) {
+        // The line break sits inside the tag, before '>', so that it does not
+        // become part of the element's text content.
+        ws = L"\n>";
+        pXMLStream->WriteString(ws.AsStringC());
+        for (CFX_XMLNode* pChild = m_pChild; pChild; pChild = pChild->m_pNext)
+          pChild->SaveXMLNode(pXMLStream);
+        ws = L"</";
+        ws += pElement->GetName();
+        ws += L"\n>";
+      } else {
+        ws = L"\n/>";
+      }
+      pXMLStream->WriteString(ws.AsStringC());
+      break;
+    }
+    case FX_XMLNODE_Text: {
+      const CFX_WideString ws =
+          EscapeXMLReserved(static_cast<CFX_XMLText*>(this)->GetText());
+      pXMLStream->WriteString(ws.AsStringC());
+      break;
+    }
+    case FX_XMLNODE_CharData: {
+      CFX_WideString ws = L"<![CDATA[";
+      ws += static_cast<CFX_XMLCharData*>(this)->GetText();
+      ws += L"]]>";
+      pXMLStream->WriteString(ws.AsStringC());
+      break;
+    }
+    case FX_XMLNODE_Unknown:
+    default:
+      break;
+  }
+}
diff --git a/core/fxcrt/xml/cfx_xmlnode.h b/core/fxcrt/xml/cfx_xmlnode.h
new file mode 100644
index 0000000000..7cbc2b6642
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlnode.h
@@ -0,0 +1,75 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLNODE_H_
+#define CORE_FXCRT_XML_CFX_XMLNODE_H_
+
+#include <memory>
+
+#include "core/fxcrt/cfx_retain_ptr.h"
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
+
+// Kind of a node in the XML tree, as reported by CFX_XMLNode::GetType().
+enum FX_XMLNODETYPE {
+ // Reported by the CFX_XMLNode base class itself.
+ FX_XMLNODE_Unknown = 0,
+ // Processing instruction (CFX_XMLInstruction).
+ FX_XMLNODE_Instruction,
+ // Element (CFX_XMLElement).
+ FX_XMLNODE_Element,
+ // Plain text; nodes of this type are cast to CFX_XMLText by the tree code.
+ FX_XMLNODE_Text,
+ // Character data (CFX_XMLCharData), serialized as a CDATA section.
+ FX_XMLNODE_CharData,
+};
+
+// Pairs a node number with a node type.
+// NOTE(review): no use of this struct is visible in the node, element,
+// instruction or document sources of this change; the meaning of |iNodeNum|
+// should be confirmed against its users (presumably the syntax parser).
+struct FX_XMLNODE {
+ int32_t iNodeNum;
+ FX_XMLNODETYPE eNodeType;
+};
+
+// Base class of every node in the XML tree. A node keeps raw pointers to its
+// parent, its first child and its previous and next siblings. Children are
+// deleted by DeleteChildren(), which the destructor calls.
+class CFX_XMLNode {
+ public:
+ // Relationships understood by GetNodeItem(). InsertNodeItem() supports only
+ // PriorSibling and NextSibling; RemoveNodeItem() only NextSibling.
+ // "Neighbor" refers to the adjacent node when the tree is walked parent
+ // first, then children from first to last.
+ enum NodeItem {
+ // Topmost ancestor; the node itself when it has no parent.
+ Root = 0,
+ Parent,
+ // First / previous / next / last node in the sibling chain. FirstSibling
+ // and LastSibling give nullptr when this node already is that sibling.
+ FirstSibling,
+ PriorSibling,
+ NextSibling,
+ LastSibling,
+ // Topmost ancestor, or nullptr when this node has no parent.
+ FirstNeighbor,
+ // Previous / next node in the walk described above.
+ PriorNeighbor,
+ NextNeighbor,
+ // Node reached from the top by repeatedly going to the last sibling and
+ // then down to its first child; nullptr when that is this node.
+ LastNeighbor,
+ FirstChild,
+ LastChild
+ };
+
+ CFX_XMLNode();
+ virtual ~CFX_XMLNode();
+
+ // The base class returns FX_XMLNODE_Unknown and a null clone.
+ virtual FX_XMLNODETYPE GetType() const;
+ virtual std::unique_ptr<CFX_XMLNode> Clone();
+
+ // Direct-child access by count, position and pointer. GetChildNode()
+ // returns nullptr and GetChildNodeIndex() returns -1 when nothing matches.
+ int32_t CountChildNodes() const;
+ CFX_XMLNode* GetChildNode(int32_t index) const;
+ int32_t GetChildNodeIndex(CFX_XMLNode* pNode) const;
+ // Links |pNode| in as a child and returns its position; the default |index|
+ // of -1 appends. The node is deleted together with this node's children.
+ int32_t InsertChildNode(CFX_XMLNode* pNode, int32_t index = -1);
+ // Unlinks |pNode| without deleting it.
+ void RemoveChildNode(CFX_XMLNode* pNode);
+ // Deletes all children.
+ void DeleteChildren();
+
+ // Resolves a '/'-separated path of element names (also "", "." and "..")
+ // relative to this node. A negative |iLength| means the length of |pPath| is
+ // measured. |bQualifiedName| selects matching on the full tag name (true)
+ // or the local tag name (false).
+ CFX_XMLNode* GetPath(const wchar_t* pPath,
+ int32_t iLength = -1,
+ bool bQualifiedName = true) const;
+
+ // Number of ancestors of this node.
+ int32_t GetNodeLevel() const;
+ // Navigation and sibling-chain editing; see NodeItem above.
+ CFX_XMLNode* GetNodeItem(CFX_XMLNode::NodeItem eItem) const;
+ bool InsertNodeItem(CFX_XMLNode::NodeItem eItem, CFX_XMLNode* pNode);
+ CFX_XMLNode* RemoveNodeItem(CFX_XMLNode::NodeItem eItem);
+
+ // Writes this node (and, for elements, its subtree) to |pXMLStream|.
+ void SaveXMLNode(const CFX_RetainPtr<CFX_SeekableStreamProxy>& pXMLStream);
+
+ // Tree links. They are public and are read and written directly by the
+ // element, document and node sources.
+ CFX_XMLNode* m_pParent;
+ CFX_XMLNode* m_pChild;
+ CFX_XMLNode* m_pPrior;
+ CFX_XMLNode* m_pNext;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLNODE_H_
diff --git a/core/fxcrt/xml/cfx_xmlparser.cpp b/core/fxcrt/xml/cfx_xmlparser.cpp
new file mode 100644
index 0000000000..0e328f33ea
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlparser.cpp
@@ -0,0 +1,171 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlparser.h"
+
+#include "core/fxcrt/fx_basic.h"
+#include "core/fxcrt/xml/cfx_xmlchardata.h"
+#include "core/fxcrt/xml/cfx_xmlelement.h"
+#include "core/fxcrt/xml/cfx_xmlinstruction.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+#include "core/fxcrt/xml/cfx_xmltext.h"
+#include "third_party/base/ptr_util.h"
+
+// Creates a parser that reads XML from |pStream| and attaches the nodes it
+// builds beneath |pParent|. Both arguments must be non-null (asserted).
+// |pParent| becomes the bottom entry of the node stack, so a stack size of 1
+// means "back at the root".
+CFX_XMLParser::CFX_XMLParser(
+ CFX_XMLNode* pParent,
+ const CFX_RetainPtr<CFX_SeekableStreamProxy>& pStream)
+ : m_nElementStart(0),
+ m_dwCheckStatus(0),
+ m_dwCurrentCheckStatus(0),
+ m_pStream(pStream),
+ m_pParser(pdfium::MakeUnique<CFX_XMLSyntaxParser>(m_pStream)),
+ m_pParent(pParent),
+ m_pChild(nullptr),
+ m_syntaxParserResult(FX_XmlSyntaxResult::None) {
+ ASSERT(m_pParent && m_pStream);
+ m_NodeStack.push(m_pParent);
+}
+
+// Nothing to release by hand: every member cleans up after itself.
+CFX_XMLParser::~CFX_XMLParser() = default;
+
+// Pulls tokens from the syntax parser and builds the node tree beneath the
+// parent node supplied to the constructor.
+//
+// |pPause| may be null. When it is set, the loop stops early once more than
+// 500 elements have been closed in this call and NeedToPauseNow() is true.
+//
+// Returns -1 if a syntax or structural error was detected (now or on an
+// earlier call), 100 if the end of input had already been reached on an
+// earlier call, and otherwise the syntax parser's GetStatus() value.
+//
+// NOTE(review): the final check also returns -1 whenever the node stack is
+// not back to its single root entry, which includes a pause taken in the
+// middle of a nested element. Confirm callers that pass |pPause| expect that.
+int32_t CFX_XMLParser::DoParser(IFX_Pause* pPause) {
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::Error)
+    return -1;
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString)
+    return 100;
+
+  int32_t iCount = 0;
+  while (true) {
+    m_syntaxParserResult = m_pParser->DoSyntaxParse();
+    switch (m_syntaxParserResult) {
+      case FX_XmlSyntaxResult::InstructionOpen:
+        break;
+      case FX_XmlSyntaxResult::InstructionClose:
+        // m_pChild is null when the instruction target was not one we keep.
+        if (m_pChild && m_pChild->GetType() != FX_XMLNODE_Instruction) {
+          m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+          break;
+        }
+        m_pChild = m_pParent;
+        break;
+      case FX_XmlSyntaxResult::ElementOpen:
+        // Remember where a second-level element starts so that TagName can
+        // compute the byte offset of <template> / <datasets>.
+        if (m_dwCheckStatus != 0x03 && m_NodeStack.size() == 2)
+          m_nElementStart = m_pParser->GetCurrentPos() - 1;
+        break;
+      case FX_XmlSyntaxResult::ElementBreak:
+        break;
+      case FX_XmlSyntaxResult::ElementClose:
+        // Guard against a null current node before asking for its type.
+        if (!m_pChild || m_pChild->GetType() != FX_XMLNODE_Element) {
+          m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+          break;
+        }
+        // An empty closing name comes from "<tag/>"; otherwise the closing
+        // name has to match the element being closed.
+        m_ws1 = m_pParser->GetTagName();
+        m_ws2 = static_cast<CFX_XMLElement*>(m_pChild)->GetName();
+        if (m_ws1.GetLength() > 0 && m_ws1 != m_ws2) {
+          m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+          break;
+        }
+        if (!m_NodeStack.empty())
+          m_NodeStack.pop();
+        if (m_NodeStack.empty()) {
+          // The root entry itself was popped: more closes than opens.
+          m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+          break;
+        }
+        if (m_dwCurrentCheckStatus != 0 && m_NodeStack.size() == 2) {
+          // The tracked <template> / <datasets> element just ended; record
+          // its size in bytes.
+          m_nSize[m_dwCurrentCheckStatus - 1] =
+              m_pParser->GetCurrentBinaryPos() -
+              m_nStart[m_dwCurrentCheckStatus - 1];
+          m_dwCurrentCheckStatus = 0;
+        }
+        m_pParent = m_NodeStack.top();
+        m_pChild = m_pParent;
+        iCount++;
+        break;
+      case FX_XmlSyntaxResult::TargetName:
+        // Only these two processing instructions are kept in the tree.
+        m_ws1 = m_pParser->GetTargetName();
+        if (m_ws1 == L"originalXFAVersion" || m_ws1 == L"acrobat") {
+          m_pChild = new CFX_XMLInstruction(m_ws1);
+          m_pParent->InsertChildNode(m_pChild);
+        } else {
+          m_pChild = nullptr;
+        }
+        m_ws1.clear();
+        break;
+      case FX_XmlSyntaxResult::TagName:
+        m_ws1 = m_pParser->GetTagName();
+        m_pChild = new CFX_XMLElement(m_ws1);
+        m_pParent->InsertChildNode(m_pChild);
+        m_NodeStack.push(m_pChild);
+        m_pParent = m_pChild;
+
+        if (m_dwCheckStatus != 0x03 && m_NodeStack.size() == 3) {
+          CFX_WideString wsTag =
+              static_cast<CFX_XMLElement*>(m_pChild)->GetLocalTagName();
+          if (wsTag == L"template") {
+            m_dwCheckStatus |= 0x01;
+            m_dwCurrentCheckStatus = 0x01;
+            m_nStart[0] = m_pParser->GetCurrentBinaryPos() -
+                          (m_pParser->GetCurrentPos() - m_nElementStart);
+          } else if (wsTag == L"datasets") {
+            m_dwCheckStatus |= 0x02;
+            m_dwCurrentCheckStatus = 0x02;
+            m_nStart[1] = m_pParser->GetCurrentBinaryPos() -
+                          (m_pParser->GetCurrentPos() - m_nElementStart);
+          }
+        }
+        break;
+      case FX_XmlSyntaxResult::AttriName:
+        // Held in m_ws1 until the matching AttriValue token arrives.
+        m_ws1 = m_pParser->GetAttributeName();
+        break;
+      case FX_XmlSyntaxResult::AttriValue:
+        if (m_pChild) {
+          m_ws2 = m_pParser->GetAttributeValue();
+          if (m_pChild->GetType() == FX_XMLNODE_Element)
+            static_cast<CFX_XMLElement*>(m_pChild)->SetString(m_ws1, m_ws2);
+        }
+        m_ws1.clear();
+        break;
+      case FX_XmlSyntaxResult::Text:
+        m_ws1 = m_pParser->GetTextData();
+        m_pChild = new CFX_XMLText(m_ws1);
+        m_pParent->InsertChildNode(m_pChild);
+        m_pChild = m_pParent;
+        break;
+      case FX_XmlSyntaxResult::CData:
+        m_ws1 = m_pParser->GetTextData();
+        m_pChild = new CFX_XMLCharData(m_ws1);
+        m_pParent->InsertChildNode(m_pChild);
+        m_pChild = m_pParent;
+        break;
+      case FX_XmlSyntaxResult::TargetData:
+        if (m_pChild) {
+          if (m_pChild->GetType() != FX_XMLNODE_Instruction) {
+            m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+            break;
+          }
+          auto* instruction = static_cast<CFX_XMLInstruction*>(m_pChild);
+          // A pending attribute name with no value is kept as data too.
+          if (!m_ws1.IsEmpty())
+            instruction->AppendData(m_ws1);
+          instruction->AppendData(m_pParser->GetTargetData());
+        }
+        m_ws1.clear();
+        break;
+      default:
+        break;
+    }
+    if (m_syntaxParserResult == FX_XmlSyntaxResult::Error ||
+        m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) {
+      break;
+    }
+    if (pPause && iCount > 500 && pPause->NeedToPauseNow()) {
+      break;
+    }
+  }
+  return (m_syntaxParserResult == FX_XmlSyntaxResult::Error ||
+          m_NodeStack.size() != 1)
+             ? -1
+             : m_pParser->GetStatus();
+}
diff --git a/core/fxcrt/xml/cfx_xmlparser.h b/core/fxcrt/xml/cfx_xmlparser.h
new file mode 100644
index 0000000000..dc3cc4c297
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlparser.h
@@ -0,0 +1,47 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLPARSER_H_
+#define CORE_FXCRT_XML_CFX_XMLPARSER_H_
+
+#include <memory>
+#include <stack>
+
+#include "core/fxcrt/cfx_retain_ptr.h"
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
+
+class CFX_XMLElement;
+class CFX_XMLNode;
+class CFX_SeekableStreamProxy;
+class IFX_Pause;
+
+// Builds a CFX_XMLNode tree from a stream by consuming the tokens produced by
+// CFX_XMLSyntaxParser.
+class CFX_XMLParser {
+ public:
+ // |pParent| receives the parsed nodes; |pStream| supplies the XML text.
+ CFX_XMLParser(CFX_XMLNode* pParent,
+ const CFX_RetainPtr<CFX_SeekableStreamProxy>& pStream);
+ ~CFX_XMLParser();
+
+ // Parses until end of input, an error, or a pause requested via |pPause|.
+ // Returns -1 on error, otherwise a progress value (100 once finished).
+ int32_t DoParser(IFX_Pause* pPause);
+
+ // Byte offset and byte size of the "template" (index 0) and "datasets"
+ // (index 1) elements, filled in by DoParser() when those tags are met.
+ FX_FILESIZE m_nStart[2];
+ size_t m_nSize[2];
+ // Character position where the most recent second-level element started.
+ FX_FILESIZE m_nElementStart;
+ // Bit 0x01: "template" seen; bit 0x02: "datasets" seen.
+ uint16_t m_dwCheckStatus;
+ // 0x01 / 0x02 while inside the corresponding element, 0 otherwise.
+ uint16_t m_dwCurrentCheckStatus;
+
+ private:
+ CFX_RetainPtr<CFX_SeekableStreamProxy> m_pStream;
+ std::unique_ptr<CFX_XMLSyntaxParser> m_pParser;
+ // Node that new children are currently attached to.
+ CFX_XMLNode* m_pParent;
+ // Node most recently created or made current; may be null.
+ CFX_XMLNode* m_pChild;
+ // Open-element stack; the bottom entry is the constructor's |pParent|.
+ std::stack<CFX_XMLNode*> m_NodeStack;
+ // Scratch strings reused between tokens (e.g. pending attribute name).
+ CFX_WideString m_ws1;
+ CFX_WideString m_ws2;
+ // Last token returned by the syntax parser; Error / EndOfString are sticky.
+ FX_XmlSyntaxResult m_syntaxParserResult;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLPARSER_H_
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp
new file mode 100644
index 0000000000..e7bef71085
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp
@@ -0,0 +1,698 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
+
+#include <algorithm>
+
+#include "core/fxcrt/fx_ext.h"
+#include "core/fxcrt/fx_safe_types.h"
+
+namespace {
+
+// Largest value accepted from a numeric character reference ("&#...;");
+// ParseTextChar() replaces anything larger with a space.
+const uint32_t kMaxCharRange = 0x10ffff;
+
+// Returns true when |ch| is one of the four XML white-space characters:
+// space, horizontal tab, line feed or carriage return.
+bool IsXMLWhiteSpace(wchar_t ch) {
+  switch (ch) {
+    case L' ':
+    case 0x09:
+    case 0x0A:
+    case 0x0D:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// One inclusive range [wStart, wEnd] of characters permitted in an XML name.
+struct FX_XMLNAMECHAR {
+ uint16_t wStart;
+ uint16_t wEnd;
+ // True if characters in this range may also begin a name.
+ bool bStartChar;
+};
+
+// Table of name-character ranges. The entries are listed in ascending,
+// non-overlapping order because IsXMLNameChar() binary-searches this array;
+// keep it sorted when editing.
+const FX_XMLNAMECHAR g_XMLNameChars[] = {
+ {L'-', L'.', false}, {L'0', L'9', false}, {L':', L':', false},
+ {L'A', L'Z', true}, {L'_', L'_', true}, {L'a', L'z', true},
+ {0xB7, 0xB7, false}, {0xC0, 0xD6, true}, {0xD8, 0xF6, true},
+ {0xF8, 0x02FF, true}, {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
+ {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true}, {0x203F, 0x2040, false},
+ {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true}, {0x3001, 0xD7FF, true},
+ {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
+};
+
+// Reports whether |ch| may appear in an XML name. When |bFirstChar| is true
+// the character must additionally be allowed as the first character of a
+// name. Performs a binary search over the sorted g_XMLNameChars table.
+bool IsXMLNameChar(wchar_t ch, bool bFirstChar) {
+  int32_t iLow = 0;
+  int32_t iHigh = FX_ArraySize(g_XMLNameChars) - 1;
+  while (iLow <= iHigh) {
+    const int32_t iProbe = (iLow + iHigh) / 2;
+    const FX_XMLNAMECHAR& range = g_XMLNameChars[iProbe];
+    if (ch < range.wStart) {
+      iHigh = iProbe - 1;
+      continue;
+    }
+    if (ch > range.wEnd) {
+      iLow = iProbe + 1;
+      continue;
+    }
+    // |ch| lies inside this range.
+    return !bFirstChar || range.bStartChar;
+  }
+  return false;
+}
+
+// Returns the number of bytes needed to encode the first |iSrcLen| characters
+// of |src| using the 1-to-6-byte UTF-8 scheme. Non-positive |iSrcLen| yields
+// 0. The caller must ensure |iSrcLen| does not exceed src.size().
+int32_t GetUTF8EncodeLength(const std::vector<wchar_t>& src,
+                            FX_FILESIZE iSrcLen) {
+  int32_t iTotalBytes = 0;
+  const wchar_t* pCursor = src.data();
+  for (; iSrcLen > 0; --iSrcLen) {
+    const uint32_t code = *pCursor++;
+    if (code < 0x80)
+      iTotalBytes += 1;
+    else if (code < 0x800)
+      iTotalBytes += 2;
+    else if (code < 0x10000)
+      iTotalBytes += 3;
+    else if (code < 0x200000)
+      iTotalBytes += 4;
+    else if (code < 0x4000000)
+      iTotalBytes += 5;
+    else
+      iTotalBytes += 6;
+  }
+  return iTotalBytes;
+}
+
+} // namespace
+
+// Prepares a tokenizer over |pStream| (must be non-null).
+//
+// The read window is 32 KiB of characters or the stream length, whichever is
+// smaller, plus one slot for a terminating NUL. Reading starts just past the
+// stream's byte-order mark. If the window size cannot be represented, the
+// parser is put into the sticky Error state and no buffers are set up.
+CFX_XMLSyntaxParser::CFX_XMLSyntaxParser(
+    const CFX_RetainPtr<CFX_SeekableStreamProxy>& pStream)
+    : m_pStream(pStream),
+      m_iXMLPlaneSize(32 * 1024),
+      m_iCurrentPos(0),
+      m_iCurrentNodeNum(-1),
+      m_iLastNodeNum(-1),
+      m_iParsedBytes(0),
+      m_ParsedChars(0),
+      m_iBufferChars(0),
+      m_bEOS(false),
+      m_Start(0),
+      m_End(0),
+      m_iAllocStep(m_BlockBuffer.GetAllocStep()),
+      m_pCurrentBlock(nullptr),
+      m_iIndexInBlock(0),
+      m_iTextDataLength(0),
+      m_syntaxParserResult(FX_XmlSyntaxResult::None),
+      m_syntaxParserState(FDE_XmlSyntaxState::Text),
+      m_wQuotationMark(0),
+      m_iEntityStart(-1),
+      // Give the skip delimiter a defined value instead of leaving it
+      // uninitialized until the first declaration is met.
+      m_SkipChar(0) {
+  ASSERT(pStream);
+
+  m_CurNode.iNodeNum = -1;
+  m_CurNode.eNodeType = FX_XMLNODE_Unknown;
+
+  m_iXMLPlaneSize =
+      std::min(m_iXMLPlaneSize,
+               pdfium::base::checked_cast<FX_STRSIZE>(m_pStream->GetLength()));
+  m_iCurrentPos = m_pStream->GetBOMLength();
+
+  FX_SAFE_STRSIZE alloc_size_safe = m_iXMLPlaneSize;
+  alloc_size_safe += 1;  // For NUL.
+  if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) {
+    m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+    return;
+  }
+
+  m_Buffer.resize(pdfium::base::ValueOrDieForType<size_t>(alloc_size_safe));
+
+  m_BlockBuffer.InitBuffer();
+  std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+      m_BlockBuffer.GetAvailableBlock();
+}
+
+// All members release their own resources.
+CFX_XMLSyntaxParser::~CFX_XMLSyntaxParser() = default;
+
+// Advances the tokenizer state machine until it has one token to report and
+// returns that token's kind. The token text, where there is any, is then
+// available through the Get*Name()/Get*Data() accessors.
+//
+// Characters are consumed from the window [m_Start, m_End) of m_Buffer, which
+// is refilled from the stream whenever it runs dry. Error and EndOfString are
+// sticky: once stored in m_syntaxParserResult, every later call returns them
+// immediately.
+FX_XmlSyntaxResult CFX_XMLSyntaxParser::DoSyntaxParse() {
+ if (m_syntaxParserResult == FX_XmlSyntaxResult::Error ||
+ m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) {
+ return m_syntaxParserResult;
+ }
+
+ int32_t iStreamLength = m_pStream->GetLength();
+ int32_t iPos;
+
+ FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None;
+ while (true) {
+ // Window exhausted: either finish or read the next chunk of characters.
+ if (m_Start >= m_End) {
+ if (m_bEOS || m_iCurrentPos >= iStreamLength) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
+ return m_syntaxParserResult;
+ }
+ // Account for the chunk just finished before replacing it.
+ m_ParsedChars += m_End;
+ m_iParsedBytes = m_iCurrentPos;
+ if (m_pStream->GetPosition() != m_iCurrentPos)
+ m_pStream->Seek(CFX_SeekableStreamProxy::Pos::Begin, m_iCurrentPos);
+
+ m_iBufferChars =
+ m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS);
+ iPos = m_pStream->GetPosition();
+ if (m_iBufferChars < 1) {
+ m_iCurrentPos = iStreamLength;
+ m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString;
+ return m_syntaxParserResult;
+ }
+ m_iCurrentPos = iPos;
+ m_Start = 0;
+ m_End = m_iBufferChars;
+ }
+
+ while (m_Start < m_End) {
+ wchar_t ch = m_Buffer[m_Start];
+ switch (m_syntaxParserState) {
+ // Character content between markup.
+ case FDE_XmlSyntaxState::Text:
+ if (ch == L'<') {
+ if (!m_BlockBuffer.IsEmpty()) {
+ // Report the pending text first. '<' is not consumed, so the next
+ // call comes back here with an empty buffer.
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_iEntityStart = -1;
+ syntaxParserResult = FX_XmlSyntaxResult::Text;
+ } else {
+ m_Start++;
+ m_syntaxParserState = FDE_XmlSyntaxState::Node;
+ }
+ } else {
+ ParseTextChar(ch);
+ }
+ break;
+ // Just after '<': decide what kind of markup follows.
+ case FDE_XmlSyntaxState::Node:
+ if (ch == L'!') {
+ m_Start++;
+ m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl;
+ } else if (ch == L'/') {
+ m_Start++;
+ m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
+ } else if (ch == L'?') {
+ m_iLastNodeNum++;
+ m_iCurrentNodeNum = m_iLastNodeNum;
+ m_CurNode.iNodeNum = m_iLastNodeNum;
+ m_CurNode.eNodeType = FX_XMLNODE_Instruction;
+ m_XMLNodeStack.push(m_CurNode);
+ m_Start++;
+ m_syntaxParserState = FDE_XmlSyntaxState::Target;
+ syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen;
+ } else {
+ // Start of an element. |ch| is left in place so that the Tag state
+ // reads it as the first character of the name.
+ m_iLastNodeNum++;
+ m_iCurrentNodeNum = m_iLastNodeNum;
+ m_CurNode.iNodeNum = m_iLastNodeNum;
+ m_CurNode.eNodeType = FX_XMLNODE_Element;
+ m_XMLNodeStack.push(m_CurNode);
+ m_syntaxParserState = FDE_XmlSyntaxState::Tag;
+ syntaxParserResult = FX_XmlSyntaxResult::ElementOpen;
+ }
+ break;
+ // Collect an instruction target or an element tag name.
+ case FDE_XmlSyntaxState::Target:
+ case FDE_XmlSyntaxState::Tag:
+ if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+ // An empty name is a syntax error.
+ if (m_BlockBuffer.IsEmpty()) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (m_syntaxParserState != FDE_XmlSyntaxState::Target)
+ syntaxParserResult = FX_XmlSyntaxResult::TagName;
+ else
+ syntaxParserResult = FX_XmlSyntaxResult::TargetName;
+
+ m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
+ } else {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ m_Start++;
+ }
+ break;
+ // Inside a tag or instruction, expecting an attribute name (or its end).
+ case FDE_XmlSyntaxState::AttriName:
+ if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) {
+ m_Start++;
+ break;
+ }
+ if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+ if (m_BlockBuffer.IsEmpty()) {
+ // No name started: this must be the end of the tag / instruction.
+ if (m_CurNode.eNodeType == FX_XMLNODE_Element) {
+ if (ch == L'>' || ch == L'/') {
+ m_syntaxParserState = FDE_XmlSyntaxState::BreakElement;
+ break;
+ }
+ } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+ if (ch == L'?') {
+ m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
+ m_Start++;
+ } else {
+ m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+ }
+ break;
+ }
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ } else {
+ // In an instruction, a name not followed by '=' or white space is
+ // treated as free-form target data instead of an attribute.
+ if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+ if (ch != '=' && !IsXMLWhiteSpace(ch)) {
+ m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+ break;
+ }
+ }
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign;
+ syntaxParserResult = FX_XmlSyntaxResult::AttriName;
+ }
+ } else {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ m_Start++;
+ }
+ break;
+ // After an attribute name: skip white space, then require '='.
+ case FDE_XmlSyntaxState::AttriEqualSign:
+ if (IsXMLWhiteSpace(ch)) {
+ m_Start++;
+ break;
+ }
+ if (ch != L'=') {
+ if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) {
+ m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+ break;
+ }
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ } else {
+ m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation;
+ m_Start++;
+ }
+ break;
+ // After '=': skip white space, then require an opening quote.
+ case FDE_XmlSyntaxState::AttriQuotation:
+ if (IsXMLWhiteSpace(ch)) {
+ m_Start++;
+ break;
+ }
+ if (ch != L'\"' && ch != L'\'') {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ } else {
+ m_wQuotationMark = ch;
+ m_syntaxParserState = FDE_XmlSyntaxState::AttriValue;
+ m_Start++;
+ }
+ break;
+ // Inside a quoted attribute value, up to the matching quote.
+ case FDE_XmlSyntaxState::AttriValue:
+ if (ch == m_wQuotationMark) {
+ // An entity reference still open at the closing quote is an error.
+ if (m_iEntityStart > -1) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_wQuotationMark = 0;
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_Start++;
+ m_syntaxParserState = FDE_XmlSyntaxState::AttriName;
+ syntaxParserResult = FX_XmlSyntaxResult::AttriValue;
+ } else {
+ ParseTextChar(ch);
+ }
+ break;
+ // A '?' was seen inside an instruction; '>' would complete "?>".
+ case FDE_XmlSyntaxState::CloseInstruction:
+ if (ch != L'>') {
+ // NOTE(review): |ch| is stored here without advancing m_Start, so
+ // the TargetData state will see the same character again. Confirm
+ // whether the intent was to store the preceding '?' instead.
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ m_syntaxParserState = FDE_XmlSyntaxState::TargetData;
+ } else if (!m_BlockBuffer.IsEmpty()) {
+ // Flush pending data first; '>' is handled on the next call.
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+ } else {
+ m_Start++;
+ if (m_XMLNodeStack.empty()) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ m_XMLNodeStack.pop();
+ if (!m_XMLNodeStack.empty()) {
+ m_CurNode = m_XMLNodeStack.top();
+ } else {
+ m_CurNode.iNodeNum = -1;
+ m_CurNode.eNodeType = FX_XMLNODE_Unknown;
+ }
+ m_iCurrentNodeNum = m_CurNode.iNodeNum;
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ syntaxParserResult = FX_XmlSyntaxResult::InstructionClose;
+ }
+ break;
+ // End of an opening tag: '>' opens content, '/' starts a "/>" close.
+ case FDE_XmlSyntaxState::BreakElement:
+ if (ch == L'>') {
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ syntaxParserResult = FX_XmlSyntaxResult::ElementBreak;
+ } else if (ch == L'/') {
+ m_syntaxParserState = FDE_XmlSyntaxState::CloseElement;
+ } else {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ m_Start++;
+ break;
+ // Closing tag: collect the (possibly empty) name up to '>'.
+ case FDE_XmlSyntaxState::CloseElement:
+ if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) {
+ if (ch == L'>') {
+ if (m_XMLNodeStack.empty()) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ m_XMLNodeStack.pop();
+ if (!m_XMLNodeStack.empty()) {
+ m_CurNode = m_XMLNodeStack.top();
+ } else {
+ m_CurNode.iNodeNum = -1;
+ m_CurNode.eNodeType = FX_XMLNODE_Unknown;
+ }
+ m_iCurrentNodeNum = m_CurNode.iNodeNum;
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ syntaxParserResult = FX_XmlSyntaxResult::ElementClose;
+ } else if (!IsXMLWhiteSpace(ch)) {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ } else {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ }
+ m_Start++;
+ break;
+ // Just after "<!": comment, CDATA section, or some other declaration.
+ // NOTE(review): these comparisons start at m_Start and do not consult
+ // m_End, so a marker split across two buffer refills presumably will
+ // not be recognised -- confirm.
+ case FDE_XmlSyntaxState::SkipCommentOrDecl:
+ if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) {
+ m_Start += 2;
+ m_syntaxParserState = FDE_XmlSyntaxState::SkipComment;
+ } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) ==
+ 0) {
+ m_Start += 7;
+ m_syntaxParserState = FDE_XmlSyntaxState::SkipCData;
+ } else {
+ m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode;
+ m_SkipChar = L'>';
+ m_SkipStack.push(L'>');
+ }
+ break;
+ // Inside a CDATA section: copy characters verbatim until "]]>".
+ case FDE_XmlSyntaxState::SkipCData: {
+ if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) {
+ m_Start += 3;
+ syntaxParserResult = FX_XmlSyntaxResult::CData;
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ } else {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock)
+ return FX_XmlSyntaxResult::Error;
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ m_Start++;
+ }
+ break;
+ }
+ // Inside a "<!...>" declaration. m_SkipStack holds the closing
+ // delimiters still owed; m_SkipChar is the one currently awaited.
+ case FDE_XmlSyntaxState::SkipDeclNode:
+ if (m_SkipChar == L'\'' || m_SkipChar == L'\"') {
+ // Inside a quoted string: ignore everything up to the closing quote.
+ m_Start++;
+ if (ch != m_SkipChar)
+ break;
+
+ m_SkipStack.pop();
+ if (m_SkipStack.empty())
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ else
+ m_SkipChar = m_SkipStack.top();
+ } else {
+ switch (ch) {
+ case L'<':
+ m_SkipChar = L'>';
+ m_SkipStack.push(L'>');
+ break;
+ case L'[':
+ m_SkipChar = L']';
+ m_SkipStack.push(L']');
+ break;
+ case L'(':
+ m_SkipChar = L')';
+ m_SkipStack.push(L')');
+ break;
+ case L'\'':
+ m_SkipChar = L'\'';
+ m_SkipStack.push(L'\'');
+ break;
+ case L'\"':
+ m_SkipChar = L'\"';
+ m_SkipStack.push(L'\"');
+ break;
+ default:
+ if (ch == m_SkipChar) {
+ m_SkipStack.pop();
+ if (m_SkipStack.empty()) {
+ // Outermost delimiter closed: the declaration is over. Its
+ // text is discarded; no token is reported for it.
+ if (m_BlockBuffer.GetDataLength() >= 9)
+ (void)m_BlockBuffer.GetTextData(0, 7);
+
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ } else {
+ m_SkipChar = m_SkipStack.top();
+ }
+ }
+ break;
+ }
+ if (!m_SkipStack.empty()) {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ }
+ m_Start++;
+ }
+ break;
+ // Inside a comment: drop characters until "-->" (3 chars consumed).
+ case FDE_XmlSyntaxState::SkipComment:
+ if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) {
+ m_Start += 2;
+ m_syntaxParserState = FDE_XmlSyntaxState::Text;
+ }
+
+ m_Start++;
+ break;
+ // Free-form data of a processing instruction. Unquoted white space and
+ // a closing double quote each end one TargetData token.
+ case FDE_XmlSyntaxState::TargetData:
+ if (IsXMLWhiteSpace(ch)) {
+ if (m_BlockBuffer.IsEmpty()) {
+ m_Start++;
+ break;
+ } else if (m_wQuotationMark == 0) {
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_wQuotationMark = 0;
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_Start++;
+ syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+ break;
+ }
+ }
+ if (ch == '?') {
+ m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction;
+ m_Start++;
+ } else if (ch == '\"') {
+ if (m_wQuotationMark == 0) {
+ m_wQuotationMark = ch;
+ m_Start++;
+ } else if (ch == m_wQuotationMark) {
+ m_iTextDataLength = m_BlockBuffer.GetDataLength();
+ m_wQuotationMark = 0;
+ m_BlockBuffer.Reset(true);
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_Start++;
+ syntaxParserResult = FX_XmlSyntaxResult::TargetData;
+ } else {
+ m_syntaxParserResult = FX_XmlSyntaxResult::Error;
+ return m_syntaxParserResult;
+ }
+ } else {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ if (!m_pCurrentBlock) {
+ return FX_XmlSyntaxResult::Error;
+ }
+ }
+ m_pCurrentBlock[m_iIndexInBlock++] = ch;
+ m_BlockBuffer.IncrementDataLength();
+ m_Start++;
+ }
+ break;
+ default:
+ break;
+ }
+ // Hand a completed token back to the caller right away.
+ if (syntaxParserResult != FX_XmlSyntaxResult::None)
+ return syntaxParserResult;
+ }
+ }
+ // Not reached: the loop above only exits through return statements.
+ return FX_XmlSyntaxResult::Text;
+}
+
+// Reports parsing progress.
+//
+// Returns -1 if there is no stream or a syntax error was recorded, 100 if the
+// stream is empty or the end of input was reached, and otherwise the
+// percentage of stream bytes consumed so far.
+int32_t CFX_XMLSyntaxParser::GetStatus() const {
+  if (!m_pStream)
+    return -1;
+
+  int32_t iStreamLength = m_pStream->GetLength();
+  if (iStreamLength < 1)
+    return 100;
+
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::Error)
+    return -1;
+
+  if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString)
+    return 100;
+
+  // Widen before multiplying: m_iParsedBytes * 100 overflows a signed 32-bit
+  // int once roughly 21 MB have been parsed.
+  return static_cast<int32_t>(static_cast<int64_t>(m_iParsedBytes) * 100 /
+                              iStreamLength);
+}
+
+// Returns the current read position as a byte offset: the bytes consumed
+// before the current buffer plus the UTF-8 length of the buffer characters
+// already processed. Returns 0 when there is no stream.
+FX_FILESIZE CFX_XMLSyntaxParser::GetCurrentBinaryPos() const {
+  return m_pStream ? m_iParsedBytes + GetUTF8EncodeLength(m_Buffer, m_Start)
+                   : 0;
+}
+
+// Appends |character| to the block buffer and consumes it from the input.
+// If the character is the ';' that ends an entity reference opened by an
+// earlier '&', the reference text already in the buffer is replaced by the
+// character it denotes. Numeric references ("&#NN;", "&#xHH;") and the five
+// predefined names amp, lt, gt, apos and quot are understood.
+void CFX_XMLSyntaxParser::ParseTextChar(wchar_t character) {
+ if (m_iIndexInBlock == m_iAllocStep) {
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ // NOTE(review): this early return leaves m_Start unchanged, so the caller
+ // will present the same character again -- confirm this is acceptable.
+ if (!m_pCurrentBlock)
+ return;
+ }
+
+ m_pCurrentBlock[m_iIndexInBlock++] = character;
+ m_BlockBuffer.IncrementDataLength();
+ if (m_iEntityStart > -1 && character == L';') {
+ // Text strictly between the '&' at m_iEntityStart and this ';'.
+ CFX_WideString csEntity = m_BlockBuffer.GetTextData(
+ m_iEntityStart + 1,
+ m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1);
+ int32_t iLen = csEntity.GetLength();
+ if (iLen > 0) {
+ if (csEntity[0] == L'#') {
+ // Numeric character reference.
+ uint32_t ch = 0;
+ wchar_t w;
+ if (iLen > 1 && csEntity[1] == L'x') {
+ // Hexadecimal digits; parsing stops at the first non-hex character.
+ for (int32_t i = 2; i < iLen; i++) {
+ w = csEntity[i];
+ if (w >= L'0' && w <= L'9') {
+ ch = (ch << 4) + w - L'0';
+ } else if (w >= L'A' && w <= L'F') {
+ ch = (ch << 4) + w - 55; // 55 == 'A' - 10
+ } else if (w >= L'a' && w <= L'f') {
+ ch = (ch << 4) + w - 87; // 87 == 'a' - 10
+ } else {
+ break;
+ }
+ }
+ } else {
+ // Decimal digits; parsing stops at the first non-digit.
+ for (int32_t i = 1; i < iLen; i++) {
+ w = csEntity[i];
+ if (w < L'0' || w > L'9')
+ break;
+ ch = ch * 10 + w - L'0';
+ }
+ }
+ // Out-of-range values become a space.
+ if (ch > kMaxCharRange)
+ ch = ' ';
+
+ character = static_cast<wchar_t>(ch);
+ // A zero result is not stored; the whole reference is then removed by
+ // the deletion step below.
+ if (character != 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, character);
+ m_iEntityStart++;
+ }
+ } else {
+ // Predefined named entities. Unknown names store nothing.
+ if (csEntity.Compare(L"amp") == 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, L'&');
+ m_iEntityStart++;
+ } else if (csEntity.Compare(L"lt") == 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, L'<');
+ m_iEntityStart++;
+ } else if (csEntity.Compare(L"gt") == 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, L'>');
+ m_iEntityStart++;
+ } else if (csEntity.Compare(L"apos") == 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, L'\'');
+ m_iEntityStart++;
+ } else if (csEntity.Compare(L"quot") == 0) {
+ m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"');
+ m_iEntityStart++;
+ }
+ }
+ }
+ // Drop what is left of the reference text after the replacement character
+ // (or the entire reference when nothing was stored). Presumably
+ // DeleteTextChars() removes that many characters from the end -- confirm.
+ if (m_iEntityStart >= 0 &&
+ m_BlockBuffer.GetDataLength() > static_cast<size_t>(m_iEntityStart)) {
+ m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() -
+ m_iEntityStart);
+ }
+ // The write position moved; fetch it again.
+ std::tie(m_pCurrentBlock, m_iIndexInBlock) =
+ m_BlockBuffer.GetAvailableBlock();
+ m_iEntityStart = -1;
+ } else if (m_iEntityStart < 0 && character == L'&') {
+ // Remember where the '&' just stored sits in the buffer.
+ m_iEntityStart = m_BlockBuffer.GetDataLength() - 1;
+ }
+ m_Start++;
+}
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.h b/core/fxcrt/xml/cfx_xmlsyntaxparser.h
new file mode 100644
index 0000000000..519f2833d6
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlsyntaxparser.h
@@ -0,0 +1,128 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_
+#define CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_
+
+#include <stack>
+#include <vector>
+
+#include "core/fxcrt/cfx_blockbuffer.h"
+#include "core/fxcrt/cfx_retain_ptr.h"
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+
+// Token kinds reported by CFX_XMLSyntaxParser::DoSyntaxParse().
+enum class FX_XmlSyntaxResult {
+ // No token yet (initial value).
+ None,
+ // "<?" was read.
+ InstructionOpen,
+ // "?>" was read.
+ InstructionClose,
+ // '<' starting an element was read.
+ ElementOpen,
+ // '>' ending an opening tag was read.
+ ElementBreak,
+ // A closing tag ("</name>") or the end of "<name/>" was read.
+ ElementClose,
+ // The target name of a processing instruction is available.
+ TargetName,
+ // An element tag name is available.
+ TagName,
+ // An attribute name is available.
+ AttriName,
+ // An attribute value is available.
+ AttriValue,
+ // Character content is available.
+ Text,
+ // The content of a CDATA section is available.
+ CData,
+ // A piece of processing-instruction data is available.
+ TargetData,
+ // A syntax error occurred; sticky.
+ Error,
+ // End of input reached; sticky.
+ EndOfString
+};
+
+// Pull-style XML tokenizer: each DoSyntaxParse() call returns the kind of the
+// next token, whose text can then be fetched with the matching accessor.
+class CFX_XMLSyntaxParser {
+ public:
+ explicit CFX_XMLSyntaxParser(
+ const CFX_RetainPtr<CFX_SeekableStreamProxy>& pStream);
+ ~CFX_XMLSyntaxParser();
+
+ // Returns the next token kind; Error and EndOfString repeat once reached.
+ FX_XmlSyntaxResult DoSyntaxParse();
+
+ // Progress as a percentage, or -1 on error / missing stream.
+ int32_t GetStatus() const;
+ // Current position measured in characters from the start of parsing.
+ FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; }
+ // Current position measured in bytes.
+ FX_FILESIZE GetCurrentBinaryPos() const;
+ int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; }
+ int32_t GetLastNodeNumber() const { return m_iLastNodeNum; }
+
+ // The accessors below are interchangeable in effect: each returns the first
+ // m_iTextDataLength characters of the block buffer, i.e. the text of the
+ // token most recently reported. Use the one that names that token.
+ CFX_WideString GetTargetName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ CFX_WideString GetTagName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ CFX_WideString GetAttributeName() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ CFX_WideString GetAttributeValue() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ CFX_WideString GetTextData() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ CFX_WideString GetTargetData() const {
+ return m_BlockBuffer.GetTextData(0, m_iTextDataLength);
+ }
+
+ protected:
+ // States of the tokenizer. Not every enumerator is handled by
+ // DoSyntaxParse(); unhandled ones fall into its default branch.
+ enum class FDE_XmlSyntaxState {
+ Text,
+ Node,
+ Target,
+ Tag,
+ AttriName,
+ AttriEqualSign,
+ AttriQuotation,
+ AttriValue,
+ Entity,
+ EntityDecimal,
+ EntityHex,
+ CloseInstruction,
+ BreakElement,
+ CloseElement,
+ SkipDeclNode,
+ DeclCharData,
+ SkipComment,
+ SkipCommentOrDecl,
+ SkipCData,
+ TargetData
+ };
+
+ // Stores one character of text / attribute value and resolves entity
+ // references when their closing ';' arrives.
+ void ParseTextChar(wchar_t ch);
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> m_pStream;
+ // Maximum number of characters read from the stream per refill.
+ FX_STRSIZE m_iXMLPlaneSize;
+ // Stream position from which the next refill will read.
+ int32_t m_iCurrentPos;
+ int32_t m_iCurrentNodeNum;
+ int32_t m_iLastNodeNum;
+ // Stream position at which the current buffer contents began.
+ int32_t m_iParsedBytes;
+ // Characters consumed in all buffers before the current one.
+ FX_FILESIZE m_ParsedChars;
+ std::vector<wchar_t> m_Buffer;
+ // Number of characters delivered by the last refill.
+ int32_t m_iBufferChars;
+ // Set by the stream read when it reports end of stream.
+ bool m_bEOS;
+ FX_FILESIZE m_Start; // Start position in m_Buffer
+ FX_FILESIZE m_End; // End position in m_Buffer
+ // Node currently open, mirrored from the top of m_XMLNodeStack.
+ FX_XMLNODE m_CurNode;
+ std::stack<FX_XMLNODE> m_XMLNodeStack;
+ // Accumulates the text of the token being built.
+ CFX_BlockBuffer m_BlockBuffer;
+ // Block capacity; reaching it triggers a request for a new block.
+ int32_t m_iAllocStep;
+ wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer
+ // Next write index within m_pCurrentBlock.
+ int32_t m_iIndexInBlock;
+ // Length of the token text exposed by the Get*() accessors.
+ int32_t m_iTextDataLength;
+ // Sticky Error / EndOfString result, None otherwise.
+ FX_XmlSyntaxResult m_syntaxParserResult;
+ FDE_XmlSyntaxState m_syntaxParserState;
+ // Quote character that opened the current value, or 0 if none.
+ wchar_t m_wQuotationMark;
+ // Buffer index of the '&' of an open entity reference, or -1.
+ int32_t m_iEntityStart;
+ // Closing delimiters still expected while skipping a declaration.
+ std::stack<wchar_t> m_SkipStack;
+ // Closing delimiter currently awaited (top of m_SkipStack).
+ wchar_t m_SkipChar;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_
diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp
new file mode 100644
index 0000000000..60c0d34655
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp
@@ -0,0 +1,527 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h"
+
+#include <memory>
+
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
+#include "core/fxcrt/fx_codepage.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "testing/test_support.h"
+// Fixture for the CFX_XMLSyntaxParser tests; adds nothing to pdfium::FPDF_Test.
+class CFX_XMLSyntaxParserTest : public pdfium::FPDF_Test {};
+// A CDATA section comes back as one CData result whose text omits the markers.
+TEST_F(CFX_XMLSyntaxParserTest, CData) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![CDATA[\n"
+ " if (a[1] < 3)\n"
+ " app.alert(\"Tclams\");\n"
+ " ]]>\n"
+ "</script>";
+
+ const wchar_t* cdata =
+ L"\n"
+ L" if (a[1] < 3)\n"
+ L" app.alert(\"Tclams\");\n"
+ L" ";
+ // Wrap the input bytes in a stream flagged as UTF-8.
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The payload is expected verbatim, including its less-than sign.
+ ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse());
+ ASSERT_EQ(cdata, parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A closing script tag written inside CDATA stays part of the CData text.
+TEST_F(CFX_XMLSyntaxParserTest, CDataWithInnerScript) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![CDATA[\n"
+ " if (a[1] < 3)\n"
+ " app.alert(\"Tclams\");\n"
+ " </script>\n"
+ " ]]>\n"
+ "</script>";
+
+ const wchar_t* cdata =
+ L"\n"
+ L" if (a[1] < 3)\n"
+ L" app.alert(\"Tclams\");\n"
+ L" </script>\n"
+ L" ";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The embedded closing tag is expected inside the CData text, not as an ElementClose.
+ ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse());
+ ASSERT_EQ(cdata, parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// The <!> markup yields no result of its own; only the text around it is reported.
+TEST_F(CFX_XMLSyntaxParserTest, ArrowBangArrow) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!>\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // Nothing is reported for the <!> markup; the next result is the text after it.
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// After the <![> markup the next result is EndOfString; the closing tag is not reported.
+TEST_F(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![>\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+
+ // Parser walks to end of input.
+ // No Text or ElementClose result is expected for the remaining input.
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A CDATA opener missing its second bracket: the next result is EndOfString.
+TEST_F(CFX_XMLSyntaxParserTest, IncompleteCData) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![CDATA>\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+
+ // Parser walks to end of input.
+ // No CData, Text or ElementClose result is expected for the remaining input.
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A CDATA section that is opened but never closed: the next result is EndOfString.
+TEST_F(CFX_XMLSyntaxParserTest, UnClosedCData) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![CDATA[\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+
+ // Parser walks to end of input.
+ // No CData result is expected even though the opener itself is complete.
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// An empty CDATA section still produces a CData result, with empty text.
+TEST_F(CFX_XMLSyntaxParserTest, EmptyCData) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <![CDATA[]]>\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The CData result is not skipped for an empty section; its text is empty.
+ ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse());
+ ASSERT_EQ(L"", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A well-formed comment produces no result; only the text around it is reported.
+TEST_F(CFX_XMLSyntaxParserTest, Comment) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!-- A Comment -->\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The comment produces no result of its own; the trailing newline text is next.
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A comment opened with a single dash gives the same results as a well-formed one.
+TEST_F(CFX_XMLSyntaxParserTest, IncorrectCommentStart) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!- A Comment -->\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The malformed markup produces no result; the trailing newline text is next.
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// An empty comment (opener immediately followed by closer) produces no result.
+TEST_F(CFX_XMLSyntaxParserTest, CommentEmpty) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!---->\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // The empty comment produces no result; the trailing newline text is next.
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A comment opener followed by one dash and the closing bracket: EndOfString comes next.
+TEST_F(CFX_XMLSyntaxParserTest, CommentThreeDash) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!--->\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // Nothing after the three-dash markup is reported: the next result is EndOfString.
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// A comment opener closed by a lone bracket, with no closing dashes: EndOfString comes next.
+TEST_F(CFX_XMLSyntaxParserTest, CommentTwoDash) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">\n"
+ " <!-->\n"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"\n ", parser.GetTextData());
+ // Nothing after the two-dash markup is reported: the next result is EndOfString.
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// Decimal and hex character references are decoded into the element text.
+TEST_F(CFX_XMLSyntaxParserTest, Entities) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">"
+ "&#66;"
+ "&#x54;"
+ "&#x00000000000000000048;"
+ "&#x0000000000000000AB48;"
+ "&#x0000000000000000000;"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+ // Expected text: B (66), T (0x54), H (0x48), then 0xAB48; the all-zero reference adds no character.
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L"BTH\xab48", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// Hex character references with oversized values: checks the text that results.
+TEST_F(CFX_XMLSyntaxParserTest, EntityOverflowHex) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">"
+ "&#xaDBDFFFFF;"
+ "&#xafffffffffffffffffffffffffffffffff;"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+ // NOTE(review): both values need more than 32 bits; confirm the expected literal asserted below.
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L" ", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
+// Decimal character references with oversized values: checks the text that results.
+TEST_F(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) {
+ const char* input =
+ "<script contentType=\"application/x-javascript\">"
+ "&#2914910205;"
+ "&#29149102052342342134521341234512351234213452315;"
+ "</script>";
+
+ CFX_RetainPtr<CFX_SeekableStreamProxy> stream =
+ pdfium::MakeRetain<CFX_SeekableStreamProxy>(
+ reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input));
+ stream->SetCodePage(FX_CODEPAGE_UTF8);
+
+ CFX_XMLSyntaxParser parser(stream);
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse());
+ ASSERT_EQ(L"contentType", parser.GetAttributeName());
+ ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse());
+ ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue());
+ // NOTE(review): the first value exceeds 2147483647 and the second is far larger; confirm the literal below.
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse());
+ ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse());
+ ASSERT_EQ(L" ", parser.GetTextData());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse());
+ ASSERT_EQ(L"script", parser.GetTagName());
+
+ ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse());
+}
diff --git a/core/fxcrt/xml/cfx_xmltext.cpp b/core/fxcrt/xml/cfx_xmltext.cpp
new file mode 100644
index 0000000000..4c41fcd9f8
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmltext.cpp
@@ -0,0 +1,22 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/xml/cfx_xmltext.h"
+
+#include "third_party/base/ptr_util.h"
+// Builds a text node whose m_wsText is initialised from |wsText|.
+CFX_XMLText::CFX_XMLText(const CFX_WideString& wsText)
+ : CFX_XMLNode(), m_wsText(wsText) {}
+// Empty body: no explicit cleanup is performed here.
+CFX_XMLText::~CFX_XMLText() {}
+// Identifies this node as FX_XMLNODE_Text.
+FX_XMLNODETYPE CFX_XMLText::GetType() const {
+ return FX_XMLNODE_Text;
+}
+// Returns a newly allocated CFX_XMLText constructed from m_wsText.
+std::unique_ptr<CFX_XMLNode> CFX_XMLText::Clone() {
+ return pdfium::MakeUnique<CFX_XMLText>(m_wsText);
+}
diff --git a/core/fxcrt/xml/cfx_xmltext.h b/core/fxcrt/xml/cfx_xmltext.h
new file mode 100644
index 0000000000..20fb858302
--- /dev/null
+++ b/core/fxcrt/xml/cfx_xmltext.h
@@ -0,0 +1,31 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_XML_CFX_XMLTEXT_H_
+#define CORE_FXCRT_XML_CFX_XMLTEXT_H_
+
+#include <memory>
+
+#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/xml/cfx_xmlnode.h"
+// XML node that holds a wide-string text value in m_wsText.
+class CFX_XMLText : public CFX_XMLNode {
+ public:
+ explicit CFX_XMLText(const CFX_WideString& wsText);
+ ~CFX_XMLText() override;
+
+ // CFX_XMLNode
+ FX_XMLNODETYPE GetType() const override;
+ std::unique_ptr<CFX_XMLNode> Clone() override;
+ // Accessors for the stored text; GetText() returns it by value.
+ CFX_WideString GetText() const { return m_wsText; }
+ void SetText(const CFX_WideString& wsText) { m_wsText = wsText; }
+
+ private:
+ CFX_WideString m_wsText;
+};
+
+#endif // CORE_FXCRT_XML_CFX_XMLTEXT_H_