From 0d86ecb08e1b2c204333b1f1f6b0b014e5b2971c Mon Sep 17 00:00:00 2001 From: Dan Sinclair Date: Wed, 19 Apr 2017 09:19:57 -0400 Subject: Move fde XML parser to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL moves the XML parser from FDE into FXCRT and renames to CFX_ from CFDE_. Change-Id: I21a9590bf74daf5517df630d7e7a5de89da99ea4 Reviewed-on: https://pdfium-review.googlesource.com/4312 Commit-Queue: dsinclair Reviewed-by: Tom Sepez Reviewed-by: Nicolás Peña --- core/fxcrt/xml/cfx_xmlattributenode.cpp | 35 ++ core/fxcrt/xml/cfx_xmlattributenode.h | 44 ++ core/fxcrt/xml/cfx_xmlchardata.cpp | 22 + core/fxcrt/xml/cfx_xmlchardata.h | 24 + core/fxcrt/xml/cfx_xmldoc.cpp | 160 ++++++ core/fxcrt/xml/cfx_xmldoc.h | 37 ++ core/fxcrt/xml/cfx_xmlelement.cpp | 102 ++++ core/fxcrt/xml/cfx_xmlelement.h | 33 ++ core/fxcrt/xml/cfx_xmlinstruction.cpp | 36 ++ core/fxcrt/xml/cfx_xmlinstruction.h | 35 ++ core/fxcrt/xml/cfx_xmlnode.cpp | 441 +++++++++++++++ core/fxcrt/xml/cfx_xmlnode.h | 75 +++ core/fxcrt/xml/cfx_xmlparser.cpp | 171 ++++++ core/fxcrt/xml/cfx_xmlparser.h | 47 ++ core/fxcrt/xml/cfx_xmlsyntaxparser.cpp | 698 ++++++++++++++++++++++++ core/fxcrt/xml/cfx_xmlsyntaxparser.h | 128 +++++ core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp | 527 ++++++++++++++++++ core/fxcrt/xml/cfx_xmltext.cpp | 22 + core/fxcrt/xml/cfx_xmltext.h | 31 ++ 19 files changed, 2668 insertions(+) create mode 100644 core/fxcrt/xml/cfx_xmlattributenode.cpp create mode 100644 core/fxcrt/xml/cfx_xmlattributenode.h create mode 100644 core/fxcrt/xml/cfx_xmlchardata.cpp create mode 100644 core/fxcrt/xml/cfx_xmlchardata.h create mode 100644 core/fxcrt/xml/cfx_xmldoc.cpp create mode 100644 core/fxcrt/xml/cfx_xmldoc.h create mode 100644 core/fxcrt/xml/cfx_xmlelement.cpp create mode 100644 core/fxcrt/xml/cfx_xmlelement.h create mode 100644 core/fxcrt/xml/cfx_xmlinstruction.cpp create mode 100644 core/fxcrt/xml/cfx_xmlinstruction.h create mode 100644 
core/fxcrt/xml/cfx_xmlnode.cpp create mode 100644 core/fxcrt/xml/cfx_xmlnode.h create mode 100644 core/fxcrt/xml/cfx_xmlparser.cpp create mode 100644 core/fxcrt/xml/cfx_xmlparser.h create mode 100644 core/fxcrt/xml/cfx_xmlsyntaxparser.cpp create mode 100644 core/fxcrt/xml/cfx_xmlsyntaxparser.h create mode 100644 core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp create mode 100644 core/fxcrt/xml/cfx_xmltext.cpp create mode 100644 core/fxcrt/xml/cfx_xmltext.h (limited to 'core/fxcrt/xml') diff --git a/core/fxcrt/xml/cfx_xmlattributenode.cpp b/core/fxcrt/xml/cfx_xmlattributenode.cpp new file mode 100644 index 0000000000..9c81efc109 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlattributenode.cpp @@ -0,0 +1,35 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlattributenode.h" + +#include "core/fxcrt/fx_ext.h" + +CFX_XMLAttributeNode::CFX_XMLAttributeNode(const CFX_WideString& name) + : CFX_XMLNode(), name_(name) { + ASSERT(name_.GetLength() > 0); +} + +CFX_XMLAttributeNode::~CFX_XMLAttributeNode() {} + +bool CFX_XMLAttributeNode::HasAttribute(const CFX_WideString& name) const { + return attrs_.find(name) != attrs_.end(); +} + +CFX_WideString CFX_XMLAttributeNode::GetString( + const CFX_WideString& name) const { + auto it = attrs_.find(name); + return it != attrs_.end() ? 
it->second : CFX_WideString(); +} + +void CFX_XMLAttributeNode::SetString(const CFX_WideString& name, + const CFX_WideString& value) { + attrs_[name] = value; +} + +void CFX_XMLAttributeNode::RemoveAttribute(const CFX_WideString& name) { + attrs_.erase(name); +} diff --git a/core/fxcrt/xml/cfx_xmlattributenode.h b/core/fxcrt/xml/cfx_xmlattributenode.h new file mode 100644 index 0000000000..e8f358eb59 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlattributenode.h @@ -0,0 +1,44 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_ +#define CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_ + +#include +#include + +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" + +class CFX_XMLAttributeNode : public CFX_XMLNode { + public: + explicit CFX_XMLAttributeNode(const CFX_WideString& name); + ~CFX_XMLAttributeNode() override; + + // CFX_XMLNode + FX_XMLNODETYPE GetType() const override = 0; + std::unique_ptr Clone() override = 0; + + CFX_WideString GetName() const { return name_; } + const std::map& GetAttributes() const { + return attrs_; + } + void SetAttributes(const std::map& attrs) { + attrs_ = attrs; + } + bool HasAttribute(const CFX_WideString& name) const; + + void SetString(const CFX_WideString& name, const CFX_WideString& value); + CFX_WideString GetString(const CFX_WideString& name) const; + + void RemoveAttribute(const CFX_WideString& name); + + private: + CFX_WideString name_; + std::map attrs_; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLATTRIBUTENODE_H_ diff --git a/core/fxcrt/xml/cfx_xmlchardata.cpp b/core/fxcrt/xml/cfx_xmlchardata.cpp new file mode 100644 index 0000000000..185bd064df --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlchardata.cpp @@ -0,0 +1,22 @@ +// Copyright 2017 PDFium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlchardata.h" + +#include "third_party/base/ptr_util.h" + +CFX_XMLCharData::CFX_XMLCharData(const CFX_WideString& wsCData) + : CFX_XMLText(wsCData) {} + +CFX_XMLCharData::~CFX_XMLCharData() {} + +FX_XMLNODETYPE CFX_XMLCharData::GetType() const { + return FX_XMLNODE_CharData; +} + +std::unique_ptr CFX_XMLCharData::Clone() { + return pdfium::MakeUnique(GetText()); +} diff --git a/core/fxcrt/xml/cfx_xmlchardata.h b/core/fxcrt/xml/cfx_xmlchardata.h new file mode 100644 index 0000000000..c5c007be90 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlchardata.h @@ -0,0 +1,24 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLCHARDATA_H_ +#define CORE_FXCRT_XML_CFX_XMLCHARDATA_H_ + +#include + +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmltext.h" + +class CFX_XMLCharData : public CFX_XMLText { + public: + explicit CFX_XMLCharData(const CFX_WideString& wsCData); + ~CFX_XMLCharData() override; + + FX_XMLNODETYPE GetType() const override; + std::unique_ptr Clone() override; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLCHARDATA_H_ diff --git a/core/fxcrt/xml/cfx_xmldoc.cpp b/core/fxcrt/xml/cfx_xmldoc.cpp new file mode 100644 index 0000000000..4f58da91f2 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmldoc.cpp @@ -0,0 +1,160 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmldoc.h" + +#include +#include + +#include "core/fxcrt/fx_codepage.h" +#include "core/fxcrt/xml/cfx_xmlchardata.h" +#include "core/fxcrt/xml/cfx_xmlelement.h" +#include "core/fxcrt/xml/cfx_xmlinstruction.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" +#include "core/fxcrt/xml/cfx_xmltext.h" +#include "third_party/base/ptr_util.h" +#include "third_party/base/stl_util.h" + +CFX_XMLDoc::CFX_XMLDoc() + : m_iStatus(0), m_pRoot(pdfium::MakeUnique()) { + m_pRoot->InsertChildNode(new CFX_XMLInstruction(L"xml")); +} + +CFX_XMLDoc::~CFX_XMLDoc() {} + +bool CFX_XMLDoc::LoadXML(std::unique_ptr pXMLParser) { + if (!pXMLParser) + return false; + + m_iStatus = 0; + m_pStream.Reset(); + m_pRoot->DeleteChildren(); + m_pXMLParser = std::move(pXMLParser); + return true; +} + +int32_t CFX_XMLDoc::DoLoad(IFX_Pause* pPause) { + if (m_iStatus < 100) + m_iStatus = m_pXMLParser->DoParser(pPause); + + return m_iStatus; +} + +void CFX_XMLDoc::CloseXML() { + m_pXMLParser.reset(); +} + +void CFX_XMLDoc::SaveXMLNode( + const CFX_RetainPtr& pXMLStream, + CFX_XMLNode* pINode) { + CFX_XMLNode* pNode = (CFX_XMLNode*)pINode; + switch (pNode->GetType()) { + case FX_XMLNODE_Instruction: { + CFX_WideString ws; + CFX_XMLInstruction* pInstruction = (CFX_XMLInstruction*)pNode; + if (pInstruction->GetName().CompareNoCase(L"xml") == 0) { + ws = L"GetCodePage(); + if (wCodePage == FX_CODEPAGE_UTF16LE) { + ws += L"UTF-16"; + } else if (wCodePage == FX_CODEPAGE_UTF16BE) { + ws += L"UTF-16be"; + } else { + ws += L"UTF-8"; + } + ws += L"\"?>"; + pXMLStream->WriteString(ws.AsStringC()); + } else { + ws.Format(L"GetName().c_str()); + pXMLStream->WriteString(ws.AsStringC()); + + for (auto it : pInstruction->GetAttributes()) { + CFX_WideString wsValue = it.second; + wsValue.Replace(L"&", L"&"); + wsValue.Replace(L"<", L"<"); + wsValue.Replace(L">", L">"); + wsValue.Replace(L"\'", L"'"); + wsValue.Replace(L"\"", L"""); + + ws = L" "; + ws += it.first; + ws += 
L"=\""; + ws += wsValue; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + + for (auto target : pInstruction->GetTargetData()) { + ws = L" \""; + ws += target; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + ws = L"?>"; + pXMLStream->WriteString(ws.AsStringC()); + } + break; + } + case FX_XMLNODE_Element: { + CFX_WideString ws; + ws = L"<"; + ws += static_cast(pNode)->GetName(); + pXMLStream->WriteString(ws.AsStringC()); + + for (auto it : static_cast(pNode)->GetAttributes()) { + CFX_WideString wsValue = it.second; + wsValue.Replace(L"&", L"&"); + wsValue.Replace(L"<", L"<"); + wsValue.Replace(L">", L">"); + wsValue.Replace(L"\'", L"'"); + wsValue.Replace(L"\"", L"""); + + ws = L" "; + ws += it.first; + ws += L"=\""; + ws += wsValue; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + if (pNode->m_pChild) { + ws = L"\n>"; + pXMLStream->WriteString(ws.AsStringC()); + CFX_XMLNode* pChild = pNode->m_pChild; + while (pChild) { + SaveXMLNode(pXMLStream, static_cast(pChild)); + pChild = pChild->m_pNext; + } + ws = L"(pNode)->GetName(); + ws += L"\n>"; + } else { + ws = L"\n/>"; + } + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case FX_XMLNODE_Text: { + CFX_WideString ws = static_cast(pNode)->GetText(); + ws.Replace(L"&", L"&"); + ws.Replace(L"<", L"<"); + ws.Replace(L">", L">"); + ws.Replace(L"\'", L"'"); + ws.Replace(L"\"", L"""); + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case FX_XMLNODE_CharData: { + CFX_WideString ws = L"(pNode)->GetText(); + ws += L"]]>"; + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case FX_XMLNODE_Unknown: + default: + break; + } +} diff --git a/core/fxcrt/xml/cfx_xmldoc.h b/core/fxcrt/xml/cfx_xmldoc.h new file mode 100644 index 0000000000..5966c096ba --- /dev/null +++ b/core/fxcrt/xml/cfx_xmldoc.h @@ -0,0 +1,37 @@ +// Copyright 2017 PDFium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLDOC_H_ +#define CORE_FXCRT_XML_CFX_XMLDOC_H_ + +#include + +#include "core/fxcrt/cfx_retain_ptr.h" +#include "core/fxcrt/cfx_seekablestreamproxy.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" +#include "core/fxcrt/xml/cfx_xmlparser.h" + +class CFX_XMLDoc { + public: + CFX_XMLDoc(); + ~CFX_XMLDoc(); + + bool LoadXML(std::unique_ptr pXMLParser); + int32_t DoLoad(IFX_Pause* pPause); + void CloseXML(); + + CFX_XMLNode* GetRoot() const { return m_pRoot.get(); } + void SaveXMLNode(const CFX_RetainPtr& pXMLStream, + CFX_XMLNode* pNode); + + private: + int32_t m_iStatus; + std::unique_ptr m_pRoot; + std::unique_ptr m_pXMLParser; + CFX_RetainPtr m_pStream; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLDOC_H_ diff --git a/core/fxcrt/xml/cfx_xmlelement.cpp b/core/fxcrt/xml/cfx_xmlelement.cpp new file mode 100644 index 0000000000..c6b70e1cc4 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlelement.cpp @@ -0,0 +1,102 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlelement.h" + +#include "core/fxcrt/fx_ext.h" +#include "core/fxcrt/xml/cfx_xmlchardata.h" +#include "core/fxcrt/xml/cfx_xmltext.h" +#include "third_party/base/ptr_util.h" +#include "third_party/base/stl_util.h" + +CFX_XMLElement::CFX_XMLElement(const CFX_WideString& wsTag) + : CFX_XMLAttributeNode(wsTag) {} + +CFX_XMLElement::~CFX_XMLElement() {} + +FX_XMLNODETYPE CFX_XMLElement::GetType() const { + return FX_XMLNODE_Element; +} + +std::unique_ptr CFX_XMLElement::Clone() { + auto pClone = pdfium::MakeUnique(GetName()); + pClone->SetAttributes(GetAttributes()); + + CFX_WideString wsText; + CFX_XMLNode* pChild = m_pChild; + while (pChild) { + switch (pChild->GetType()) { + case FX_XMLNODE_Text: + wsText += static_cast(pChild)->GetText(); + break; + default: + break; + } + pChild = pChild->m_pNext; + } + pClone->SetTextData(wsText); + return pClone; +} + +CFX_WideString CFX_XMLElement::GetLocalTagName() const { + FX_STRSIZE iFind = GetName().Find(L':', 0); + if (iFind < 0) + return GetName(); + return GetName().Right(GetName().GetLength() - iFind - 1); +} + +CFX_WideString CFX_XMLElement::GetNamespacePrefix() const { + FX_STRSIZE iFind = GetName().Find(L':', 0); + if (iFind < 0) + return CFX_WideString(); + return GetName().Left(iFind); +} + +CFX_WideString CFX_XMLElement::GetNamespaceURI() const { + CFX_WideString wsAttri(L"xmlns"); + CFX_WideString wsPrefix = GetNamespacePrefix(); + if (wsPrefix.GetLength() > 0) { + wsAttri += L":"; + wsAttri += wsPrefix; + } + + auto* pNode = static_cast(this); + while (pNode) { + if (pNode->GetType() != FX_XMLNODE_Element) + break; + + auto* pElement = static_cast(pNode); + if (!pElement->HasAttribute(wsAttri)) { + pNode = pNode->GetNodeItem(CFX_XMLNode::Parent); + continue; + } + return pElement->GetString(wsAttri); + } + return CFX_WideString(); +} + +CFX_WideString CFX_XMLElement::GetTextData() const { + CFX_WideTextBuf buffer; + CFX_XMLNode* pChild = m_pChild; + 
while (pChild) { + switch (pChild->GetType()) { + case FX_XMLNODE_Text: + case FX_XMLNODE_CharData: + buffer << static_cast(pChild)->GetText(); + break; + default: + break; + } + pChild = pChild->m_pNext; + } + return buffer.MakeString(); +} + +void CFX_XMLElement::SetTextData(const CFX_WideString& wsText) { + if (wsText.GetLength() < 1) + return; + InsertChildNode(new CFX_XMLText(wsText)); +} diff --git a/core/fxcrt/xml/cfx_xmlelement.h b/core/fxcrt/xml/cfx_xmlelement.h new file mode 100644 index 0000000000..20780342af --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlelement.h @@ -0,0 +1,33 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLELEMENT_H_ +#define CORE_FXCRT_XML_CFX_XMLELEMENT_H_ + +#include +#include + +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlattributenode.h" + +class CFX_XMLElement : public CFX_XMLAttributeNode { + public: + explicit CFX_XMLElement(const CFX_WideString& wsTag); + ~CFX_XMLElement() override; + + // CFX_XMLNode + FX_XMLNODETYPE GetType() const override; + std::unique_ptr Clone() override; + + CFX_WideString GetLocalTagName() const; + CFX_WideString GetNamespacePrefix() const; + CFX_WideString GetNamespaceURI() const; + + CFX_WideString GetTextData() const; + void SetTextData(const CFX_WideString& wsText); +}; + +#endif // CORE_FXCRT_XML_CFX_XMLELEMENT_H_ diff --git a/core/fxcrt/xml/cfx_xmlinstruction.cpp b/core/fxcrt/xml/cfx_xmlinstruction.cpp new file mode 100644 index 0000000000..d07b92e3c1 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlinstruction.cpp @@ -0,0 +1,36 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlinstruction.h" + +#include "core/fxcrt/fx_ext.h" +#include "third_party/base/ptr_util.h" +#include "third_party/base/stl_util.h" + +CFX_XMLInstruction::CFX_XMLInstruction(const CFX_WideString& wsTarget) + : CFX_XMLAttributeNode(wsTarget) {} + +CFX_XMLInstruction::~CFX_XMLInstruction() {} + +FX_XMLNODETYPE CFX_XMLInstruction::GetType() const { + return FX_XMLNODE_Instruction; +} + +std::unique_ptr CFX_XMLInstruction::Clone() { + auto pClone = pdfium::MakeUnique(GetName()); + pClone->SetAttributes(GetAttributes()); + pClone->m_TargetData = m_TargetData; + return pClone; +} + +void CFX_XMLInstruction::AppendData(const CFX_WideString& wsData) { + m_TargetData.push_back(wsData); +} + +void CFX_XMLInstruction::RemoveData(int32_t index) { + if (pdfium::IndexInBounds(m_TargetData, index)) + m_TargetData.erase(m_TargetData.begin() + index); +} diff --git a/core/fxcrt/xml/cfx_xmlinstruction.h b/core/fxcrt/xml/cfx_xmlinstruction.h new file mode 100644 index 0000000000..99554fc239 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlinstruction.h @@ -0,0 +1,35 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_ +#define CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_ + +#include +#include + +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlattributenode.h" + +class CFX_XMLInstruction : public CFX_XMLAttributeNode { + public: + explicit CFX_XMLInstruction(const CFX_WideString& wsTarget); + ~CFX_XMLInstruction() override; + + // CFX_XMLNode + FX_XMLNODETYPE GetType() const override; + std::unique_ptr Clone() override; + + const std::vector& GetTargetData() const { + return m_TargetData; + } + void AppendData(const CFX_WideString& wsData); + void RemoveData(int32_t index); + + private: + std::vector m_TargetData; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLINSTRUCTION_H_ diff --git a/core/fxcrt/xml/cfx_xmlnode.cpp b/core/fxcrt/xml/cfx_xmlnode.cpp new file mode 100644 index 0000000000..47b3105f10 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlnode.cpp @@ -0,0 +1,441 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlnode.h" + +#include + +#include "core/fxcrt/fx_codepage.h" +#include "core/fxcrt/xml/cfx_xmlchardata.h" +#include "core/fxcrt/xml/cfx_xmlelement.h" +#include "core/fxcrt/xml/cfx_xmlinstruction.h" +#include "core/fxcrt/xml/cfx_xmltext.h" +#include "third_party/base/stl_util.h" + +CFX_XMLNode::CFX_XMLNode() + : m_pParent(nullptr), + m_pChild(nullptr), + m_pPrior(nullptr), + m_pNext(nullptr) {} + +FX_XMLNODETYPE CFX_XMLNode::GetType() const { + return FX_XMLNODE_Unknown; +} + +CFX_XMLNode::~CFX_XMLNode() { + DeleteChildren(); +} + +void CFX_XMLNode::DeleteChildren() { + CFX_XMLNode* pChild = m_pChild; + while (pChild) { + CFX_XMLNode* pNext = pChild->m_pNext; + delete pChild; + pChild = pNext; + } + m_pChild = nullptr; +} + +int32_t CFX_XMLNode::CountChildNodes() const { + int32_t iCount = 0; + CFX_XMLNode* pChild = m_pChild; + while (pChild) { + iCount++; + pChild = pChild->m_pNext; + } + return iCount; +} + +CFX_XMLNode* CFX_XMLNode::GetChildNode(int32_t index) const { + CFX_XMLNode* pChild = m_pChild; + while (pChild) { + if (index == 0) { + return pChild; + } + index--; + pChild = pChild->m_pNext; + } + return nullptr; +} + +int32_t CFX_XMLNode::GetChildNodeIndex(CFX_XMLNode* pNode) const { + int32_t index = 0; + CFX_XMLNode* pChild = m_pChild; + while (pChild) { + if (pChild == pNode) { + return index; + } + index++; + pChild = pChild->m_pNext; + } + return -1; +} + +CFX_XMLNode* CFX_XMLNode::GetPath(const wchar_t* pPath, + int32_t iLength, + bool bQualifiedName) const { + ASSERT(pPath); + if (iLength < 0) { + iLength = FXSYS_wcslen(pPath); + } + if (iLength == 0) { + return nullptr; + } + CFX_WideString csPath; + const wchar_t* pStart = pPath; + const wchar_t* pEnd = pPath + iLength; + wchar_t ch; + while (pStart < pEnd) { + ch = *pStart++; + if (ch == L'/') { + break; + } else { + csPath += ch; + } + } + iLength -= pStart - pPath; + CFX_XMLNode* pFind = nullptr; + if (csPath.GetLength() < 1) { + 
pFind = GetNodeItem(CFX_XMLNode::Root); + } else if (csPath.Compare(L"..") == 0) { + pFind = m_pParent; + } else if (csPath.Compare(L".") == 0) { + pFind = (CFX_XMLNode*)this; + } else { + CFX_WideString wsTag; + CFX_XMLNode* pNode = m_pChild; + while (pNode) { + if (pNode->GetType() == FX_XMLNODE_Element) { + if (bQualifiedName) + wsTag = static_cast(pNode)->GetName(); + else + wsTag = static_cast(pNode)->GetLocalTagName(); + + if (wsTag.Compare(csPath) == 0) { + if (iLength < 1) + pFind = pNode; + else + pFind = pNode->GetPath(pStart, iLength, bQualifiedName); + + if (pFind) + return pFind; + } + } + pNode = pNode->m_pNext; + } + } + if (!pFind || iLength < 1) + return pFind; + return pFind->GetPath(pStart, iLength, bQualifiedName); +} + +int32_t CFX_XMLNode::InsertChildNode(CFX_XMLNode* pNode, int32_t index) { + pNode->m_pParent = this; + if (!m_pChild) { + m_pChild = pNode; + pNode->m_pPrior = nullptr; + pNode->m_pNext = nullptr; + return 0; + } + if (index == 0) { + pNode->m_pNext = m_pChild; + pNode->m_pPrior = nullptr; + m_pChild->m_pPrior = pNode; + m_pChild = pNode; + return 0; + } + int32_t iCount = 0; + CFX_XMLNode* pFind = m_pChild; + while (++iCount != index && pFind->m_pNext) { + pFind = pFind->m_pNext; + } + pNode->m_pPrior = pFind; + pNode->m_pNext = pFind->m_pNext; + if (pFind->m_pNext) + pFind->m_pNext->m_pPrior = pNode; + pFind->m_pNext = pNode; + return iCount; +} + +void CFX_XMLNode::RemoveChildNode(CFX_XMLNode* pNode) { + ASSERT(m_pChild && pNode); + if (m_pChild == pNode) { + m_pChild = pNode->m_pNext; + } else { + pNode->m_pPrior->m_pNext = pNode->m_pNext; + } + if (pNode->m_pNext) + pNode->m_pNext->m_pPrior = pNode->m_pPrior; + pNode->m_pParent = nullptr; + pNode->m_pNext = nullptr; + pNode->m_pPrior = nullptr; +} + +CFX_XMLNode* CFX_XMLNode::GetNodeItem(CFX_XMLNode::NodeItem eItem) const { + switch (eItem) { + case CFX_XMLNode::Root: { + CFX_XMLNode* pParent = (CFX_XMLNode*)this; + while (pParent->m_pParent) { + pParent = 
pParent->m_pParent; + } + return pParent; + } + case CFX_XMLNode::Parent: + return m_pParent; + case CFX_XMLNode::FirstSibling: { + CFX_XMLNode* pItem = (CFX_XMLNode*)this; + while (pItem->m_pPrior) { + pItem = pItem->m_pPrior; + } + return pItem == (CFX_XMLNode*)this ? nullptr : pItem; + } + case CFX_XMLNode::PriorSibling: + return m_pPrior; + case CFX_XMLNode::NextSibling: + return m_pNext; + case CFX_XMLNode::LastSibling: { + CFX_XMLNode* pItem = (CFX_XMLNode*)this; + while (pItem->m_pNext) + pItem = pItem->m_pNext; + return pItem == (CFX_XMLNode*)this ? nullptr : pItem; + } + case CFX_XMLNode::FirstNeighbor: { + CFX_XMLNode* pParent = (CFX_XMLNode*)this; + while (pParent->m_pParent) + pParent = pParent->m_pParent; + return pParent == (CFX_XMLNode*)this ? nullptr : pParent; + } + case CFX_XMLNode::PriorNeighbor: { + if (!m_pPrior) + return m_pParent; + + CFX_XMLNode* pItem = m_pPrior; + while (pItem->m_pChild) { + pItem = pItem->m_pChild; + while (pItem->m_pNext) + pItem = pItem->m_pNext; + } + return pItem; + } + case CFX_XMLNode::NextNeighbor: { + if (m_pChild) + return m_pChild; + if (m_pNext) + return m_pNext; + CFX_XMLNode* pItem = m_pParent; + while (pItem) { + if (pItem->m_pNext) + return pItem->m_pNext; + pItem = pItem->m_pParent; + } + return nullptr; + } + case CFX_XMLNode::LastNeighbor: { + CFX_XMLNode* pItem = (CFX_XMLNode*)this; + while (pItem->m_pParent) { + pItem = pItem->m_pParent; + } + while (true) { + while (pItem->m_pNext) + pItem = pItem->m_pNext; + if (!pItem->m_pChild) + break; + pItem = pItem->m_pChild; + } + return pItem == (CFX_XMLNode*)this ? 
nullptr : pItem; + } + case CFX_XMLNode::FirstChild: + return m_pChild; + case CFX_XMLNode::LastChild: { + if (!m_pChild) + return nullptr; + + CFX_XMLNode* pChild = m_pChild; + while (pChild->m_pNext) + pChild = pChild->m_pNext; + return pChild; + } + default: + break; + } + return nullptr; +} + +int32_t CFX_XMLNode::GetNodeLevel() const { + int32_t iLevel = 0; + const CFX_XMLNode* pItem = m_pParent; + while (pItem) { + iLevel++; + pItem = pItem->m_pParent; + } + return iLevel; +} + +bool CFX_XMLNode::InsertNodeItem(CFX_XMLNode::NodeItem eItem, + CFX_XMLNode* pNode) { + switch (eItem) { + case CFX_XMLNode::NextSibling: { + pNode->m_pParent = m_pParent; + pNode->m_pNext = m_pNext; + pNode->m_pPrior = this; + if (m_pNext) { + m_pNext->m_pPrior = pNode; + } + m_pNext = pNode; + return true; + } + case CFX_XMLNode::PriorSibling: { + pNode->m_pParent = m_pParent; + pNode->m_pNext = this; + pNode->m_pPrior = m_pPrior; + if (m_pPrior) { + m_pPrior->m_pNext = pNode; + } else if (m_pParent) { + m_pParent->m_pChild = pNode; + } + m_pPrior = pNode; + return true; + } + default: + return false; + } +} + +CFX_XMLNode* CFX_XMLNode::RemoveNodeItem(CFX_XMLNode::NodeItem eItem) { + CFX_XMLNode* pNode = nullptr; + switch (eItem) { + case CFX_XMLNode::NextSibling: + if (m_pNext) { + pNode = m_pNext; + m_pNext = pNode->m_pNext; + if (m_pNext) { + m_pNext->m_pPrior = this; + } + pNode->m_pParent = nullptr; + pNode->m_pNext = nullptr; + pNode->m_pPrior = nullptr; + } + break; + default: + break; + } + return pNode; +} + +std::unique_ptr CFX_XMLNode::Clone() { + return nullptr; +} + +void CFX_XMLNode::SaveXMLNode( + const CFX_RetainPtr& pXMLStream) { + CFX_XMLNode* pNode = (CFX_XMLNode*)this; + switch (pNode->GetType()) { + case FX_XMLNODE_Instruction: { + CFX_WideString ws; + CFX_XMLInstruction* pInstruction = (CFX_XMLInstruction*)pNode; + if (pInstruction->GetName().CompareNoCase(L"xml") == 0) { + ws = L"GetCodePage(); + if (wCodePage == FX_CODEPAGE_UTF16LE) { + ws += L"UTF-16"; + } 
else if (wCodePage == FX_CODEPAGE_UTF16BE) { + ws += L"UTF-16be"; + } else { + ws += L"UTF-8"; + } + ws += L"\"?>"; + pXMLStream->WriteString(ws.AsStringC()); + } else { + ws.Format(L"GetName().c_str()); + pXMLStream->WriteString(ws.AsStringC()); + + for (auto it : pInstruction->GetAttributes()) { + CFX_WideString wsValue = it.second; + wsValue.Replace(L"&", L"&"); + wsValue.Replace(L"<", L"<"); + wsValue.Replace(L">", L">"); + wsValue.Replace(L"\'", L"'"); + wsValue.Replace(L"\"", L"""); + + ws = L" "; + ws += it.first; + ws += L"=\""; + ws += wsValue; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + + for (auto target : pInstruction->GetTargetData()) { + ws = L" \""; + ws += target; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + ws = L"?>"; + pXMLStream->WriteString(ws.AsStringC()); + } + break; + } + case FX_XMLNODE_Element: { + CFX_WideString ws; + ws = L"<"; + ws += static_cast(pNode)->GetName(); + pXMLStream->WriteString(ws.AsStringC()); + + for (auto it : static_cast(pNode)->GetAttributes()) { + CFX_WideString wsValue = it.second; + wsValue.Replace(L"&", L"&"); + wsValue.Replace(L"<", L"<"); + wsValue.Replace(L">", L">"); + wsValue.Replace(L"\'", L"'"); + wsValue.Replace(L"\"", L"""); + + ws = L" "; + ws += it.first; + ws += L"=\""; + ws += wsValue; + ws += L"\""; + pXMLStream->WriteString(ws.AsStringC()); + } + if (pNode->m_pChild) { + ws = L"\n>"; + pXMLStream->WriteString(ws.AsStringC()); + CFX_XMLNode* pChild = pNode->m_pChild; + while (pChild) { + pChild->SaveXMLNode(pXMLStream); + pChild = pChild->m_pNext; + } + ws = L"(pNode)->GetName(); + ws += L"\n>"; + } else { + ws = L"\n/>"; + } + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case FX_XMLNODE_Text: { + CFX_WideString ws = static_cast(pNode)->GetText(); + ws.Replace(L"&", L"&"); + ws.Replace(L"<", L"<"); + ws.Replace(L">", L">"); + ws.Replace(L"\'", L"'"); + ws.Replace(L"\"", L"""); + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case 
FX_XMLNODE_CharData: { + CFX_WideString ws = L"(pNode)->GetText(); + ws += L"]]>"; + pXMLStream->WriteString(ws.AsStringC()); + break; + } + case FX_XMLNODE_Unknown: + default: + break; + } +} diff --git a/core/fxcrt/xml/cfx_xmlnode.h b/core/fxcrt/xml/cfx_xmlnode.h new file mode 100644 index 0000000000..7cbc2b6642 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlnode.h @@ -0,0 +1,75 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLNODE_H_ +#define CORE_FXCRT_XML_CFX_XMLNODE_H_ + +#include + +#include "core/fxcrt/cfx_retain_ptr.h" +#include "core/fxcrt/cfx_seekablestreamproxy.h" + +enum FX_XMLNODETYPE { + FX_XMLNODE_Unknown = 0, + FX_XMLNODE_Instruction, + FX_XMLNODE_Element, + FX_XMLNODE_Text, + FX_XMLNODE_CharData, +}; + +struct FX_XMLNODE { + int32_t iNodeNum; + FX_XMLNODETYPE eNodeType; +}; + +class CFX_XMLNode { + public: + enum NodeItem { + Root = 0, + Parent, + FirstSibling, + PriorSibling, + NextSibling, + LastSibling, + FirstNeighbor, + PriorNeighbor, + NextNeighbor, + LastNeighbor, + FirstChild, + LastChild + }; + + CFX_XMLNode(); + virtual ~CFX_XMLNode(); + + virtual FX_XMLNODETYPE GetType() const; + virtual std::unique_ptr Clone(); + + int32_t CountChildNodes() const; + CFX_XMLNode* GetChildNode(int32_t index) const; + int32_t GetChildNodeIndex(CFX_XMLNode* pNode) const; + int32_t InsertChildNode(CFX_XMLNode* pNode, int32_t index = -1); + void RemoveChildNode(CFX_XMLNode* pNode); + void DeleteChildren(); + + CFX_XMLNode* GetPath(const wchar_t* pPath, + int32_t iLength = -1, + bool bQualifiedName = true) const; + + int32_t GetNodeLevel() const; + CFX_XMLNode* GetNodeItem(CFX_XMLNode::NodeItem eItem) const; + bool InsertNodeItem(CFX_XMLNode::NodeItem eItem, CFX_XMLNode* pNode); + CFX_XMLNode* RemoveNodeItem(CFX_XMLNode::NodeItem 
eItem); + + void SaveXMLNode(const CFX_RetainPtr& pXMLStream); + + CFX_XMLNode* m_pParent; + CFX_XMLNode* m_pChild; + CFX_XMLNode* m_pPrior; + CFX_XMLNode* m_pNext; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLNODE_H_ diff --git a/core/fxcrt/xml/cfx_xmlparser.cpp b/core/fxcrt/xml/cfx_xmlparser.cpp new file mode 100644 index 0000000000..0e328f33ea --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlparser.cpp @@ -0,0 +1,171 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlparser.h" + +#include "core/fxcrt/fx_basic.h" +#include "core/fxcrt/xml/cfx_xmlchardata.h" +#include "core/fxcrt/xml/cfx_xmlelement.h" +#include "core/fxcrt/xml/cfx_xmlinstruction.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" +#include "core/fxcrt/xml/cfx_xmltext.h" +#include "third_party/base/ptr_util.h" + +CFX_XMLParser::CFX_XMLParser( + CFX_XMLNode* pParent, + const CFX_RetainPtr& pStream) + : m_nElementStart(0), + m_dwCheckStatus(0), + m_dwCurrentCheckStatus(0), + m_pStream(pStream), + m_pParser(pdfium::MakeUnique(m_pStream)), + m_pParent(pParent), + m_pChild(nullptr), + m_syntaxParserResult(FX_XmlSyntaxResult::None) { + ASSERT(m_pParent && m_pStream); + m_NodeStack.push(m_pParent); +} + +CFX_XMLParser::~CFX_XMLParser() {} + +int32_t CFX_XMLParser::DoParser(IFX_Pause* pPause) { + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error) + return -1; + if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) + return 100; + + int32_t iCount = 0; + while (true) { + m_syntaxParserResult = m_pParser->DoSyntaxParse(); + switch (m_syntaxParserResult) { + case FX_XmlSyntaxResult::InstructionOpen: + break; + case FX_XmlSyntaxResult::InstructionClose: + if (m_pChild) { + if (m_pChild->GetType() != FX_XMLNODE_Instruction) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + break; + } + 
} + m_pChild = m_pParent; + break; + case FX_XmlSyntaxResult::ElementOpen: + if (m_dwCheckStatus != 0x03 && m_NodeStack.size() == 2) + m_nElementStart = m_pParser->GetCurrentPos() - 1; + break; + case FX_XmlSyntaxResult::ElementBreak: + break; + case FX_XmlSyntaxResult::ElementClose: + if (m_pChild->GetType() != FX_XMLNODE_Element) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + break; + } + m_ws1 = m_pParser->GetTagName(); + m_ws2 = static_cast(m_pChild)->GetName(); + if (m_ws1.GetLength() > 0 && m_ws1 != m_ws2) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + break; + } + if (!m_NodeStack.empty()) + m_NodeStack.pop(); + if (m_NodeStack.empty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + break; + } else if (m_dwCurrentCheckStatus != 0 && m_NodeStack.size() == 2) { + m_nSize[m_dwCurrentCheckStatus - 1] = + m_pParser->GetCurrentBinaryPos() - + m_nStart[m_dwCurrentCheckStatus - 1]; + m_dwCurrentCheckStatus = 0; + } + m_pParent = m_NodeStack.top(); + m_pChild = m_pParent; + iCount++; + break; + case FX_XmlSyntaxResult::TargetName: + m_ws1 = m_pParser->GetTargetName(); + if (m_ws1 == L"originalXFAVersion" || m_ws1 == L"acrobat") { + m_pChild = new CFX_XMLInstruction(m_ws1); + m_pParent->InsertChildNode(m_pChild); + } else { + m_pChild = nullptr; + } + m_ws1.clear(); + break; + case FX_XmlSyntaxResult::TagName: + m_ws1 = m_pParser->GetTagName(); + m_pChild = new CFX_XMLElement(m_ws1); + m_pParent->InsertChildNode(m_pChild); + m_NodeStack.push(m_pChild); + m_pParent = m_pChild; + + if (m_dwCheckStatus != 0x03 && m_NodeStack.size() == 3) { + CFX_WideString wsTag = + static_cast(m_pChild)->GetLocalTagName(); + if (wsTag == L"template") { + m_dwCheckStatus |= 0x01; + m_dwCurrentCheckStatus = 0x01; + m_nStart[0] = m_pParser->GetCurrentBinaryPos() - + (m_pParser->GetCurrentPos() - m_nElementStart); + } else if (wsTag == L"datasets") { + m_dwCheckStatus |= 0x02; + m_dwCurrentCheckStatus = 0x02; + m_nStart[1] = m_pParser->GetCurrentBinaryPos() - + 
(m_pParser->GetCurrentPos() - m_nElementStart); + } + } + break; + case FX_XmlSyntaxResult::AttriName: + m_ws1 = m_pParser->GetAttributeName(); + break; + case FX_XmlSyntaxResult::AttriValue: + if (m_pChild) { + m_ws2 = m_pParser->GetAttributeName(); + if (m_pChild->GetType() == FX_XMLNODE_Element) + static_cast(m_pChild)->SetString(m_ws1, m_ws2); + } + m_ws1.clear(); + break; + case FX_XmlSyntaxResult::Text: + m_ws1 = m_pParser->GetTextData(); + m_pChild = new CFX_XMLText(m_ws1); + m_pParent->InsertChildNode(m_pChild); + m_pChild = m_pParent; + break; + case FX_XmlSyntaxResult::CData: + m_ws1 = m_pParser->GetTextData(); + m_pChild = new CFX_XMLCharData(m_ws1); + m_pParent->InsertChildNode(m_pChild); + m_pChild = m_pParent; + break; + case FX_XmlSyntaxResult::TargetData: + if (m_pChild) { + if (m_pChild->GetType() != FX_XMLNODE_Instruction) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + break; + } + auto* instruction = static_cast(m_pChild); + if (!m_ws1.IsEmpty()) + instruction->AppendData(m_ws1); + instruction->AppendData(m_pParser->GetTargetData()); + } + m_ws1.clear(); + break; + default: + break; + } + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error || + m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) { + break; + } + if (pPause && iCount > 500 && pPause->NeedToPauseNow()) { + break; + } + } + return (m_syntaxParserResult == FX_XmlSyntaxResult::Error || + m_NodeStack.size() != 1) + ? -1 + : m_pParser->GetStatus(); +} diff --git a/core/fxcrt/xml/cfx_xmlparser.h b/core/fxcrt/xml/cfx_xmlparser.h new file mode 100644 index 0000000000..dc3cc4c297 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlparser.h @@ -0,0 +1,47 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLPARSER_H_ +#define CORE_FXCRT_XML_CFX_XMLPARSER_H_ + +#include +#include + +#include "core/fxcrt/cfx_retain_ptr.h" +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" + +class CFX_XMLElement; +class CFX_XMLNode; +class CFX_SeekableStreamProxy; +class IFX_Pause; + +class CFX_XMLParser { + public: + CFX_XMLParser(CFX_XMLNode* pParent, + const CFX_RetainPtr& pStream); + ~CFX_XMLParser(); + + int32_t DoParser(IFX_Pause* pPause); + + FX_FILESIZE m_nStart[2]; + size_t m_nSize[2]; + FX_FILESIZE m_nElementStart; + uint16_t m_dwCheckStatus; + uint16_t m_dwCurrentCheckStatus; + + private: + CFX_RetainPtr m_pStream; + std::unique_ptr m_pParser; + CFX_XMLNode* m_pParent; + CFX_XMLNode* m_pChild; + std::stack m_NodeStack; + CFX_WideString m_ws1; + CFX_WideString m_ws2; + FX_XmlSyntaxResult m_syntaxParserResult; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLPARSER_H_ diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp new file mode 100644 index 0000000000..e7bef71085 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp @@ -0,0 +1,698 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" + +#include + +#include "core/fxcrt/fx_ext.h" +#include "core/fxcrt/fx_safe_types.h" + +namespace { + +const uint32_t kMaxCharRange = 0x10ffff; + +bool IsXMLWhiteSpace(wchar_t ch) { + return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09; +} + +struct FX_XMLNAMECHAR { + uint16_t wStart; + uint16_t wEnd; + bool bStartChar; +}; + +const FX_XMLNAMECHAR g_XMLNameChars[] = { + {L'-', L'.', false}, {L'0', L'9', false}, {L':', L':', false}, + {L'A', L'Z', true}, {L'_', L'_', true}, {L'a', L'z', true}, + {0xB7, 0xB7, false}, {0xC0, 0xD6, true}, {0xD8, 0xF6, true}, + {0xF8, 0x02FF, true}, {0x0300, 0x036F, false}, {0x0370, 0x037D, true}, + {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true}, {0x203F, 0x2040, false}, + {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true}, {0x3001, 0xD7FF, true}, + {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true}, +}; + +bool IsXMLNameChar(wchar_t ch, bool bFirstChar) { + int32_t iStart = 0; + int32_t iEnd = FX_ArraySize(g_XMLNameChars) - 1; + while (iStart <= iEnd) { + int32_t iMid = (iStart + iEnd) / 2; + if (ch < g_XMLNameChars[iMid].wStart) { + iEnd = iMid - 1; + } else if (ch > g_XMLNameChars[iMid].wEnd) { + iStart = iMid + 1; + } else { + return bFirstChar ? 
g_XMLNameChars[iMid].bStartChar : true; + } + } + return false; +} + +int32_t GetUTF8EncodeLength(const std::vector& src, + FX_FILESIZE iSrcLen) { + uint32_t unicode = 0; + int32_t iDstNum = 0; + const wchar_t* pSrc = src.data(); + while (iSrcLen-- > 0) { + unicode = *pSrc++; + int nbytes = 0; + if ((uint32_t)unicode < 0x80) { + nbytes = 1; + } else if ((uint32_t)unicode < 0x800) { + nbytes = 2; + } else if ((uint32_t)unicode < 0x10000) { + nbytes = 3; + } else if ((uint32_t)unicode < 0x200000) { + nbytes = 4; + } else if ((uint32_t)unicode < 0x4000000) { + nbytes = 5; + } else { + nbytes = 6; + } + iDstNum += nbytes; + } + return iDstNum; +} + +} // namespace + +CFX_XMLSyntaxParser::CFX_XMLSyntaxParser( + const CFX_RetainPtr& pStream) + : m_pStream(pStream), + m_iXMLPlaneSize(32 * 1024), + m_iCurrentPos(0), + m_iCurrentNodeNum(-1), + m_iLastNodeNum(-1), + m_iParsedBytes(0), + m_ParsedChars(0), + m_iBufferChars(0), + m_bEOS(false), + m_Start(0), + m_End(0), + m_iAllocStep(m_BlockBuffer.GetAllocStep()), + m_pCurrentBlock(nullptr), + m_iIndexInBlock(0), + m_iTextDataLength(0), + m_syntaxParserResult(FX_XmlSyntaxResult::None), + m_syntaxParserState(FDE_XmlSyntaxState::Text), + m_wQuotationMark(0), + m_iEntityStart(-1) { + ASSERT(pStream); + + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + + m_iXMLPlaneSize = + std::min(m_iXMLPlaneSize, + pdfium::base::checked_cast(m_pStream->GetLength())); + m_iCurrentPos = m_pStream->GetBOMLength(); + + FX_SAFE_STRSIZE alloc_size_safe = m_iXMLPlaneSize; + alloc_size_safe += 1; // For NUL. 
+ if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return; + } + + m_Buffer.resize(pdfium::base::ValueOrDieForType(alloc_size_safe)); + + m_BlockBuffer.InitBuffer(); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); +} + +CFX_XMLSyntaxParser::~CFX_XMLSyntaxParser() {} + +FX_XmlSyntaxResult CFX_XMLSyntaxParser::DoSyntaxParse() { + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error || + m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) { + return m_syntaxParserResult; + } + + int32_t iStreamLength = m_pStream->GetLength(); + int32_t iPos; + + FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None; + while (true) { + if (m_Start >= m_End) { + if (m_bEOS || m_iCurrentPos >= iStreamLength) { + m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; + return m_syntaxParserResult; + } + m_ParsedChars += m_End; + m_iParsedBytes = m_iCurrentPos; + if (m_pStream->GetPosition() != m_iCurrentPos) + m_pStream->Seek(CFX_SeekableStreamProxy::Pos::Begin, m_iCurrentPos); + + m_iBufferChars = + m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS); + iPos = m_pStream->GetPosition(); + if (m_iBufferChars < 1) { + m_iCurrentPos = iStreamLength; + m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; + return m_syntaxParserResult; + } + m_iCurrentPos = iPos; + m_Start = 0; + m_End = m_iBufferChars; + } + + while (m_Start < m_End) { + wchar_t ch = m_Buffer[m_Start]; + switch (m_syntaxParserState) { + case FDE_XmlSyntaxState::Text: + if (ch == L'<') { + if (!m_BlockBuffer.IsEmpty()) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_iEntityStart = -1; + syntaxParserResult = FX_XmlSyntaxResult::Text; + } else { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::Node; + } + } else { + ParseTextChar(ch); + } + break; + case 
FDE_XmlSyntaxState::Node: + if (ch == L'!') { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl; + } else if (ch == L'/') { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; + } else if (ch == L'?') { + m_iLastNodeNum++; + m_iCurrentNodeNum = m_iLastNodeNum; + m_CurNode.iNodeNum = m_iLastNodeNum; + m_CurNode.eNodeType = FX_XMLNODE_Instruction; + m_XMLNodeStack.push(m_CurNode); + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::Target; + syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen; + } else { + m_iLastNodeNum++; + m_iCurrentNodeNum = m_iLastNodeNum; + m_CurNode.iNodeNum = m_iLastNodeNum; + m_CurNode.eNodeType = FX_XMLNODE_Element; + m_XMLNodeStack.push(m_CurNode); + m_syntaxParserState = FDE_XmlSyntaxState::Tag; + syntaxParserResult = FX_XmlSyntaxResult::ElementOpen; + } + break; + case FDE_XmlSyntaxState::Target: + case FDE_XmlSyntaxState::Tag: + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if (m_BlockBuffer.IsEmpty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (m_syntaxParserState != FDE_XmlSyntaxState::Target) + syntaxParserResult = FX_XmlSyntaxResult::TagName; + else + syntaxParserResult = FX_XmlSyntaxResult::TargetName; + + m_syntaxParserState = FDE_XmlSyntaxState::AttriName; + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriName: + if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if 
(m_BlockBuffer.IsEmpty()) { + if (m_CurNode.eNodeType == FX_XMLNODE_Element) { + if (ch == L'>' || ch == L'/') { + m_syntaxParserState = FDE_XmlSyntaxState::BreakElement; + break; + } + } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + if (ch == L'?') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; + m_Start++; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + } + break; + } + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + if (ch != '=' && !IsXMLWhiteSpace(ch)) { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + break; + } + } + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign; + syntaxParserResult = FX_XmlSyntaxResult::AttriName; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriEqualSign: + if (IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (ch != L'=') { + if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + break; + } + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation; + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriQuotation: + if (IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (ch != L'\"' && ch != L'\'') { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + m_wQuotationMark = ch; + m_syntaxParserState = 
FDE_XmlSyntaxState::AttriValue; + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriValue: + if (ch == m_wQuotationMark) { + if (m_iEntityStart > -1) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::AttriName; + syntaxParserResult = FX_XmlSyntaxResult::AttriValue; + } else { + ParseTextChar(ch); + } + break; + case FDE_XmlSyntaxState::CloseInstruction: + if (ch != L'>') { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + } else if (!m_BlockBuffer.IsEmpty()) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + } else { + m_Start++; + if (m_XMLNodeStack.empty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_XMLNodeStack.pop(); + if (!m_XMLNodeStack.empty()) { + m_CurNode = m_XMLNodeStack.top(); + } else { + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + } + m_iCurrentNodeNum = m_CurNode.iNodeNum; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = FX_XmlSyntaxResult::InstructionClose; + } + break; + case FDE_XmlSyntaxState::BreakElement: + if (ch == L'>') { + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = 
FX_XmlSyntaxResult::ElementBreak; + } else if (ch == L'/') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; + } else { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_Start++; + break; + case FDE_XmlSyntaxState::CloseElement: + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if (ch == L'>') { + if (m_XMLNodeStack.empty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_XMLNodeStack.pop(); + if (!m_XMLNodeStack.empty()) { + m_CurNode = m_XMLNodeStack.top(); + } else { + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + } + m_iCurrentNodeNum = m_CurNode.iNodeNum; + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = FX_XmlSyntaxResult::ElementClose; + } else if (!IsXMLWhiteSpace(ch)) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + } + m_Start++; + break; + case FDE_XmlSyntaxState::SkipCommentOrDecl: + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) { + m_Start += 2; + m_syntaxParserState = FDE_XmlSyntaxState::SkipComment; + } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) == + 0) { + m_Start += 7; + m_syntaxParserState = FDE_XmlSyntaxState::SkipCData; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode; + m_SkipChar = L'>'; + m_SkipStack.push(L'>'); + } + break; + case FDE_XmlSyntaxState::SkipCData: { + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) { + m_Start += 3; + 
syntaxParserResult = FX_XmlSyntaxResult::CData; + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) + return FX_XmlSyntaxResult::Error; + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + } + case FDE_XmlSyntaxState::SkipDeclNode: + if (m_SkipChar == L'\'' || m_SkipChar == L'\"') { + m_Start++; + if (ch != m_SkipChar) + break; + + m_SkipStack.pop(); + if (m_SkipStack.empty()) + m_syntaxParserState = FDE_XmlSyntaxState::Text; + else + m_SkipChar = m_SkipStack.top(); + } else { + switch (ch) { + case L'<': + m_SkipChar = L'>'; + m_SkipStack.push(L'>'); + break; + case L'[': + m_SkipChar = L']'; + m_SkipStack.push(L']'); + break; + case L'(': + m_SkipChar = L')'; + m_SkipStack.push(L')'); + break; + case L'\'': + m_SkipChar = L'\''; + m_SkipStack.push(L'\''); + break; + case L'\"': + m_SkipChar = L'\"'; + m_SkipStack.push(L'\"'); + break; + default: + if (ch == m_SkipChar) { + m_SkipStack.pop(); + if (m_SkipStack.empty()) { + if (m_BlockBuffer.GetDataLength() >= 9) + (void)m_BlockBuffer.GetTextData(0, 7); + + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } else { + m_SkipChar = m_SkipStack.top(); + } + } + break; + } + if (!m_SkipStack.empty()) { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + } 
+ m_Start++; + } + break; + case FDE_XmlSyntaxState::SkipComment: + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) { + m_Start += 2; + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } + + m_Start++; + break; + case FDE_XmlSyntaxState::TargetData: + if (IsXMLWhiteSpace(ch)) { + if (m_BlockBuffer.IsEmpty()) { + m_Start++; + break; + } else if (m_wQuotationMark == 0) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + break; + } + } + if (ch == '?') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; + m_Start++; + } else if (ch == '\"') { + if (m_wQuotationMark == 0) { + m_wQuotationMark = ch; + m_Start++; + } else if (ch == m_wQuotationMark) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + } else { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + default: + break; + } + if (syntaxParserResult != FX_XmlSyntaxResult::None) + return syntaxParserResult; + } + } + return FX_XmlSyntaxResult::Text; +} + +int32_t CFX_XMLSyntaxParser::GetStatus() const { + if (!m_pStream) + return -1; + + int32_t iStreamLength = m_pStream->GetLength(); + if (iStreamLength < 1) + return 100; + + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error) + return -1; + + if (m_syntaxParserResult == 
FX_XmlSyntaxResult::EndOfString) + return 100; + return m_iParsedBytes * 100 / iStreamLength; +} + +FX_FILESIZE CFX_XMLSyntaxParser::GetCurrentBinaryPos() const { + if (!m_pStream) + return 0; + + int32_t nDstLen = GetUTF8EncodeLength(m_Buffer, m_Start); + return m_iParsedBytes + nDstLen; +} + +void CFX_XMLSyntaxParser::ParseTextChar(wchar_t character) { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) + return; + } + + m_pCurrentBlock[m_iIndexInBlock++] = character; + m_BlockBuffer.IncrementDataLength(); + if (m_iEntityStart > -1 && character == L';') { + CFX_WideString csEntity = m_BlockBuffer.GetTextData( + m_iEntityStart + 1, + m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1); + int32_t iLen = csEntity.GetLength(); + if (iLen > 0) { + if (csEntity[0] == L'#') { + uint32_t ch = 0; + wchar_t w; + if (iLen > 1 && csEntity[1] == L'x') { + for (int32_t i = 2; i < iLen; i++) { + w = csEntity[i]; + if (w >= L'0' && w <= L'9') { + ch = (ch << 4) + w - L'0'; + } else if (w >= L'A' && w <= L'F') { + ch = (ch << 4) + w - 55; + } else if (w >= L'a' && w <= L'f') { + ch = (ch << 4) + w - 87; + } else { + break; + } + } + } else { + for (int32_t i = 1; i < iLen; i++) { + w = csEntity[i]; + if (w < L'0' || w > L'9') + break; + ch = ch * 10 + w - L'0'; + } + } + if (ch > kMaxCharRange) + ch = ' '; + + character = static_cast(ch); + if (character != 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, character); + m_iEntityStart++; + } + } else { + if (csEntity.Compare(L"amp") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'&'); + m_iEntityStart++; + } else if (csEntity.Compare(L"lt") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'<'); + m_iEntityStart++; + } else if (csEntity.Compare(L"gt") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'>'); + m_iEntityStart++; + } else if (csEntity.Compare(L"apos") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, 
L'\''); + m_iEntityStart++; + } else if (csEntity.Compare(L"quot") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"'); + m_iEntityStart++; + } + } + } + if (m_iEntityStart >= 0 && + m_BlockBuffer.GetDataLength() > static_cast(m_iEntityStart)) { + m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() - + m_iEntityStart); + } + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_iEntityStart = -1; + } else if (m_iEntityStart < 0 && character == L'&') { + m_iEntityStart = m_BlockBuffer.GetDataLength() - 1; + } + m_Start++; +} diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.h b/core/fxcrt/xml/cfx_xmlsyntaxparser.h new file mode 100644 index 0000000000..519f2833d6 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlsyntaxparser.h @@ -0,0 +1,128 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. 
http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ +#define CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ + +#include +#include + +#include "core/fxcrt/cfx_blockbuffer.h" +#include "core/fxcrt/cfx_retain_ptr.h" +#include "core/fxcrt/cfx_seekablestreamproxy.h" +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" + +enum class FX_XmlSyntaxResult { + None, + InstructionOpen, + InstructionClose, + ElementOpen, + ElementBreak, + ElementClose, + TargetName, + TagName, + AttriName, + AttriValue, + Text, + CData, + TargetData, + Error, + EndOfString +}; + +class CFX_XMLSyntaxParser { + public: + explicit CFX_XMLSyntaxParser( + const CFX_RetainPtr& pStream); + ~CFX_XMLSyntaxParser(); + + FX_XmlSyntaxResult DoSyntaxParse(); + + int32_t GetStatus() const; + FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; } + FX_FILESIZE GetCurrentBinaryPos() const; + int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; } + int32_t GetLastNodeNumber() const { return m_iLastNodeNum; } + + CFX_WideString GetTargetName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + CFX_WideString GetTagName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + CFX_WideString GetAttributeName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + CFX_WideString GetAttributeValue() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + CFX_WideString GetTextData() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + CFX_WideString GetTargetData() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + protected: + enum class FDE_XmlSyntaxState { + Text, + Node, + Target, + Tag, + AttriName, + AttriEqualSign, + AttriQuotation, + AttriValue, + Entity, + EntityDecimal, + EntityHex, + CloseInstruction, + BreakElement, + CloseElement, + SkipDeclNode, + DeclCharData, + SkipComment, + SkipCommentOrDecl, + SkipCData, + 
TargetData + }; + + void ParseTextChar(wchar_t ch); + + CFX_RetainPtr m_pStream; + FX_STRSIZE m_iXMLPlaneSize; + int32_t m_iCurrentPos; + int32_t m_iCurrentNodeNum; + int32_t m_iLastNodeNum; + int32_t m_iParsedBytes; + FX_FILESIZE m_ParsedChars; + std::vector m_Buffer; + int32_t m_iBufferChars; + bool m_bEOS; + FX_FILESIZE m_Start; // Start position in m_Buffer + FX_FILESIZE m_End; // End position in m_Buffer + FX_XMLNODE m_CurNode; + std::stack m_XMLNodeStack; + CFX_BlockBuffer m_BlockBuffer; + int32_t m_iAllocStep; + wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer + int32_t m_iIndexInBlock; + int32_t m_iTextDataLength; + FX_XmlSyntaxResult m_syntaxParserResult; + FDE_XmlSyntaxState m_syntaxParserState; + wchar_t m_wQuotationMark; + int32_t m_iEntityStart; + std::stack m_SkipStack; + wchar_t m_SkipChar; +}; + +#endif // CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp new file mode 100644 index 0000000000..60c0d34655 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp @@ -0,0 +1,527 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" + +#include + +#include "core/fxcrt/cfx_seekablestreamproxy.h" +#include "core/fxcrt/fx_codepage.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "testing/test_support.h" + +class CFX_XMLSyntaxParserTest : public pdfium::FPDF_Test {}; + +TEST_F(CFX_XMLSyntaxParserTest, CData) { + const char* input = + ""; + + const wchar_t* cdata = + L"\n" + L" if (a[1] < 3)\n" + L" app.alert(\"Tclams\");\n" + L" "; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse()); + ASSERT_EQ(cdata, parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, CDataWithInnerScript) { + const char* input = + "\n" + " ]]>\n" + ""; + + const wchar_t* cdata = + L"\n" + L" if (a[1] < 3)\n" + L" app.alert(\"Tclams\");\n" + L" \n" + L" "; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + 
stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse()); + ASSERT_EQ(cdata, parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, ArrowBangArrow) { + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", 
parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) { /* Expects ElementOpen, TagName, AttriName, AttriValue, ElementBreak and one Text result; the next parse call must return EndOfString (no ElementClose is asserted). */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + // Parser walks to end of input.
+ + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, IncompleteCData) { /* Expects ElementOpen, TagName, AttriName, AttriValue, ElementBreak and one Text result; the next parse call must return EndOfString (no CData or ElementClose is asserted). */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + // Parser walks to end of input.
+ + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, UnClosedCData) { /* Expects ElementOpen, TagName, AttriName, AttriValue, ElementBreak and one Text result; the next parse call must return EndOfString (no CData or ElementClose is asserted). */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + // Parser walks to end of input.
+ + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, EmptyCData) { /* Expects a CData result whose text is the empty string, with a Text result before and after it, followed by ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::CData, parser.DoSyntaxParse()); + ASSERT_EQ(L"", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, Comment) { /* Expects two consecutive Text results after ElementBreak, then ElementClose and EndOfString; no other result is asserted between the two Text results. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue,
parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, IncorrectCommentStart) { /* Expects two consecutive Text results after ElementBreak, then ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, CommentEmpty) { /* Expects two consecutive Text results after ElementBreak, then ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + +
CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, CommentThreeDash) { /* Expects ElementOpen, TagName, AttriName, AttriValue, ElementBreak and one Text result; the next parse call must return EndOfString (no ElementClose is asserted). */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, CommentTwoDash) { /* Expects ElementOpen, TagName, AttriName, AttriValue, ElementBreak and one Text result; the next parse call must return EndOfString (no ElementClose is asserted). */ + const char*
input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"\n ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, Entities) { /* Expects a single Text result equal to the BTH\xab48 wide literal asserted below, then ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L"BTH\xab48", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} +
+TEST_F(CFX_XMLSyntaxParserTest, EntityOverflowHex) { /* Expects a single Text result equal to the blank literal asserted below, then ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L" ", parser.GetTextData()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} + +TEST_F(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) { /* Expects a single Text result equal to the blank literal asserted below, then ElementClose and EndOfString. */ + const char* input = + ""; + + CFX_RetainPtr stream = + pdfium::MakeRetain( + reinterpret_cast(const_cast(input)), strlen(input)); + stream->SetCodePage(FX_CODEPAGE_UTF8); + + CFX_XMLSyntaxParser parser(stream); + ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::AttriName, parser.DoSyntaxParse()); + ASSERT_EQ(L"contentType", parser.GetAttributeName()); + ASSERT_EQ(FX_XmlSyntaxResult::AttriValue, parser.DoSyntaxParse()); + ASSERT_EQ(L"application/x-javascript", parser.GetAttributeValue()); + + ASSERT_EQ(FX_XmlSyntaxResult::ElementBreak, parser.DoSyntaxParse()); + ASSERT_EQ(FX_XmlSyntaxResult::Text, parser.DoSyntaxParse()); + ASSERT_EQ(L" ", parser.GetTextData()); + +
ASSERT_EQ(FX_XmlSyntaxResult::ElementClose, parser.DoSyntaxParse()); + ASSERT_EQ(L"script", parser.GetTagName()); + + ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); +} diff --git a/core/fxcrt/xml/cfx_xmltext.cpp b/core/fxcrt/xml/cfx_xmltext.cpp new file mode 100644 index 0000000000..4c41fcd9f8 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmltext.cpp @@ -0,0 +1,22 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fxcrt/xml/cfx_xmltext.h" + +#include "third_party/base/ptr_util.h" + +CFX_XMLText::CFX_XMLText(const CFX_WideString& wsText) /* Initializes m_wsText from wsText. */ + : CFX_XMLNode(), m_wsText(wsText) {} + +CFX_XMLText::~CFX_XMLText() {} + +FX_XMLNODETYPE CFX_XMLText::GetType() const { /* Always reports FX_XMLNODE_Text. */ + return FX_XMLNODE_Text; +} + +std::unique_ptr CFX_XMLText::Clone() { /* Returns a newly made node constructed from this node's m_wsText. */ + return pdfium::MakeUnique(m_wsText); +} diff --git a/core/fxcrt/xml/cfx_xmltext.h b/core/fxcrt/xml/cfx_xmltext.h new file mode 100644 index 0000000000..20fb858302 --- /dev/null +++ b/core/fxcrt/xml/cfx_xmltext.h @@ -0,0 +1,31 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_XML_CFX_XMLTEXT_H_ +#define CORE_FXCRT_XML_CFX_XMLTEXT_H_ + +#include + +#include "core/fxcrt/fx_string.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" + +class CFX_XMLText : public CFX_XMLNode { /* XML node that holds a wide-string text value. */ + public: + explicit CFX_XMLText(const CFX_WideString& wsText); + ~CFX_XMLText() override; + + // CFX_XMLNode + FX_XMLNODETYPE GetType() const override; + std::unique_ptr Clone() override; + + CFX_WideString GetText() const { return m_wsText; } /* Returns the stored text by value. */ + void SetText(const CFX_WideString& wsText) { m_wsText = wsText; } /* Replaces the stored text with wsText. */ + + private: + CFX_WideString m_wsText; /* Text content of this node. */ +}; + +#endif // CORE_FXCRT_XML_CFX_XMLTEXT_H_ -- cgit v1.2.3