From a995d6fd9b862dbd37aebb9c323766bb5d11d389 Mon Sep 17 00:00:00 2001 From: Dan Sinclair Date: Thu, 12 Apr 2018 13:15:59 +0000 Subject: Allow retrieving the XML tree from the CFX_XMLDoc This CL allows the CXFA_SimpleParser to retrieve the XML tree from the CFX_XMLDoc. This way, we don't have to keep the doc around and can store the pointer to the tree in the CXFA_SimpleParser. Change-Id: I5b478acbe61e6f1ca5fa04d03133a2b327a0cb1c Reviewed-on: https://pdfium-review.googlesource.com/30210 Reviewed-by: Henrique Nakashima Commit-Queue: dsinclair --- core/fxcrt/xml/cfx_xmldoc.cpp | 16 +++------ core/fxcrt/xml/cfx_xmldoc.h | 10 +++--- core/fxcrt/xml/cfx_xmlparser.cpp | 28 ++++++--------- core/fxcrt/xml/cfx_xmlparser.h | 2 +- testing/libfuzzer/pdf_xml_fuzzer.cc | 8 ++--- xfa/fxfa/parser/cxfa_document_parser.cpp | 15 ++++---- xfa/fxfa/parser/cxfa_simple_parser.cpp | 60 ++++++++++---------------------- xfa/fxfa/parser/cxfa_simple_parser.h | 9 ++--- 8 files changed, 52 insertions(+), 96 deletions(-) diff --git a/core/fxcrt/xml/cfx_xmldoc.cpp b/core/fxcrt/xml/cfx_xmldoc.cpp index 57be180f9a..c14255d432 100644 --- a/core/fxcrt/xml/cfx_xmldoc.cpp +++ b/core/fxcrt/xml/cfx_xmldoc.cpp @@ -18,20 +18,14 @@ #include "third_party/base/ptr_util.h" #include "third_party/base/stl_util.h" -CFX_XMLDoc::CFX_XMLDoc(const RetainPtr& pStream) - : m_pRoot(pdfium::MakeUnique()), - m_pXMLParser(pdfium::MakeUnique(m_pRoot.get(), pStream)) { - ASSERT(pStream); - +CFX_XMLDoc::CFX_XMLDoc() : m_pRoot(pdfium::MakeUnique()) { m_pRoot->AppendChild(new CFX_XMLInstruction(L"xml")); } CFX_XMLDoc::~CFX_XMLDoc() {} -int32_t CFX_XMLDoc::Load() { - return m_pXMLParser->Parse(); -} - -void CFX_XMLDoc::CloseXML() { - m_pXMLParser.reset(); +bool CFX_XMLDoc::Load(const RetainPtr& pStream) { + ASSERT(pStream); + CFX_XMLParser parser(m_pRoot.get(), pStream); + return parser.Parse(); } diff --git a/core/fxcrt/xml/cfx_xmldoc.h b/core/fxcrt/xml/cfx_xmldoc.h index 55976b01b3..915acd325d 100644 --- a/core/fxcrt/xml/cfx_xmldoc.h +++ b/core/fxcrt/xml/cfx_xmldoc.h @@ -8,6 +8,7 @@ #define CORE_FXCRT_XML_CFX_XMLDOC_H_ #include +#include #include "core/fxcrt/cfx_seekablestreamproxy.h" #include "core/fxcrt/retain_ptr.h" @@ -16,18 +17,15 @@ class CFX_XMLDoc { public: - explicit CFX_XMLDoc(const RetainPtr& pStream); + CFX_XMLDoc(); ~CFX_XMLDoc(); - int32_t Load(); - void CloseXML(); + bool Load(const RetainPtr& pStream); - CFX_XMLNode* GetRoot() const { return m_pRoot.get(); } - CFX_XMLParser* GetParser() const { return m_pXMLParser.get(); } + std::unique_ptr GetTree() { return std::move(m_pRoot); } private: std::unique_ptr m_pRoot; - std::unique_ptr m_pXMLParser; }; #endif // CORE_FXCRT_XML_CFX_XMLDOC_H_ diff --git a/core/fxcrt/xml/cfx_xmlparser.cpp b/core/fxcrt/xml/cfx_xmlparser.cpp index 39196d0264..d0acbc7818 100644 --- a/core/fxcrt/xml/cfx_xmlparser.cpp +++ b/core/fxcrt/xml/cfx_xmlparser.cpp @@ -132,36 +132,36 @@ CFX_XMLParser::CFX_XMLParser(CFX_XMLNode* pParent, CFX_XMLParser::~CFX_XMLParser() {} -int32_t CFX_XMLParser::Parse() { +bool CFX_XMLParser::Parse() { int32_t iCount = 0; while (true) { FX_XmlSyntaxResult result = DoSyntaxParse(); if (result == FX_XmlSyntaxResult::Error) - return -1; + return false; if (result == FX_XmlSyntaxResult::EndOfString) break; switch (result) { case FX_XmlSyntaxResult::InstructionClose: if (m_pChild && m_pChild->GetType() != FX_XMLNODE_Instruction) - return -1; + return false; m_pChild = m_pParent; break; case FX_XmlSyntaxResult::ElementClose: if (m_pChild->GetType() != FX_XMLNODE_Element) - return -1; + return false; m_ws1 = GetTagName(); if (m_ws1.GetLength() > 0 && m_ws1 != static_cast(m_pChild)->GetName()) { - return -1; + return false; } if (!m_NodeStack.empty()) m_NodeStack.pop(); if (m_NodeStack.empty()) - return -1; + return false; m_pParent = m_NodeStack.top(); m_pChild = m_pParent; @@ -209,7 +209,7 @@ int32_t CFX_XMLParser::Parse() { case FX_XmlSyntaxResult::TargetData: if (m_pChild) { if (m_pChild->GetType() != FX_XMLNODE_Instruction) - return -1; + return false; auto* instruction = static_cast(m_pChild); if (!m_ws1.IsEmpty()) @@ -226,7 +226,7 @@ int32_t CFX_XMLParser::Parse() { break; } } - return m_NodeStack.size() != 1 ? -1 : GetStatus(); + return m_NodeStack.size() != 1 ? false : GetStatus(); } FX_XmlSyntaxResult CFX_XMLParser::DoSyntaxParse() { @@ -698,18 +698,12 @@ FX_XmlSyntaxResult CFX_XMLParser::DoSyntaxParse() { int32_t CFX_XMLParser::GetStatus() const { if (!m_pStream) - return -1; + return false; int32_t iStreamLength = m_pStream->GetLength(); if (iStreamLength < 1) - return 100; - - if (m_syntaxParserResult == FX_XmlSyntaxResult::Error) - return -1; - - if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) - return 100; - return m_iParsedBytes * 100 / iStreamLength; + return true; + return m_syntaxParserResult != FX_XmlSyntaxResult::Error; } FX_FILESIZE CFX_XMLParser::GetCurrentBinaryPos() const { diff --git a/core/fxcrt/xml/cfx_xmlparser.h b/core/fxcrt/xml/cfx_xmlparser.h index c7587e9151..f4e3c03ee7 100644 --- a/core/fxcrt/xml/cfx_xmlparser.h +++ b/core/fxcrt/xml/cfx_xmlparser.h @@ -47,7 +47,7 @@ class CFX_XMLParser { const RetainPtr& pStream); virtual ~CFX_XMLParser(); - int32_t Parse(); + bool Parse(); protected: FX_XmlSyntaxResult DoSyntaxParse(); diff --git a/testing/libfuzzer/pdf_xml_fuzzer.cc b/testing/libfuzzer/pdf_xml_fuzzer.cc index 82627e1496..2ad57cf8bc 100644 --- a/testing/libfuzzer/pdf_xml_fuzzer.cc +++ b/testing/libfuzzer/pdf_xml_fuzzer.cc @@ -22,12 +22,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { RetainPtr stream = pdfium::MakeRetain(const_cast(data), size); - auto doc = pdfium::MakeUnique(stream); - if (doc->Load() < 100) + CFX_XMLDoc doc; + if (!doc.Load(stream)) return 0; - CFX_XMLNode* pXMLFakeRoot = doc->GetRoot(); - for (CFX_XMLNode* pXMLNode = pXMLFakeRoot->GetFirstChild(); pXMLNode; + auto root = doc.GetTree(); + for (CFX_XMLNode* pXMLNode = root->GetFirstChild(); pXMLNode; pXMLNode = pXMLNode->GetNextSibling()) { if (pXMLNode->GetType() == FX_XMLNODE_Element) break; diff --git a/xfa/fxfa/parser/cxfa_document_parser.cpp b/xfa/fxfa/parser/cxfa_document_parser.cpp index 18f9955210..883f684768 100644 --- a/xfa/fxfa/parser/cxfa_document_parser.cpp +++ b/xfa/fxfa/parser/cxfa_document_parser.cpp @@ -20,18 +20,15 @@ CXFA_DocumentParser::~CXFA_DocumentParser() { int32_t CXFA_DocumentParser::Parse(const RetainPtr& pStream, XFA_PacketType ePacketID) { - m_pDocument.reset(); - m_nodeParser.CloseParser(); - m_pDocument = pdfium::MakeUnique(GetNotify()); m_nodeParser.SetFactory(m_pDocument.get()); - int32_t nRetStatus = m_nodeParser.Parse(pStream, ePacketID); - if (nRetStatus == XFA_PARSESTATUS_Done) { - ASSERT(m_pDocument); - m_pDocument->SetRoot(m_nodeParser.GetRootNode()); - } - return nRetStatus; + if (!m_nodeParser.Parse(pStream, ePacketID)) + return XFA_PARSESTATUS_StatusErr; + + ASSERT(m_pDocument); + m_pDocument->SetRoot(m_nodeParser.GetRootNode()); + return XFA_PARSESTATUS_Done; } CXFA_FFNotify* CXFA_DocumentParser::GetNotify() const { diff --git a/xfa/fxfa/parser/cxfa_simple_parser.cpp b/xfa/fxfa/parser/cxfa_simple_parser.cpp index 225271fae8..232f1fbec8 100644 --- a/xfa/fxfa/parser/cxfa_simple_parser.cpp +++ b/xfa/fxfa/parser/cxfa_simple_parser.cpp @@ -100,11 +100,8 @@ const PacketInfo* GetPacketByName(const WideStringView& wsName) { return nullptr; } -CFX_XMLNode* GetDocumentNode(CFX_XMLDoc* pXMLDoc) { - if (!pXMLDoc) - return nullptr; - - for (CFX_XMLNode* pXMLNode = pXMLDoc->GetRoot()->GetFirstChild(); pXMLNode; +CFX_XMLNode* GetDocumentNode(CFX_XMLNode* pRootNode) { + for (CFX_XMLNode* pXMLNode = pRootNode->GetFirstChild(); pXMLNode; pXMLNode = pXMLNode->GetNextSibling()) { if (pXMLNode->GetType() != FX_XMLNODE_Element) continue; @@ -338,49 +335,33 @@ void CXFA_SimpleParser::SetFactory(CXFA_Document* pFactory) { m_pFactory = pFactory; } -int32_t CXFA_SimpleParser::Parse(const RetainPtr& pStream, - XFA_PacketType ePacketID) { - CloseParser(); - - m_pFileRead = pStream; - m_pStream = pdfium::MakeRetain(pStream, false); - uint16_t wCodePage = m_pStream->GetCodePage(); +bool CXFA_SimpleParser::Parse(const RetainPtr& pStream, + XFA_PacketType ePacketID) { + auto pStreamProxy = + pdfium::MakeRetain(pStream, false); + uint16_t wCodePage = pStreamProxy->GetCodePage(); if (wCodePage != FX_CODEPAGE_UTF16LE && wCodePage != FX_CODEPAGE_UTF16BE && wCodePage != FX_CODEPAGE_UTF8) { - m_pStream->SetCodePage(FX_CODEPAGE_UTF8); + pStreamProxy->SetCodePage(FX_CODEPAGE_UTF8); } - m_pXMLDoc = pdfium::MakeUnique(m_pStream); - if (!m_pXMLDoc) - return XFA_PARSESTATUS_StatusErr; - - int32_t iRet = m_pXMLDoc->Load(); - if (iRet < 0) - return XFA_PARSESTATUS_SyntaxErr; - if (iRet < 100) - return iRet / 2; - - m_pRootNode = ParseAsXDPPacket(GetDocumentNode(m_pXMLDoc.get()), ePacketID); - m_pXMLDoc->CloseXML(); - m_pStream.Reset(); - - if (!m_pRootNode) - return XFA_PARSESTATUS_StatusErr; + CFX_XMLDoc doc; + if (!doc.Load(pStreamProxy)) + return false; - return XFA_PARSESTATUS_Done; + m_pNodeTree = doc.GetTree(); + m_pRootNode = ParseAsXDPPacket(GetDocumentNode(m_pNodeTree.get()), ePacketID); + return !!m_pRootNode; } CFX_XMLNode* CXFA_SimpleParser::ParseXMLData(const ByteString& wsXML) { - CloseParser(); - auto pStream = pdfium::MakeRetain( const_cast(wsXML.raw_str()), wsXML.GetLength()); - m_pXMLDoc = pdfium::MakeUnique(pStream); + CFX_XMLDoc doc; + if (doc.Load(pStream)) + m_pNodeTree = doc.GetTree(); - int32_t iRet = m_pXMLDoc->Load(); - if (iRet < 0 || iRet >= 100) - m_pXMLDoc->CloseXML(); - return iRet < 100 ? nullptr : GetDocumentNode(m_pXMLDoc.get()); + return m_pNodeTree ? GetDocumentNode(m_pNodeTree.get()) : nullptr; } void CXFA_SimpleParser::ConstructXFANode(CXFA_Node* pXFANode, @@ -1180,8 +1161,3 @@ void CXFA_SimpleParser::ParseInstruction(CXFA_Node* pXFANode, } } } - -void CXFA_SimpleParser::CloseParser() { - m_pXMLDoc.reset(); - m_pStream.Reset(); -} diff --git a/xfa/fxfa/parser/cxfa_simple_parser.h b/xfa/fxfa/parser/cxfa_simple_parser.h index cef9c9af54..61edf8e8e6 100644 --- a/xfa/fxfa/parser/cxfa_simple_parser.h +++ b/xfa/fxfa/parser/cxfa_simple_parser.h @@ -25,13 +25,12 @@ class CXFA_SimpleParser { explicit CXFA_SimpleParser(CXFA_Document* pFactory); ~CXFA_SimpleParser(); - int32_t Parse(const RetainPtr& pStream, - XFA_PacketType ePacketID); + bool Parse(const RetainPtr& pStream, + XFA_PacketType ePacketID); CFX_XMLNode* ParseXMLData(const ByteString& wsXML); void ConstructXFANode(CXFA_Node* pXFANode, CFX_XMLNode* pXMLNode); CXFA_Node* GetRootNode() const; - void CloseParser(); // Called later for the ctor with no parameters. void SetFactory(CXFA_Document* pFactory); @@ -71,10 +70,8 @@ class CXFA_SimpleParser { CFX_XMLInstruction* pXMLInstruction, XFA_PacketType ePacketID); - std::unique_ptr m_pXMLDoc; - RetainPtr m_pStream; - RetainPtr m_pFileRead; UnownedPtr m_pFactory; + std::unique_ptr m_pNodeTree; // TODO(dsinclair): Figure out who owns this. CXFA_Node* m_pRootNode = nullptr; const bool m_bDocumentParser; -- cgit v1.2.3