// Copyright 2016 PDFium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #include "xfa/fxfa/parser/cxfa_simple_parser.h" #include #include #include "core/fxcrt/cfx_checksumcontext.h" #include "core/fxcrt/cfx_seekablestreamproxy.h" #include "core/fxcrt/cfx_widetextbuf.h" #include "core/fxcrt/fx_codepage.h" #include "core/fxcrt/fx_extension.h" #include "core/fxcrt/xml/cfx_xmlchardata.h" #include "core/fxcrt/xml/cfx_xmldoc.h" #include "core/fxcrt/xml/cfx_xmlelement.h" #include "core/fxcrt/xml/cfx_xmlinstruction.h" #include "core/fxcrt/xml/cfx_xmlnode.h" #include "core/fxcrt/xml/cfx_xmlparser.h" #include "core/fxcrt/xml/cfx_xmltext.h" #include "fxjs/xfa/cjx_object.h" #include "third_party/base/logging.h" #include "third_party/base/ptr_util.h" #include "xfa/fxfa/fxfa.h" #include "xfa/fxfa/parser/cxfa_document.h" #include "xfa/fxfa/parser/cxfa_node.h" #include "xfa/fxfa/parser/cxfa_subform.h" #include "xfa/fxfa/parser/cxfa_template.h" #include "xfa/fxfa/parser/xfa_basic_data.h" #include "xfa/fxfa/parser/xfa_utils.h" namespace { struct PacketInfo { uint32_t hash; const wchar_t* name; XFA_PacketType packet_type; const wchar_t* uri; uint32_t flags; }; const PacketInfo PacketData[] = { {0x0, nullptr, XFA_PacketType::User, nullptr, XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTMANY}, {0x811929d, L"sourceSet", XFA_PacketType::SourceSet, L"http://www.xfa.org/schema/xfa-source-set/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0xb843dba, L"pdf", XFA_PacketType::Pdf, L"http://ns.adobe.com/xdp/pdf/", XFA_XDPPACKET_FLAGS_COMPLETEMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0xc56afbf, L"xdc", XFA_PacketType::Xdc, L"http://www.xfa.org/schema/xdc/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0xc56afcc, L"xdp", XFA_PacketType::Xdp, L"http://ns.adobe.com/xdp/", XFA_XDPPACKET_FLAGS_COMPLETEMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x132a8fbc, L"xmpmeta", XFA_PacketType::Xmpmeta, L"http://ns.adobe.com/xmpmeta/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTMANY}, {0x48d004a8, L"xfdf", XFA_PacketType::Xfdf, L"http://ns.adobe.com/xfdf/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x4e1e39b6, L"config", XFA_PacketType::Config, L"http://www.xfa.org/schema/xci/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x5473b6dc, L"localeSet", XFA_PacketType::LocaleSet, L"http://www.xfa.org/schema/xfa-locale-set/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x6038580a, L"stylesheet", XFA_PacketType::Stylesheet, L"http://www.w3.org/1999/XSL/Transform", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTMANY}, {0x803550fc, L"template", XFA_PacketType::Template, L"http://www.xfa.org/schema/xfa-template/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x8b036f32, L"signature", XFA_PacketType::Signature, L"http://www.w3.org/2000/09/xmldsig#", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0x99b95079, L"datasets", XFA_PacketType::Datasets, L"http://www.xfa.org/schema/xfa-data/", XFA_XDPPACKET_FLAGS_PREFIXMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0xcd309ff4, L"form", XFA_PacketType::Form, L"http://www.xfa.org/schema/xfa-form/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, {0xe14c801c, L"connectionSet", XFA_PacketType::ConnectionSet, L"http://www.xfa.org/schema/xfa-connection-set/", XFA_XDPPACKET_FLAGS_NOMATCH | XFA_XDPPACKET_FLAGS_SUPPORTONE}, }; const PacketInfo* GetPacketByIndex(XFA_PacketType ePacket) { return PacketData + static_cast(ePacket); } const PacketInfo* GetPacketByName(const WideStringView& wsName) { if (wsName.IsEmpty()) return nullptr; uint32_t hash = FX_HashCode_GetW(wsName, false); auto* elem = std::lower_bound( std::begin(PacketData), std::end(PacketData), hash, [](const PacketInfo& a, uint32_t hash) { return a.hash < hash; }); if (elem != std::end(PacketData) && elem->hash == hash) return elem; return nullptr; } CFX_XMLNode* GetDocumentNode(CFX_XMLDoc* pXMLDoc) { if (!pXMLDoc) return nullptr; for (CFX_XMLNode* pXMLNode = pXMLDoc->GetRoot()->GetFirstChild(); pXMLNode; pXMLNode = pXMLNode->GetNextSibling()) { if (pXMLNode->GetType() != FX_XMLNODE_Element) continue; return pXMLNode; } return nullptr; } WideString GetElementTagNamespaceURI(CFX_XMLElement* pElement) { WideString wsNodeStr = pElement->GetNamespacePrefix(); WideString wsNamespaceURI; if (!XFA_FDEExtension_ResolveNamespaceQualifier(pElement, wsNodeStr, &wsNamespaceURI)) { return WideString(); } return wsNamespaceURI; } bool MatchNodeName(CFX_XMLNode* pNode, const WideStringView& wsLocalTagName, const WideStringView& wsNamespaceURIPrefix, uint32_t eMatchFlags = XFA_XDPPACKET_FLAGS_NOMATCH) { if (!pNode || pNode->GetType() != FX_XMLNODE_Element) return false; CFX_XMLElement* pElement = reinterpret_cast(pNode); WideString wsNodeStr = pElement->GetLocalTagName(); if (wsNodeStr != wsLocalTagName) return false; wsNodeStr = GetElementTagNamespaceURI(pElement); if (eMatchFlags & XFA_XDPPACKET_FLAGS_NOMATCH) return true; if (eMatchFlags & XFA_XDPPACKET_FLAGS_PREFIXMATCH) { return wsNodeStr.Left(wsNamespaceURIPrefix.GetLength()) == wsNamespaceURIPrefix; } return wsNodeStr == wsNamespaceURIPrefix; } bool GetAttributeLocalName(const WideStringView& wsAttributeName, WideString& wsLocalAttrName) { WideString wsAttrName(wsAttributeName); auto pos = wsAttrName.Find(L':', 0); if (!pos.has_value()) { wsLocalAttrName = wsAttrName; return false; } wsLocalAttrName = wsAttrName.Right(wsAttrName.GetLength() - pos.value() - 1); return true; } bool ResolveAttribute(CFX_XMLElement* pElement, const WideString& wsAttrName, WideString& wsLocalAttrName, WideString& wsNamespaceURI) { WideString wsNSPrefix; if (GetAttributeLocalName(wsAttrName.AsStringView(), wsLocalAttrName)) { wsNSPrefix = wsAttrName.Left(wsAttrName.GetLength() - wsLocalAttrName.GetLength() - 1); } if (wsLocalAttrName == L"xmlns" || wsNSPrefix == L"xmlns" || wsNSPrefix == L"xml") { return false; } if (!XFA_FDEExtension_ResolveNamespaceQualifier(pElement, wsNSPrefix, &wsNamespaceURI)) { wsNamespaceURI.clear(); return false; } return true; } bool FindAttributeWithNS(CFX_XMLElement* pElement, const WideStringView& wsLocalAttributeName, const WideStringView& wsNamespaceURIPrefix, WideString& wsValue, bool bMatchNSAsPrefix = false) { if (!pElement) return false; WideString wsAttrNS; for (auto it : pElement->GetAttributes()) { auto pos = it.first.Find(L':', 0); WideString wsNSPrefix; if (!pos.has_value()) { if (wsLocalAttributeName != it.first) continue; } else { if (wsLocalAttributeName != it.first.Right(it.first.GetLength() - pos.value() - 1)) { continue; } wsNSPrefix = it.first.Left(pos.value()); } if (!XFA_FDEExtension_ResolveNamespaceQualifier(pElement, wsNSPrefix, &wsAttrNS)) { continue; } if (bMatchNSAsPrefix) { if (wsAttrNS.Left(wsNamespaceURIPrefix.GetLength()) != wsNamespaceURIPrefix) { continue; } } else { if (wsAttrNS != wsNamespaceURIPrefix) continue; } wsValue = it.second; return true; } return false; } CFX_XMLNode* GetDataSetsFromXDP(CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* datasets_packet = GetPacketByIndex(XFA_PacketType::Datasets); if (MatchNodeName(pXMLDocumentNode, datasets_packet->name, datasets_packet->uri, datasets_packet->flags)) { return pXMLDocumentNode; } const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Xdp); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } for (CFX_XMLNode* pDatasetsNode = pXMLDocumentNode->GetFirstChild(); pDatasetsNode; pDatasetsNode = pDatasetsNode->GetNextSibling()) { if (MatchNodeName(pDatasetsNode, datasets_packet->name, datasets_packet->uri, datasets_packet->flags)) { return pDatasetsNode; } } return nullptr; } bool IsStringAllWhitespace(WideString wsText) { wsText.TrimRight(L"\x20\x9\xD\xA"); return wsText.IsEmpty(); } void ConvertXMLToPlainText(CFX_XMLElement* pRootXMLNode, WideString& wsOutput) { for (CFX_XMLNode* pXMLChild = pRootXMLNode->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { switch (pXMLChild->GetType()) { case FX_XMLNODE_Element: { WideString wsTextData = static_cast(pXMLChild)->GetTextData(); wsTextData += L"\n"; wsOutput += wsTextData; break; } case FX_XMLNODE_Text: case FX_XMLNODE_CharData: { WideString wsText = static_cast(pXMLChild)->GetText(); if (IsStringAllWhitespace(wsText)) continue; wsOutput = wsText; break; } default: NOTREACHED(); break; } } } WideString GetPlainTextFromRichText(CFX_XMLNode* pXMLNode) { if (!pXMLNode) return L""; WideString wsPlainText; switch (pXMLNode->GetType()) { case FX_XMLNODE_Element: { CFX_XMLElement* pXMLElement = static_cast(pXMLNode); WideString wsTag = pXMLElement->GetLocalTagName(); uint32_t uTag = FX_HashCode_GetW(wsTag.AsStringView(), true); if (uTag == 0x0001f714) { wsPlainText += L"\n"; } else if (uTag == 0x00000070) { if (!wsPlainText.IsEmpty()) { wsPlainText += L"\n"; } } else if (uTag == 0xa48ac63) { if (!wsPlainText.IsEmpty() && wsPlainText[wsPlainText.GetLength() - 1] != '\n') { wsPlainText += L"\n"; } } break; } case FX_XMLNODE_Text: case FX_XMLNODE_CharData: { WideString wsContent = static_cast(pXMLNode)->GetText(); wsPlainText += wsContent; break; } default: break; } for (CFX_XMLNode* pChildXML = pXMLNode->GetFirstChild(); pChildXML; pChildXML = pChildXML->GetNextSibling()) { wsPlainText += GetPlainTextFromRichText(pChildXML); } return wsPlainText; } } // namespace bool XFA_RecognizeRichText(CFX_XMLElement* pRichTextXMLNode) { return pRichTextXMLNode && GetElementTagNamespaceURI(pRichTextXMLNode) == L"http://www.w3.org/1999/xhtml"; } CXFA_SimpleParser::CXFA_SimpleParser() : m_bDocumentParser(true) {} CXFA_SimpleParser::CXFA_SimpleParser(CXFA_Document* pFactory) : m_pFactory(pFactory), m_bDocumentParser(false) {} CXFA_SimpleParser::~CXFA_SimpleParser() {} void CXFA_SimpleParser::SetFactory(CXFA_Document* pFactory) { ASSERT(m_bDocumentParser); m_pFactory = pFactory; } void CXFA_SimpleParser::StartParse(const RetainPtr& pStream, XFA_PacketType ePacketID) { CloseParser(); m_pFileRead = pStream; m_pStream = pdfium::MakeRetain(pStream, false); uint16_t wCodePage = m_pStream->GetCodePage(); if (wCodePage != FX_CODEPAGE_UTF16LE && wCodePage != FX_CODEPAGE_UTF16BE && wCodePage != FX_CODEPAGE_UTF8) { m_pStream->SetCodePage(FX_CODEPAGE_UTF8); } m_pXMLDoc = pdfium::MakeUnique(m_pStream); m_bParseStarted = true; m_ePacketID = ePacketID; } int32_t CXFA_SimpleParser::DoParse() { if (!m_pXMLDoc || !m_bParseStarted) return XFA_PARSESTATUS_StatusErr; int32_t iRet = m_pXMLDoc->DoLoad(); if (iRet < 0) return XFA_PARSESTATUS_SyntaxErr; if (iRet < 100) return iRet / 2; m_pRootNode = ParseAsXDPPacket(GetDocumentNode(m_pXMLDoc.get()), m_ePacketID); m_pXMLDoc->CloseXML(); m_pStream.Reset(); if (!m_pRootNode) return XFA_PARSESTATUS_StatusErr; return XFA_PARSESTATUS_Done; } CFX_XMLNode* CXFA_SimpleParser::ParseXMLData(const ByteString& wsXML) { CloseParser(); auto pStream = pdfium::MakeRetain( const_cast(wsXML.raw_str()), wsXML.GetLength()); m_pXMLDoc = pdfium::MakeUnique(pStream); m_pXMLDoc->GetParser()->m_dwCheckStatus = 0x03; int32_t iRet = m_pXMLDoc->DoLoad(); if (iRet < 0 || iRet >= 100) m_pXMLDoc->CloseXML(); return iRet < 100 ? nullptr : GetDocumentNode(m_pXMLDoc.get()); } void CXFA_SimpleParser::ConstructXFANode(CXFA_Node* pXFANode, CFX_XMLNode* pXMLNode) { XFA_PacketType ePacketID = pXFANode->GetPacketType(); if (ePacketID == XFA_PacketType::Datasets) { if (pXFANode->GetElementType() == XFA_Element::DataValue) { for (CFX_XMLNode* pXMLChild = pXMLNode->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { FX_XMLNODETYPE eNodeType = pXMLChild->GetType(); if (eNodeType == FX_XMLNODE_Instruction) continue; if (eNodeType == FX_XMLNODE_Element) { CXFA_Node* pXFAChild = m_pFactory->CreateNode( XFA_PacketType::Datasets, XFA_Element::DataValue); if (!pXFAChild) return; CFX_XMLElement* child = static_cast(pXMLChild); WideString wsNodeStr = child->GetLocalTagName(); pXFAChild->JSObject()->SetCData(XFA_Attribute::Name, wsNodeStr, false, false); WideString wsChildValue = GetPlainTextFromRichText(child); if (!wsChildValue.IsEmpty()) pXFAChild->JSObject()->SetCData(XFA_Attribute::Value, wsChildValue, false, false); pXFANode->InsertChild(pXFAChild, nullptr); pXFAChild->SetXMLMappingNode(pXMLChild); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); break; } } m_pRootNode = pXFANode; } else { m_pRootNode = DataLoader(pXFANode, pXMLNode, true); } } else if (pXFANode->IsContentNode()) { ParseContentNode(pXFANode, pXMLNode, ePacketID); m_pRootNode = pXFANode; } else { m_pRootNode = NormalLoader(pXFANode, pXMLNode, ePacketID, true); } } CXFA_Node* CXFA_SimpleParser::GetRootNode() const { return m_pRootNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket(CFX_XMLNode* pXMLDocumentNode, XFA_PacketType ePacketID) { switch (ePacketID) { case XFA_PacketType::Xdp: return ParseAsXDPPacket_XDP(pXMLDocumentNode); case XFA_PacketType::Config: return ParseAsXDPPacket_Config(pXMLDocumentNode); case XFA_PacketType::Template: return ParseAsXDPPacket_Template(pXMLDocumentNode); case XFA_PacketType::Form: return ParseAsXDPPacket_Form(pXMLDocumentNode); case XFA_PacketType::Datasets: return ParseAsXDPPacket_Data(pXMLDocumentNode); case XFA_PacketType::Xdc: return ParseAsXDPPacket_Xdc(pXMLDocumentNode); case XFA_PacketType::LocaleSet: return ParseAsXDPPacket_LocaleConnectionSourceSet( pXMLDocumentNode, XFA_PacketType::LocaleSet, XFA_Element::LocaleSet); case XFA_PacketType::ConnectionSet: return ParseAsXDPPacket_LocaleConnectionSourceSet( pXMLDocumentNode, XFA_PacketType::ConnectionSet, XFA_Element::ConnectionSet); case XFA_PacketType::SourceSet: return ParseAsXDPPacket_LocaleConnectionSourceSet( pXMLDocumentNode, XFA_PacketType::SourceSet, XFA_Element::SourceSet); default: return ParseAsXDPPacket_User(pXMLDocumentNode); } } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_XDP( CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Xdp); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } CXFA_Node* pXFARootNode = m_pFactory->CreateNode(XFA_PacketType::Xdp, XFA_Element::Xfa); if (!pXFARootNode) return nullptr; m_pRootNode = pXFARootNode; pXFARootNode->JSObject()->SetCData(XFA_Attribute::Name, L"xfa", false, false); CFX_XMLElement* pElement = static_cast(pXMLDocumentNode); for (auto it : pElement->GetAttributes()) { if (it.first == L"uuid") pXFARootNode->JSObject()->SetCData(XFA_Attribute::Uuid, it.second, false, false); else if (it.first == L"timeStamp") pXFARootNode->JSObject()->SetCData(XFA_Attribute::TimeStamp, it.second, false, false); } CFX_XMLNode* pXMLConfigDOMRoot = nullptr; CXFA_Node* pXFAConfigDOMRoot = nullptr; for (CFX_XMLNode* pChildItem = pXMLDocumentNode->GetFirstChild(); pChildItem; pChildItem = pChildItem->GetNextSibling()) { const PacketInfo* pPacketInfo = GetPacketByIndex(XFA_PacketType::Config); if (!MatchNodeName(pChildItem, pPacketInfo->name, pPacketInfo->uri, pPacketInfo->flags)) { continue; } if (pXFARootNode->GetFirstChildByName(pPacketInfo->hash)) return nullptr; pXMLConfigDOMRoot = pChildItem; pXFAConfigDOMRoot = ParseAsXDPPacket_Config(pXMLConfigDOMRoot); if (pXFAConfigDOMRoot) pXFARootNode->InsertChild(pXFAConfigDOMRoot, nullptr); } CFX_XMLNode* pXMLDatasetsDOMRoot = nullptr; CFX_XMLNode* pXMLFormDOMRoot = nullptr; CFX_XMLNode* pXMLTemplateDOMRoot = nullptr; for (CFX_XMLNode* pChildItem = pXMLDocumentNode->GetFirstChild(); pChildItem; pChildItem = pChildItem->GetNextSibling()) { if (!pChildItem || pChildItem->GetType() != FX_XMLNODE_Element) continue; if (pChildItem == pXMLConfigDOMRoot) continue; CFX_XMLElement* pElement = reinterpret_cast(pChildItem); WideString wsPacketName = pElement->GetLocalTagName(); const PacketInfo* pPacketInfo = GetPacketByName(wsPacketName.AsStringView()); if (pPacketInfo && pPacketInfo->uri) { if (!MatchNodeName(pElement, pPacketInfo->name, pPacketInfo->uri, pPacketInfo->flags)) { pPacketInfo = nullptr; } } XFA_PacketType ePacket = pPacketInfo ? pPacketInfo->packet_type : XFA_PacketType::User; if (ePacket == XFA_PacketType::Xdp) continue; if (ePacket == XFA_PacketType::Datasets) { if (pXMLDatasetsDOMRoot) return nullptr; pXMLDatasetsDOMRoot = pElement; } else if (ePacket == XFA_PacketType::Form) { if (pXMLFormDOMRoot) return nullptr; pXMLFormDOMRoot = pElement; } else if (ePacket == XFA_PacketType::Template) { // Found a duplicate template packet. if (pXMLTemplateDOMRoot) return nullptr; CXFA_Node* pPacketNode = ParseAsXDPPacket(pElement, ePacket); if (pPacketNode) { pXMLTemplateDOMRoot = pElement; pXFARootNode->InsertChild(pPacketNode, nullptr); } } else { CXFA_Node* pPacketNode = ParseAsXDPPacket(pElement, ePacket); if (pPacketNode) { if (pPacketInfo && (pPacketInfo->flags & XFA_XDPPACKET_FLAGS_SUPPORTONE) && pXFARootNode->GetFirstChildByName(pPacketInfo->hash)) { return nullptr; } pXFARootNode->InsertChild(pPacketNode, nullptr); } } } // No template is found. if (!pXMLTemplateDOMRoot) return nullptr; if (pXMLDatasetsDOMRoot) { CXFA_Node* pPacketNode = ParseAsXDPPacket(pXMLDatasetsDOMRoot, XFA_PacketType::Datasets); if (pPacketNode) pXFARootNode->InsertChild(pPacketNode, nullptr); } if (pXMLFormDOMRoot) { CXFA_Node* pPacketNode = ParseAsXDPPacket(pXMLFormDOMRoot, XFA_PacketType::Form); if (pPacketNode) pXFARootNode->InsertChild(pPacketNode, nullptr); } pXFARootNode->SetXMLMappingNode(pXMLDocumentNode); return pXFARootNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_Config( CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Config); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Config, XFA_Element::Config); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); if (!NormalLoader(pNode, pXMLDocumentNode, XFA_PacketType::Config, true)) return nullptr; pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_Template( CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Template); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Template, XFA_Element::Template); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); if (m_bDocumentParser) { CFX_XMLElement* pXMLDocumentElement = static_cast(pXMLDocumentNode); WideString wsNamespaceURI = pXMLDocumentElement->GetNamespaceURI(); if (wsNamespaceURI.IsEmpty()) wsNamespaceURI = pXMLDocumentElement->GetString(L"xmlns:xfa"); pNode->GetDocument()->RecognizeXFAVersionNumber(wsNamespaceURI); } if (!NormalLoader(pNode, pXMLDocumentNode, XFA_PacketType::Template, true)) return nullptr; pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_Form( CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Form); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } CFX_XMLElement* pXMLDocumentElement = static_cast(pXMLDocumentNode); WideString wsChecksum = pXMLDocumentElement->GetString(L"checksum"); if (wsChecksum.GetLength() != 28 || m_pXMLDoc->GetParser()->m_dwCheckStatus != 0x03) { return nullptr; } auto pChecksum = pdfium::MakeUnique(); pChecksum->StartChecksum(); pChecksum->UpdateChecksum(m_pFileRead, m_pXMLDoc->GetParser()->m_nStart[0], m_pXMLDoc->GetParser()->m_nSize[0]); pChecksum->UpdateChecksum(m_pFileRead, m_pXMLDoc->GetParser()->m_nStart[1], m_pXMLDoc->GetParser()->m_nSize[1]); pChecksum->FinishChecksum(); ByteString bsCheck = pChecksum->GetChecksum(); if (bsCheck != wsChecksum.UTF8Encode()) return nullptr; CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Form, XFA_Element::Form); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); pNode->JSObject()->SetAttribute(XFA_Attribute::Checksum, wsChecksum.AsStringView(), false); CXFA_Template* pTemplateRoot = m_pRootNode->GetFirstChildByClass(XFA_Element::Template); CXFA_Subform* pTemplateChosen = pTemplateRoot ? pTemplateRoot->GetFirstChildByClass( XFA_Element::Subform) : nullptr; bool bUseAttribute = true; if (pTemplateChosen && pTemplateChosen->JSObject()->GetEnum(XFA_Attribute::RestoreState) != XFA_AttributeEnum::Auto) { bUseAttribute = false; } if (!NormalLoader(pNode, pXMLDocumentNode, XFA_PacketType::Form, bUseAttribute)) return nullptr; pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_Data( CFX_XMLNode* pXMLDocumentNode) { CFX_XMLNode* pDatasetsXMLNode = GetDataSetsFromXDP(pXMLDocumentNode); const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Datasets); if (pDatasetsXMLNode) { CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Datasets, XFA_Element::DataModel); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); if (!DataLoader(pNode, pDatasetsXMLNode, false)) return nullptr; pNode->SetXMLMappingNode(pDatasetsXMLNode); return pNode; } MaybeOwned pDataXMLNode; if (MatchNodeName(pXMLDocumentNode, L"data", packet->uri, packet->flags)) { static_cast(pXMLDocumentNode) ->RemoveAttribute(L"xmlns:xfa"); pDataXMLNode.Reset(pXMLDocumentNode); } else { auto pDataElement = pdfium::MakeUnique(L"xfa:data"); CFX_XMLNode* pParentXMLNode = pXMLDocumentNode->GetParent(); if (pParentXMLNode) pParentXMLNode->RemoveChildNode(pXMLDocumentNode); ASSERT(pXMLDocumentNode->GetType() == FX_XMLNODE_Element); if (pXMLDocumentNode->GetType() == FX_XMLNODE_Element) { static_cast(pXMLDocumentNode) ->RemoveAttribute(L"xmlns:xfa"); } pDataElement->AppendChild(pXMLDocumentNode); pDataXMLNode.Reset(std::move(pDataElement)); } if (!pDataXMLNode) return nullptr; CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Datasets, XFA_Element::DataGroup); if (!pNode) return nullptr; WideString wsLocalName = static_cast(pDataXMLNode.Get())->GetLocalTagName(); pNode->JSObject()->SetCData(XFA_Attribute::Name, wsLocalName, false, false); if (!DataLoader(pNode, pDataXMLNode.Get(), true)) return nullptr; pNode->SetXMLMappingNode(std::move(pDataXMLNode)); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_LocaleConnectionSourceSet( CFX_XMLNode* pXMLDocumentNode, XFA_PacketType packet_type, XFA_Element element) { const PacketInfo* packet = GetPacketByIndex(packet_type); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) { return nullptr; } CXFA_Node* pNode = m_pFactory->CreateNode(packet_type, element); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); if (!NormalLoader(pNode, pXMLDocumentNode, packet_type, true)) return nullptr; pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_Xdc( CFX_XMLNode* pXMLDocumentNode) { const PacketInfo* packet = GetPacketByIndex(XFA_PacketType::Xdc); if (!MatchNodeName(pXMLDocumentNode, packet->name, packet->uri, packet->flags)) return nullptr; CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Xdc, XFA_Element::Xdc); if (!pNode) return nullptr; pNode->JSObject()->SetCData(XFA_Attribute::Name, packet->name, false, false); pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::ParseAsXDPPacket_User( CFX_XMLNode* pXMLDocumentNode) { CXFA_Node* pNode = m_pFactory->CreateNode(XFA_PacketType::Xdp, XFA_Element::Packet); if (!pNode) return nullptr; WideString wsName = static_cast(pXMLDocumentNode)->GetLocalTagName(); pNode->JSObject()->SetCData(XFA_Attribute::Name, wsName, false, false); if (!UserPacketLoader(pNode, pXMLDocumentNode)) return nullptr; pNode->SetXMLMappingNode(pXMLDocumentNode); return pNode; } CXFA_Node* CXFA_SimpleParser::UserPacketLoader(CXFA_Node* pXFANode, CFX_XMLNode* pXMLDoc) { return pXFANode; } CXFA_Node* CXFA_SimpleParser::DataLoader(CXFA_Node* pXFANode, CFX_XMLNode* pXMLDoc, bool bDoTransform) { ParseDataGroup(pXFANode, pXMLDoc, XFA_PacketType::Datasets); return pXFANode; } CXFA_Node* CXFA_SimpleParser::NormalLoader(CXFA_Node* pXFANode, CFX_XMLNode* pXMLDoc, XFA_PacketType ePacketID, bool bUseAttribute) { bool bOneOfPropertyFound = false; for (CFX_XMLNode* pXMLChild = pXMLDoc->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { switch (pXMLChild->GetType()) { case FX_XMLNODE_Element: { CFX_XMLElement* pXMLElement = static_cast(pXMLChild); WideString wsTagName = pXMLElement->GetLocalTagName(); XFA_Element eType = CXFA_Node::NameToElement(wsTagName); if (eType == XFA_Element::Unknown) continue; if (pXFANode->HasPropertyFlags( eType, XFA_PROPERTYFLAG_OneOf | XFA_PROPERTYFLAG_DefaultOneOf)) { if (bOneOfPropertyFound) break; bOneOfPropertyFound = true; } CXFA_Node* pXFAChild = m_pFactory->CreateNode(ePacketID, eType); if (!pXFAChild) return nullptr; if (ePacketID == XFA_PacketType::Config) { pXFAChild->JSObject()->SetAttribute(XFA_Attribute::Name, wsTagName.AsStringView(), false); } bool IsNeedValue = true; for (auto it : pXMLElement->GetAttributes()) { WideString wsAttrName; GetAttributeLocalName(it.first.AsStringView(), wsAttrName); if (wsAttrName == L"nil" && it.second == L"true") IsNeedValue = false; XFA_Attribute attr = CXFA_Node::NameToAttribute(wsAttrName.AsStringView()); if (attr == XFA_Attribute::Unknown) continue; if (!bUseAttribute && attr != XFA_Attribute::Name && attr != XFA_Attribute::Save) { continue; } pXFAChild->JSObject()->SetAttribute(attr, it.second.AsStringView(), false); } pXFANode->InsertChild(pXFAChild, nullptr); if (eType == XFA_Element::Validate || eType == XFA_Element::Locale) { if (ePacketID == XFA_PacketType::Config) ParseContentNode(pXFAChild, pXMLElement, ePacketID); else NormalLoader(pXFAChild, pXMLElement, ePacketID, bUseAttribute); break; } switch (pXFAChild->GetObjectType()) { case XFA_ObjectType::ContentNode: case XFA_ObjectType::TextNode: case XFA_ObjectType::NodeC: case XFA_ObjectType::NodeV: if (IsNeedValue) ParseContentNode(pXFAChild, pXMLElement, ePacketID); break; default: NormalLoader(pXFAChild, pXMLElement, ePacketID, bUseAttribute); break; } } break; case FX_XMLNODE_Instruction: ParseInstruction(pXFANode, static_cast(pXMLChild), ePacketID); break; default: break; } } return pXFANode; } void CXFA_SimpleParser::ParseContentNode(CXFA_Node* pXFANode, CFX_XMLNode* pXMLNode, XFA_PacketType ePacketID) { XFA_Element element = XFA_Element::Sharptext; if (pXFANode->GetElementType() == XFA_Element::ExData) { WideString wsContentType = pXFANode->JSObject()->GetCData(XFA_Attribute::ContentType); if (wsContentType == L"text/html") element = XFA_Element::SharpxHTML; else if (wsContentType == L"text/xml") element = XFA_Element::Sharpxml; } if (element == XFA_Element::SharpxHTML) pXFANode->SetXMLMappingNode(pXMLNode); WideString wsValue; for (CFX_XMLNode* pXMLChild = pXMLNode->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { FX_XMLNODETYPE eNodeType = pXMLChild->GetType(); if (eNodeType == FX_XMLNODE_Instruction) continue; if (element == XFA_Element::SharpxHTML) { if (eNodeType != FX_XMLNODE_Element) break; if (XFA_RecognizeRichText(static_cast(pXMLChild))) wsValue += GetPlainTextFromRichText(static_cast(pXMLChild)); } else if (element == XFA_Element::Sharpxml) { if (eNodeType != FX_XMLNODE_Element) break; ConvertXMLToPlainText(static_cast(pXMLChild), wsValue); } else { if (eNodeType == FX_XMLNODE_Element) break; if (eNodeType == FX_XMLNODE_Text || eNodeType == FX_XMLNODE_CharData) wsValue = static_cast(pXMLChild)->GetText(); } break; } if (!wsValue.IsEmpty()) { if (pXFANode->IsContentNode()) { CXFA_Node* pContentRawDataNode = m_pFactory->CreateNode(ePacketID, element); ASSERT(pContentRawDataNode); pContentRawDataNode->JSObject()->SetCData(XFA_Attribute::Value, wsValue, false, false); pXFANode->InsertChild(pContentRawDataNode, nullptr); } else { pXFANode->JSObject()->SetCData(XFA_Attribute::Value, wsValue, false, false); } } } void CXFA_SimpleParser::ParseDataGroup(CXFA_Node* pXFANode, CFX_XMLNode* pXMLNode, XFA_PacketType ePacketID) { for (CFX_XMLNode* pXMLChild = pXMLNode->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { switch (pXMLChild->GetType()) { case FX_XMLNODE_Element: { CFX_XMLElement* pXMLElement = static_cast(pXMLChild); { WideString wsNamespaceURI = GetElementTagNamespaceURI(pXMLElement); if (wsNamespaceURI == L"http://www.xfa.com/schema/xfa-package/" || wsNamespaceURI == L"http://www.xfa.org/schema/xfa-package/" || wsNamespaceURI == L"http://www.w3.org/2001/XMLSchema-instance") { continue; } } XFA_Element eNodeType = XFA_Element::DataModel; if (eNodeType == XFA_Element::DataModel) { WideString wsDataNodeAttr; if (FindAttributeWithNS(pXMLElement, L"dataNode", L"http://www.xfa.org/schema/xfa-data/1.0/", wsDataNodeAttr)) { if (wsDataNodeAttr == L"dataGroup") eNodeType = XFA_Element::DataGroup; else if (wsDataNodeAttr == L"dataValue") eNodeType = XFA_Element::DataValue; } } WideString wsContentType; if (eNodeType == XFA_Element::DataModel) { if (FindAttributeWithNS(pXMLElement, L"contentType", L"http://www.xfa.org/schema/xfa-data/1.0/", wsContentType)) { if (!wsContentType.IsEmpty()) eNodeType = XFA_Element::DataValue; } } if (eNodeType == XFA_Element::DataModel) { for (CFX_XMLNode* pXMLDataChild = pXMLElement->GetFirstChild(); pXMLDataChild; pXMLDataChild = pXMLDataChild->GetNextSibling()) { if (pXMLDataChild->GetType() == FX_XMLNODE_Element) { if (!XFA_RecognizeRichText( static_cast(pXMLDataChild))) { eNodeType = XFA_Element::DataGroup; break; } } } } if (eNodeType == XFA_Element::DataModel) eNodeType = XFA_Element::DataValue; CXFA_Node* pXFAChild = m_pFactory->CreateNode(XFA_PacketType::Datasets, eNodeType); if (!pXFAChild) return; pXFAChild->JSObject()->SetCData( XFA_Attribute::Name, pXMLElement->GetLocalTagName(), false, false); bool bNeedValue = true; for (auto it : pXMLElement->GetAttributes()) { WideString wsName; WideString wsNS; if (!ResolveAttribute(pXMLElement, it.first, wsName, wsNS)) { continue; } if (wsName == L"nil" && it.second == L"true") { bNeedValue = false; continue; } if (wsNS == L"http://www.xfa.com/schema/xfa-package/" || wsNS == L"http://www.xfa.org/schema/xfa-package/" || wsNS == L"http://www.w3.org/2001/XMLSchema-instance" || wsNS == L"http://www.xfa.org/schema/xfa-data/1.0/") { continue; } CXFA_Node* pXFAMetaData = m_pFactory->CreateNode( XFA_PacketType::Datasets, XFA_Element::DataValue); if (!pXFAMetaData) return; pXFAMetaData->JSObject()->SetCData(XFA_Attribute::Name, wsName, false, false); pXFAMetaData->JSObject()->SetCData(XFA_Attribute::QualifiedName, it.first, false, false); pXFAMetaData->JSObject()->SetCData(XFA_Attribute::Value, it.second, false, false); pXFAMetaData->JSObject()->SetEnum(XFA_Attribute::Contains, XFA_AttributeEnum::MetaData, false); pXFAChild->InsertChild(pXFAMetaData, nullptr); pXFAMetaData->SetXMLMappingNode(pXMLElement); pXFAMetaData->SetFlag(XFA_NodeFlag_Initialized); } if (!bNeedValue) { WideString wsNilName(L"xsi:nil"); pXMLElement->RemoveAttribute(wsNilName.c_str()); } pXFANode->InsertChild(pXFAChild, nullptr); if (eNodeType == XFA_Element::DataGroup) ParseDataGroup(pXFAChild, pXMLElement, ePacketID); else if (bNeedValue) ParseDataValue(pXFAChild, pXMLChild, XFA_PacketType::Datasets); pXFAChild->SetXMLMappingNode(pXMLElement); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); continue; } case FX_XMLNODE_CharData: case FX_XMLNODE_Text: { CFX_XMLText* pXMLText = static_cast(pXMLChild); WideString wsText = pXMLText->GetText(); if (IsStringAllWhitespace(wsText)) continue; CXFA_Node* pXFAChild = m_pFactory->CreateNode(XFA_PacketType::Datasets, XFA_Element::DataValue); if (!pXFAChild) return; pXFAChild->JSObject()->SetCData(XFA_Attribute::Value, wsText, false, false); pXFANode->InsertChild(pXFAChild, nullptr); pXFAChild->SetXMLMappingNode(pXMLText); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); continue; } default: continue; } } } void CXFA_SimpleParser::ParseDataValue(CXFA_Node* pXFANode, CFX_XMLNode* pXMLNode, XFA_PacketType ePacketID) { CFX_WideTextBuf wsValueTextBuf; CFX_WideTextBuf wsCurValueTextBuf; bool bMarkAsCompound = false; CFX_XMLNode* pXMLCurValueNode = nullptr; for (CFX_XMLNode* pXMLChild = pXMLNode->GetFirstChild(); pXMLChild; pXMLChild = pXMLChild->GetNextSibling()) { FX_XMLNODETYPE eNodeType = pXMLChild->GetType(); if (eNodeType == FX_XMLNODE_Instruction) continue; if (eNodeType == FX_XMLNODE_Text || eNodeType == FX_XMLNODE_CharData) { WideString wsText = static_cast(pXMLChild)->GetText(); if (!pXMLCurValueNode) pXMLCurValueNode = pXMLChild; wsCurValueTextBuf << wsText; } else if (XFA_RecognizeRichText(static_cast(pXMLChild))) { WideString wsText = GetPlainTextFromRichText(static_cast(pXMLChild)); if (!pXMLCurValueNode) pXMLCurValueNode = pXMLChild; wsCurValueTextBuf << wsText; } else { bMarkAsCompound = true; if (pXMLCurValueNode) { WideString wsCurValue = wsCurValueTextBuf.MakeString(); if (!wsCurValue.IsEmpty()) { CXFA_Node* pXFAChild = m_pFactory->CreateNode(ePacketID, XFA_Element::DataValue); if (!pXFAChild) return; pXFAChild->JSObject()->SetCData(XFA_Attribute::Name, L"", false, false); pXFAChild->JSObject()->SetCData(XFA_Attribute::Value, wsCurValue, false, false); pXFANode->InsertChild(pXFAChild, nullptr); pXFAChild->SetXMLMappingNode(pXMLCurValueNode); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); wsValueTextBuf << wsCurValue; wsCurValueTextBuf.Clear(); } pXMLCurValueNode = nullptr; } CXFA_Node* pXFAChild = m_pFactory->CreateNode(ePacketID, XFA_Element::DataValue); if (!pXFAChild) return; WideString wsNodeStr = static_cast(pXMLChild)->GetLocalTagName(); pXFAChild->JSObject()->SetCData(XFA_Attribute::Name, wsNodeStr, false, false); ParseDataValue(pXFAChild, pXMLChild, ePacketID); pXFANode->InsertChild(pXFAChild, nullptr); pXFAChild->SetXMLMappingNode(pXMLChild); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); WideString wsCurValue = pXFAChild->JSObject()->GetCData(XFA_Attribute::Value); wsValueTextBuf << wsCurValue; } } if (pXMLCurValueNode) { WideString wsCurValue = wsCurValueTextBuf.MakeString(); if (!wsCurValue.IsEmpty()) { if (bMarkAsCompound) { CXFA_Node* pXFAChild = m_pFactory->CreateNode(ePacketID, XFA_Element::DataValue); if (!pXFAChild) return; pXFAChild->JSObject()->SetCData(XFA_Attribute::Name, L"", false, false); pXFAChild->JSObject()->SetCData(XFA_Attribute::Value, wsCurValue, false, false); pXFANode->InsertChild(pXFAChild, nullptr); pXFAChild->SetXMLMappingNode(pXMLCurValueNode); pXFAChild->SetFlag(XFA_NodeFlag_Initialized); } wsValueTextBuf << wsCurValue; wsCurValueTextBuf.Clear(); } pXMLCurValueNode = nullptr; } WideString wsNodeValue = wsValueTextBuf.MakeString(); pXFANode->JSObject()->SetCData(XFA_Attribute::Value, wsNodeValue, false, false); } void CXFA_SimpleParser::ParseInstruction(CXFA_Node* pXFANode, CFX_XMLInstruction* pXMLInstruction, XFA_PacketType ePacketID) { if (!m_bDocumentParser) return; WideString wsTargetName = pXMLInstruction->GetName(); const std::vector& target_data = pXMLInstruction->GetTargetData(); if (wsTargetName == L"originalXFAVersion") { if (target_data.size() > 1 && (pXFANode->GetDocument()->RecognizeXFAVersionNumber(target_data[0]) != XFA_VERSION_UNKNOWN) && target_data[1] == L"v2.7-scripting:1") { pXFANode->GetDocument()->SetFlag(XFA_DOCFLAG_Scripting, true); } } else if (wsTargetName == L"acrobat") { if (target_data.size() > 1 && target_data[0] == L"JavaScript" && target_data[1] == L"strictScoping") { pXFANode->GetDocument()->SetFlag(XFA_DOCFLAG_StrictScoping, true); } } } void CXFA_SimpleParser::CloseParser() { m_pXMLDoc.reset(); m_pStream.Reset(); }