diff options
-rw-r--r-- | BUILD.gn | 4 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_xmlparser.cpp | 709 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_xmlparser.h | 112 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_xmlparser_unittest.cpp (renamed from core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp) | 130 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_xmlsyntaxparser.cpp | 694 | ||||
-rw-r--r-- | core/fxcrt/xml/cfx_xmlsyntaxparser.h | 130 |
6 files changed, 888 insertions, 891 deletions
@@ -963,8 +963,6 @@ jumbo_static_library("fxcrt") { "core/fxcrt/xml/cfx_xmlnode.h", "core/fxcrt/xml/cfx_xmlparser.cpp", "core/fxcrt/xml/cfx_xmlparser.h", - "core/fxcrt/xml/cfx_xmlsyntaxparser.cpp", - "core/fxcrt/xml/cfx_xmlsyntaxparser.h", "core/fxcrt/xml/cfx_xmltext.cpp", "core/fxcrt/xml/cfx_xmltext.h", ] @@ -2925,7 +2923,7 @@ test("pdfium_unittests") { "core/fxcrt/css/cfx_cssdeclaration_unittest.cpp", "core/fxcrt/css/cfx_cssstylesheet_unittest.cpp", "core/fxcrt/css/cfx_cssvaluelistparser_unittest.cpp", - "core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp", + "core/fxcrt/xml/cfx_xmlparser_unittest.cpp", "fxbarcode/oned/BC_OnedCodaBarWriter_unittest.cpp", "fxbarcode/oned/BC_OnedCode128Writer_unittest.cpp", "fxbarcode/oned/BC_OnedCode39Writer_unittest.cpp", diff --git a/core/fxcrt/xml/cfx_xmlparser.cpp b/core/fxcrt/xml/cfx_xmlparser.cpp index 76ea32df6c..39196d0264 100644 --- a/core/fxcrt/xml/cfx_xmlparser.cpp +++ b/core/fxcrt/xml/cfx_xmlparser.cpp @@ -6,6 +6,12 @@ #include "core/fxcrt/xml/cfx_xmlparser.h" +#include <algorithm> +#include <cwctype> +#include <iterator> + +#include "core/fxcrt/fx_extension.h" +#include "core/fxcrt/fx_safe_types.h" #include "core/fxcrt/xml/cfx_xmlchardata.h" #include "core/fxcrt/xml/cfx_xmlelement.h" #include "core/fxcrt/xml/cfx_xmlinstruction.h" @@ -13,13 +19,115 @@ #include "core/fxcrt/xml/cfx_xmltext.h" #include "third_party/base/ptr_util.h" +namespace { + +const uint32_t kMaxCharRange = 0x10ffff; + +bool IsXMLWhiteSpace(wchar_t ch) { + return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09; +} + +struct FX_XMLNAMECHAR { + uint16_t wStart; + uint16_t wEnd; + bool bStartChar; +}; + +const FX_XMLNAMECHAR g_XMLNameChars[] = { + {L'-', L'.', false}, {L'0', L'9', false}, {L':', L':', false}, + {L'A', L'Z', true}, {L'_', L'_', true}, {L'a', L'z', true}, + {0xB7, 0xB7, false}, {0xC0, 0xD6, true}, {0xD8, 0xF6, true}, + {0xF8, 0x02FF, true}, {0x0300, 0x036F, false}, {0x0370, 0x037D, true}, + {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true}, {0x203F, 0x2040, false}, + {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true}, {0x3001, 0xD7FF, true}, + {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true}, +}; + +int32_t GetUTF8EncodeLength(const std::vector<wchar_t>& src, + FX_FILESIZE iSrcLen) { + uint32_t unicode = 0; + int32_t iDstNum = 0; + const wchar_t* pSrc = src.data(); + while (iSrcLen-- > 0) { + unicode = *pSrc++; + int nbytes = 0; + if ((uint32_t)unicode < 0x80) { + nbytes = 1; + } else if ((uint32_t)unicode < 0x800) { + nbytes = 2; + } else if ((uint32_t)unicode < 0x10000) { + nbytes = 3; + } else if ((uint32_t)unicode < 0x200000) { + nbytes = 4; + } else if ((uint32_t)unicode < 0x4000000) { + nbytes = 5; + } else { + nbytes = 6; + } + iDstNum += nbytes; + } + return iDstNum; +} + +} // namespace + +// static +bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) { + auto* it = std::lower_bound( + std::begin(g_XMLNameChars), std::end(g_XMLNameChars), ch, + [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; }); + return it != std::end(g_XMLNameChars) && ch >= it->wStart && + (!bFirstChar || it->bStartChar); +} + CFX_XMLParser::CFX_XMLParser(CFX_XMLNode* pParent, const RetainPtr<CFX_SeekableStreamProxy>& pStream) - : m_pParser(pdfium::MakeUnique<CFX_XMLSyntaxParser>(pStream)), - m_pParent(pParent), - m_pChild(nullptr) { - ASSERT(m_pParent && pStream); + : m_pParent(pParent), + m_pChild(nullptr), + m_pStream(pStream), + m_iXMLPlaneSize(1024), + m_iCurrentPos(0), + m_iCurrentNodeNum(-1), + m_iLastNodeNum(-1), + m_iParsedBytes(0), + m_ParsedChars(0), + m_iBufferChars(0), + m_bEOS(false), + m_Start(0), + m_End(0), + m_iAllocStep(m_BlockBuffer.GetAllocStep()), + m_pCurrentBlock(nullptr), + m_iIndexInBlock(0), + m_iTextDataLength(0), + m_syntaxParserResult(FX_XmlSyntaxResult::None), + m_syntaxParserState(FDE_XmlSyntaxState::Text), + m_wQuotationMark(0), + m_iEntityStart(-1) { + ASSERT(m_pParent); + ASSERT(pStream); + m_NodeStack.push(m_pParent); + + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + + m_iXMLPlaneSize = + std::min(m_iXMLPlaneSize, + pdfium::base::checked_cast<size_t>(m_pStream->GetLength())); + m_iCurrentPos = m_pStream->GetBOMLength(); + + FX_SAFE_SIZE_T alloc_size_safe = m_iXMLPlaneSize; + alloc_size_safe += 1; // For NUL. + if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return; + } + + m_Buffer.resize(pdfium::base::ValueOrDieForType<size_t>(alloc_size_safe)); + + m_BlockBuffer.InitBuffer(); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); } CFX_XMLParser::~CFX_XMLParser() {} @@ -27,7 +135,7 @@ CFX_XMLParser::~CFX_XMLParser() {} int32_t CFX_XMLParser::Parse() { int32_t iCount = 0; while (true) { - FX_XmlSyntaxResult result = m_pParser->DoSyntaxParse(); + FX_XmlSyntaxResult result = DoSyntaxParse(); if (result == FX_XmlSyntaxResult::Error) return -1; if (result == FX_XmlSyntaxResult::EndOfString) @@ -44,7 +152,7 @@ int32_t CFX_XMLParser::Parse() { if (m_pChild->GetType() != FX_XMLNODE_Element) return -1; - m_ws1 = m_pParser->GetTagName(); + m_ws1 = GetTagName(); if (m_ws1.GetLength() > 0 && m_ws1 != static_cast<CFX_XMLElement*>(m_pChild)->GetName()) { return -1; @@ -60,7 +168,7 @@ int32_t CFX_XMLParser::Parse() { iCount++; break; case FX_XmlSyntaxResult::TargetName: - m_ws1 = m_pParser->GetTargetName(); + m_ws1 = GetTargetName(); if (m_ws1 == L"originalXFAVersion" || m_ws1 == L"acrobat") { m_pChild = new CFX_XMLInstruction(m_ws1); m_pParent->AppendChild(m_pChild); @@ -70,30 +178,30 @@ int32_t CFX_XMLParser::Parse() { m_ws1.clear(); break; case FX_XmlSyntaxResult::TagName: - m_ws1 = m_pParser->GetTagName(); + m_ws1 = GetTagName(); m_pChild = new CFX_XMLElement(m_ws1); m_pParent->AppendChild(m_pChild); m_NodeStack.push(m_pChild); m_pParent = m_pChild; break; case FX_XmlSyntaxResult::AttriName: - m_ws1 = m_pParser->GetAttributeName(); + m_ws1 = GetAttributeName(); break; case FX_XmlSyntaxResult::AttriValue: if (m_pChild && m_pChild->GetType() == FX_XMLNODE_Element) { - static_cast<CFX_XMLElement*>(m_pChild)->SetString( - m_ws1, m_pParser->GetAttributeName()); + static_cast<CFX_XMLElement*>(m_pChild)->SetString(m_ws1, + GetAttributeName()); } m_ws1.clear(); break; case FX_XmlSyntaxResult::Text: - m_ws1 = m_pParser->GetTextData(); + m_ws1 = GetTextData(); m_pChild = new CFX_XMLText(m_ws1); m_pParent->AppendChild(m_pChild); m_pChild = m_pParent; break; case FX_XmlSyntaxResult::CData: - m_ws1 = m_pParser->GetTextData(); + m_ws1 = GetTextData(); m_pChild = new CFX_XMLCharData(m_ws1); m_pParent->AppendChild(m_pChild); m_pChild = m_pParent; @@ -107,7 +215,7 @@ int32_t CFX_XMLParser::Parse() { if (!m_ws1.IsEmpty()) instruction->AppendData(m_ws1); - instruction->AppendData(m_pParser->GetTargetData()); + instruction->AppendData(GetTargetData()); } m_ws1.clear(); break; @@ -118,5 +226,576 @@ int32_t CFX_XMLParser::Parse() { break; } } - return m_NodeStack.size() != 1 ? -1 : m_pParser->GetStatus(); + return m_NodeStack.size() != 1 ? -1 : GetStatus(); +} + +FX_XmlSyntaxResult CFX_XMLParser::DoSyntaxParse() { + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error || + m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) { + return m_syntaxParserResult; + } + + FX_FILESIZE iStreamLength = m_pStream->GetLength(); + FX_FILESIZE iPos; + + FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None; + while (true) { + if (m_Start >= m_End) { + if (m_bEOS || m_iCurrentPos >= iStreamLength) { + m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; + return m_syntaxParserResult; + } + m_ParsedChars += m_End; + m_iParsedBytes = m_iCurrentPos; + if (m_pStream->GetPosition() != m_iCurrentPos) + m_pStream->Seek(CFX_SeekableStreamProxy::From::Begin, m_iCurrentPos); + + m_iBufferChars = + m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS); + iPos = m_pStream->GetPosition(); + if (m_iBufferChars < 1) { + m_iCurrentPos = iStreamLength; + m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; + return m_syntaxParserResult; + } + m_iCurrentPos = iPos; + m_Start = 0; + m_End = m_iBufferChars; + } + + while (m_Start < m_End) { + wchar_t ch = m_Buffer[m_Start]; + switch (m_syntaxParserState) { + case FDE_XmlSyntaxState::Text: + if (ch == L'<') { + if (!m_BlockBuffer.IsEmpty()) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_iEntityStart = -1; + syntaxParserResult = FX_XmlSyntaxResult::Text; + } else { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::Node; + } + } else { + ParseTextChar(ch); + } + break; + case FDE_XmlSyntaxState::Node: + if (ch == L'!') { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl; + } else if (ch == L'/') { + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; + } else if (ch == L'?') { + m_iLastNodeNum++; + m_iCurrentNodeNum = m_iLastNodeNum; + m_CurNode.iNodeNum = m_iLastNodeNum; + m_CurNode.eNodeType = FX_XMLNODE_Instruction; + m_XMLNodeStack.push(m_CurNode); + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::Target; + syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen; + } else { + m_iLastNodeNum++; + m_iCurrentNodeNum = m_iLastNodeNum; + m_CurNode.iNodeNum = m_iLastNodeNum; + m_CurNode.eNodeType = FX_XMLNODE_Element; + m_XMLNodeStack.push(m_CurNode); + m_syntaxParserState = FDE_XmlSyntaxState::Tag; + syntaxParserResult = FX_XmlSyntaxResult::ElementOpen; + } + break; + case FDE_XmlSyntaxState::Target: + case FDE_XmlSyntaxState::Tag: + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if (m_BlockBuffer.IsEmpty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (m_syntaxParserState != FDE_XmlSyntaxState::Target) + syntaxParserResult = FX_XmlSyntaxResult::TagName; + else + syntaxParserResult = FX_XmlSyntaxResult::TargetName; + + m_syntaxParserState = FDE_XmlSyntaxState::AttriName; + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriName: + if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if (m_BlockBuffer.IsEmpty()) { + if (m_CurNode.eNodeType == FX_XMLNODE_Element) { + if (ch == L'>' || ch == L'/') { + m_syntaxParserState = FDE_XmlSyntaxState::BreakElement; + break; + } + } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + if (ch == L'?') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; + m_Start++; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + } + break; + } + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + if (ch != '=' && !IsXMLWhiteSpace(ch)) { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + break; + } + } + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign; + syntaxParserResult = FX_XmlSyntaxResult::AttriName; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriEqualSign: + if (IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (ch != L'=') { + if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + break; + } + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation; + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriQuotation: + if (IsXMLWhiteSpace(ch)) { + m_Start++; + break; + } + if (ch != L'\"' && ch != L'\'') { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } else { + m_wQuotationMark = ch; + m_syntaxParserState = FDE_XmlSyntaxState::AttriValue; + m_Start++; + } + break; + case FDE_XmlSyntaxState::AttriValue: + if (ch == m_wQuotationMark) { + if (m_iEntityStart > -1) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + m_syntaxParserState = FDE_XmlSyntaxState::AttriName; + syntaxParserResult = FX_XmlSyntaxResult::AttriValue; + } else { + ParseTextChar(ch); + } + break; + case FDE_XmlSyntaxState::CloseInstruction: + if (ch != L'>') { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_syntaxParserState = FDE_XmlSyntaxState::TargetData; + } else if (!m_BlockBuffer.IsEmpty()) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + } else { + m_Start++; + if (m_XMLNodeStack.empty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_XMLNodeStack.pop(); + if (!m_XMLNodeStack.empty()) { + m_CurNode = m_XMLNodeStack.top(); + } else { + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + } + m_iCurrentNodeNum = m_CurNode.iNodeNum; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = FX_XmlSyntaxResult::InstructionClose; + } + break; + case FDE_XmlSyntaxState::BreakElement: + if (ch == L'>') { + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = FX_XmlSyntaxResult::ElementBreak; + } else if (ch == L'/') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; + } else { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_Start++; + break; + case FDE_XmlSyntaxState::CloseElement: + if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { + if (ch == L'>') { + if (m_XMLNodeStack.empty()) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + m_XMLNodeStack.pop(); + if (!m_XMLNodeStack.empty()) { + m_CurNode = m_XMLNodeStack.top(); + } else { + m_CurNode.iNodeNum = -1; + m_CurNode.eNodeType = FX_XMLNODE_Unknown; + } + m_iCurrentNodeNum = m_CurNode.iNodeNum; + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + syntaxParserResult = FX_XmlSyntaxResult::ElementClose; + } else if (!IsXMLWhiteSpace(ch)) { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + } + m_Start++; + break; + case FDE_XmlSyntaxState::SkipCommentOrDecl: + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) { + m_Start += 2; + m_syntaxParserState = FDE_XmlSyntaxState::SkipComment; + } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) == + 0) { + m_Start += 7; + m_syntaxParserState = FDE_XmlSyntaxState::SkipCData; + } else { + m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode; + m_SkipChar = L'>'; + m_SkipStack.push(L'>'); + } + break; + case FDE_XmlSyntaxState::SkipCData: { + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) { + m_Start += 3; + syntaxParserResult = FX_XmlSyntaxResult::CData; + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) + return FX_XmlSyntaxResult::Error; + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + } + case FDE_XmlSyntaxState::SkipDeclNode: + if (m_SkipChar == L'\'' || m_SkipChar == L'\"') { + m_Start++; + if (ch != m_SkipChar) + break; + + m_SkipStack.pop(); + if (m_SkipStack.empty()) + m_syntaxParserState = FDE_XmlSyntaxState::Text; + else + m_SkipChar = m_SkipStack.top(); + } else { + switch (ch) { + case L'<': + m_SkipChar = L'>'; + m_SkipStack.push(L'>'); + break; + case L'[': + m_SkipChar = L']'; + m_SkipStack.push(L']'); + break; + case L'(': + m_SkipChar = L')'; + m_SkipStack.push(L')'); + break; + case L'\'': + m_SkipChar = L'\''; + m_SkipStack.push(L'\''); + break; + case L'\"': + m_SkipChar = L'\"'; + m_SkipStack.push(L'\"'); + break; + default: + if (ch == m_SkipChar) { + m_SkipStack.pop(); + if (m_SkipStack.empty()) { + if (m_BlockBuffer.GetDataLength() >= 9) + (void)m_BlockBuffer.GetTextData(0, 7); + + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } else { + m_SkipChar = m_SkipStack.top(); + } + } + break; + } + if (!m_SkipStack.empty()) { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + } + m_Start++; + } + break; + case FDE_XmlSyntaxState::SkipComment: + if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) { + m_Start += 2; + m_syntaxParserState = FDE_XmlSyntaxState::Text; + } + + m_Start++; + break; + case FDE_XmlSyntaxState::TargetData: + if (IsXMLWhiteSpace(ch)) { + if (m_BlockBuffer.IsEmpty()) { + m_Start++; + break; + } + if (m_wQuotationMark == 0) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + break; + } + } + if (ch == '?') { + m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; + m_Start++; + } else if (ch == '\"') { + if (m_wQuotationMark == 0) { + m_wQuotationMark = ch; + m_Start++; + } else if (ch == m_wQuotationMark) { + m_iTextDataLength = m_BlockBuffer.GetDataLength(); + m_wQuotationMark = 0; + m_BlockBuffer.Reset(true); + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_Start++; + syntaxParserResult = FX_XmlSyntaxResult::TargetData; + } else { + m_syntaxParserResult = FX_XmlSyntaxResult::Error; + return m_syntaxParserResult; + } + } else { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) { + return FX_XmlSyntaxResult::Error; + } + } + m_pCurrentBlock[m_iIndexInBlock++] = ch; + m_BlockBuffer.IncrementDataLength(); + m_Start++; + } + break; + default: + break; + } + if (syntaxParserResult != FX_XmlSyntaxResult::None) + return syntaxParserResult; + } + } + return FX_XmlSyntaxResult::Text; +} + +int32_t CFX_XMLParser::GetStatus() const { + if (!m_pStream) + return -1; + + int32_t iStreamLength = m_pStream->GetLength(); + if (iStreamLength < 1) + return 100; + + if (m_syntaxParserResult == FX_XmlSyntaxResult::Error) + return -1; + + if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) + return 100; + return m_iParsedBytes * 100 / iStreamLength; +} + +FX_FILESIZE CFX_XMLParser::GetCurrentBinaryPos() const { + if (!m_pStream) + return 0; + + int32_t nDstLen = GetUTF8EncodeLength(m_Buffer, m_Start); + return m_iParsedBytes + nDstLen; +} + +void CFX_XMLParser::ParseTextChar(wchar_t character) { + if (m_iIndexInBlock == m_iAllocStep) { + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + if (!m_pCurrentBlock) + return; + } + + m_pCurrentBlock[m_iIndexInBlock++] = character; + m_BlockBuffer.IncrementDataLength(); + if (m_iEntityStart > -1 && character == L';') { + WideString csEntity = m_BlockBuffer.GetTextData( + m_iEntityStart + 1, + m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1); + int32_t iLen = csEntity.GetLength(); + if (iLen > 0) { + if (csEntity[0] == L'#') { + uint32_t ch = 0; + wchar_t w; + if (iLen > 1 && csEntity[1] == L'x') { + for (int32_t i = 2; i < iLen; i++) { + w = csEntity[i]; + if (std::iswdigit(w)) + ch = (ch << 4) + w - L'0'; + else if (w >= L'A' && w <= L'F') + ch = (ch << 4) + w - 55; + else if (w >= L'a' && w <= L'f') + ch = (ch << 4) + w - 87; + else + break; + } + } else { + for (int32_t i = 1; i < iLen; i++) { + w = csEntity[i]; + if (!std::iswdigit(w)) + break; + ch = ch * 10 + w - L'0'; + } + } + if (ch > kMaxCharRange) + ch = ' '; + + character = static_cast<wchar_t>(ch); + if (character != 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, character); + m_iEntityStart++; + } + } else { + if (csEntity.Compare(L"amp") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'&'); + m_iEntityStart++; + } else if (csEntity.Compare(L"lt") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'<'); + m_iEntityStart++; + } else if (csEntity.Compare(L"gt") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'>'); + m_iEntityStart++; + } else if (csEntity.Compare(L"apos") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'\''); + m_iEntityStart++; + } else if (csEntity.Compare(L"quot") == 0) { + m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"'); + m_iEntityStart++; + } + } + } + if (m_iEntityStart >= 0 && + m_BlockBuffer.GetDataLength() > static_cast<size_t>(m_iEntityStart)) { + m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() - + m_iEntityStart); + } + std::tie(m_pCurrentBlock, m_iIndexInBlock) = + m_BlockBuffer.GetAvailableBlock(); + m_iEntityStart = -1; + } else if (m_iEntityStart < 0 && character == L'&') { + m_iEntityStart = m_BlockBuffer.GetDataLength() - 1; + } + m_Start++; } diff --git a/core/fxcrt/xml/cfx_xmlparser.h b/core/fxcrt/xml/cfx_xmlparser.h index 8c4c354699..c7587e9151 100644 --- a/core/fxcrt/xml/cfx_xmlparser.h +++ b/core/fxcrt/xml/cfx_xmlparser.h @@ -9,29 +9,135 @@ #include <memory> #include <stack> +#include <vector> +#include "core/fxcrt/cfx_blockbuffer.h" +#include "core/fxcrt/cfx_seekablestreamproxy.h" #include "core/fxcrt/fx_string.h" #include "core/fxcrt/retain_ptr.h" -#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" class CFX_XMLElement; class CFX_XMLNode; class CFX_SeekableStreamProxy; +enum class FX_XmlSyntaxResult { + None, + InstructionOpen, + InstructionClose, + ElementOpen, + ElementBreak, + ElementClose, + TargetName, + TagName, + AttriName, + AttriValue, + Text, + CData, + TargetData, + Error, + EndOfString +}; + class CFX_XMLParser { public: + static bool IsXMLNameChar(wchar_t ch, bool bFirstChar); + CFX_XMLParser(CFX_XMLNode* pParent, const RetainPtr<CFX_SeekableStreamProxy>& pStream); - ~CFX_XMLParser(); + virtual ~CFX_XMLParser(); int32_t Parse(); + protected: + FX_XmlSyntaxResult DoSyntaxParse(); + + WideString GetTagName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + WideString GetAttributeName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + WideString GetAttributeValue() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + WideString GetTextData() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + private: - std::unique_ptr<CFX_XMLSyntaxParser> m_pParser; + enum class FDE_XmlSyntaxState { + Text, + Node, + Target, + Tag, + AttriName, + AttriEqualSign, + AttriQuotation, + AttriValue, + Entity, + EntityDecimal, + EntityHex, + CloseInstruction, + BreakElement, + CloseElement, + SkipDeclNode, + DeclCharData, + SkipComment, + SkipCommentOrDecl, + SkipCData, + TargetData + }; + + void ParseTextChar(wchar_t ch); + + int32_t GetStatus() const; + FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; } + FX_FILESIZE GetCurrentBinaryPos() const; + int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; } + int32_t GetLastNodeNumber() const { return m_iLastNodeNum; } + + WideString GetTargetName() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + + WideString GetTargetData() const { + return m_BlockBuffer.GetTextData(0, m_iTextDataLength); + } + CFX_XMLNode* m_pParent; CFX_XMLNode* m_pChild; std::stack<CFX_XMLNode*> m_NodeStack; WideString m_ws1; + + RetainPtr<CFX_SeekableStreamProxy> m_pStream; + size_t m_iXMLPlaneSize; + FX_FILESIZE m_iCurrentPos; + int32_t m_iCurrentNodeNum; + int32_t m_iLastNodeNum; + int32_t m_iParsedBytes; + FX_FILESIZE m_ParsedChars; + std::vector<wchar_t> m_Buffer; + size_t m_iBufferChars; + bool m_bEOS; + FX_FILESIZE m_Start; // Start position in m_Buffer + FX_FILESIZE m_End; // End position in m_Buffer + FX_XMLNODE m_CurNode; + std::stack<FX_XMLNODE> m_XMLNodeStack; + CFX_BlockBuffer m_BlockBuffer; + int32_t m_iAllocStep; + wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer + int32_t m_iIndexInBlock; + int32_t m_iTextDataLength; + FX_XmlSyntaxResult m_syntaxParserResult; + FDE_XmlSyntaxState m_syntaxParserState; + wchar_t m_wQuotationMark; + int32_t m_iEntityStart; + std::stack<wchar_t> m_SkipStack; + wchar_t m_SkipChar; }; #endif // CORE_FXCRT_XML_CFX_XMLPARSER_H_ diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp b/core/fxcrt/xml/cfx_xmlparser_unittest.cpp index b7f7c416b1..d22925f797 100644 --- a/core/fxcrt/xml/cfx_xmlsyntaxparser_unittest.cpp +++ b/core/fxcrt/xml/cfx_xmlparser_unittest.cpp @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" +#include "core/fxcrt/xml/cfx_xmlparser.h" #include <memory> @@ -11,7 +11,30 @@ #include "testing/gtest/include/gtest/gtest.h" #include "testing/test_support.h" -TEST(CFX_XMLSyntaxParserTest, CData) { +class CFX_XMLTestParser : public CFX_XMLParser { + public: + CFX_XMLTestParser(CFX_XMLNode* pParent, + const RetainPtr<CFX_SeekableStreamProxy>& pStream) + : CFX_XMLParser(pParent, pStream) {} + + ~CFX_XMLTestParser() override = default; + + FX_XmlSyntaxResult DoSyntaxParse() { return CFX_XMLParser::DoSyntaxParse(); } + + WideString GetTagName() const { return CFX_XMLParser::GetTagName(); } + + WideString GetAttributeName() const { + return CFX_XMLParser::GetAttributeName(); + } + + WideString GetAttributeValue() const { + return CFX_XMLParser::GetAttributeValue(); + } + + WideString GetTextData() const { return CFX_XMLParser::GetTextData(); } +}; + +TEST(CFX_XMLParserTest, CData) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![CDATA[\n" @@ -31,7 +54,8 @@ TEST(CFX_XMLSyntaxParserTest, CData) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -57,7 +81,7 @@ TEST(CFX_XMLSyntaxParserTest, CData) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) { +TEST(CFX_XMLParserTest, CDataWithInnerScript) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![CDATA[\n" @@ -79,7 +103,8 @@ TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -105,7 +130,7 @@ TEST(CFX_XMLSyntaxParserTest, CDataWithInnerScript) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) { +TEST(CFX_XMLParserTest, ArrowBangArrow) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!>\n" @@ -116,7 +141,8 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); @@ -140,7 +166,7 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangArrow) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) { +TEST(CFX_XMLParserTest, ArrowBangBracketArrow) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![>\n" @@ -151,7 +177,8 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -170,7 +197,7 @@ TEST(CFX_XMLSyntaxParserTest, ArrowBangBracketArrow) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, IncompleteCData) { +TEST(CFX_XMLParserTest, IncompleteCData) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![CDATA>\n" @@ -181,7 +208,8 @@ TEST(CFX_XMLSyntaxParserTest, IncompleteCData) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -200,7 +228,7 @@ TEST(CFX_XMLSyntaxParserTest, IncompleteCData) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, UnClosedCData) { +TEST(CFX_XMLParserTest, UnClosedCData) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![CDATA[\n" @@ -211,7 +239,8 @@ TEST(CFX_XMLSyntaxParserTest, UnClosedCData) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -230,7 +259,7 @@ TEST(CFX_XMLSyntaxParserTest, UnClosedCData) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, EmptyCData) { +TEST(CFX_XMLParserTest, EmptyCData) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <![CDATA[]]>\n" @@ -241,7 +270,8 @@ TEST(CFX_XMLSyntaxParserTest, EmptyCData) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -267,7 +297,7 @@ TEST(CFX_XMLSyntaxParserTest, EmptyCData) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, Comment) { +TEST(CFX_XMLParserTest, Comment) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!-- A Comment -->\n" @@ -278,7 +308,8 @@ TEST(CFX_XMLSyntaxParserTest, Comment) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -301,7 +332,7 @@ TEST(CFX_XMLSyntaxParserTest, Comment) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) { +TEST(CFX_XMLParserTest, IncorrectCommentStart) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!- A Comment -->\n" @@ -312,7 +343,8 @@ TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -335,7 +367,7 @@ TEST(CFX_XMLSyntaxParserTest, IncorrectCommentStart) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, CommentEmpty) { +TEST(CFX_XMLParserTest, CommentEmpty) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!---->\n" @@ -346,7 +378,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentEmpty) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -369,7 +402,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentEmpty) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) { +TEST(CFX_XMLParserTest, CommentThreeDash) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!--->\n" @@ -380,7 +413,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -397,7 +431,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentThreeDash) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) { +TEST(CFX_XMLParserTest, CommentTwoDash) { const char* input = "<script contentType=\"application/x-javascript\">\n" " <!-->\n" @@ -408,7 +442,8 @@ TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -425,7 +460,7 @@ TEST(CFX_XMLSyntaxParserTest, CommentTwoDash) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, Entities) { +TEST(CFX_XMLParserTest, Entities) { const char* input = "<script contentType=\"application/x-javascript\">" "B" @@ -440,7 +475,8 @@ TEST(CFX_XMLSyntaxParserTest, Entities) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -460,7 +496,7 @@ TEST(CFX_XMLSyntaxParserTest, Entities) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) { +TEST(CFX_XMLParserTest, EntityOverflowHex) { const char* input = "<script contentType=\"application/x-javascript\">" "�" @@ -472,7 +508,8 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -492,7 +529,7 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowHex) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) { +TEST(CFX_XMLParserTest, EntityOverflowDecimal) { const char* input = "<script contentType=\"application/x-javascript\">" "�" @@ -504,7 +541,8 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) { reinterpret_cast<uint8_t*>(const_cast<char*>(input)), strlen(input)); stream->SetCodePage(FX_CODEPAGE_UTF8); - CFX_XMLSyntaxParser parser(stream); + auto root = pdfium::MakeUnique<CFX_XMLNode>(); + CFX_XMLTestParser parser(root.get(), stream); ASSERT_EQ(FX_XmlSyntaxResult::ElementOpen, parser.DoSyntaxParse()); ASSERT_EQ(FX_XmlSyntaxResult::TagName, parser.DoSyntaxParse()); ASSERT_EQ(L"script", parser.GetTagName()); @@ -524,19 +562,19 @@ TEST(CFX_XMLSyntaxParserTest, EntityOverflowDecimal) { ASSERT_EQ(FX_XmlSyntaxResult::EndOfString, parser.DoSyntaxParse()); } -TEST(CFX_XMLSyntaxParserTest, IsXMLNameChar) { - EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(L'-', true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(L'-', false)); - - EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2069, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2070, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2073, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0x218F, true)); - EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0x2190, true)); - - EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDEF, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDF0, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFDF1, true)); - EXPECT_TRUE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFFFD, true)); - EXPECT_FALSE(CFX_XMLSyntaxParser::IsXMLNameChar(0xFFFE, true)); +TEST(CFX_XMLParserTest, IsXMLNameChar) { + EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(L'-', true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(L'-', false)); + + EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0x2069, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x2070, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x2073, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0x218F, true)); + EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0x2190, true)); + + EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0xFDEF, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFDF0, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFDF1, true)); + EXPECT_TRUE(CFX_XMLTestParser::IsXMLNameChar(0xFFFD, true)); + EXPECT_FALSE(CFX_XMLTestParser::IsXMLNameChar(0xFFFE, true)); } diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp b/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp deleted file mode 100644 index 1fb51b7d52..0000000000 --- a/core/fxcrt/xml/cfx_xmlsyntaxparser.cpp +++ /dev/null @@ -1,694 +0,0 @@ -// Copyright 2017 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#include "core/fxcrt/xml/cfx_xmlsyntaxparser.h" - -#include <algorithm> -#include <cwctype> -#include <iterator> - -#include "core/fxcrt/fx_extension.h" -#include "core/fxcrt/fx_safe_types.h" - -namespace { - -const uint32_t kMaxCharRange = 0x10ffff; - -bool IsXMLWhiteSpace(wchar_t ch) { - return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09; -} - -struct FX_XMLNAMECHAR { - uint16_t wStart; - uint16_t wEnd; - bool bStartChar; -}; - -const FX_XMLNAMECHAR g_XMLNameChars[] = { - {L'-', L'.', false}, {L'0', L'9', false}, {L':', L':', false}, - {L'A', L'Z', true}, {L'_', L'_', true}, {L'a', L'z', true}, - {0xB7, 0xB7, false}, {0xC0, 0xD6, true}, {0xD8, 0xF6, true}, - {0xF8, 0x02FF, true}, {0x0300, 0x036F, false}, {0x0370, 0x037D, true}, - {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true}, {0x203F, 0x2040, false}, - {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true}, {0x3001, 0xD7FF, true}, - {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true}, -}; - - -int32_t GetUTF8EncodeLength(const std::vector<wchar_t>& src, - FX_FILESIZE iSrcLen) { - uint32_t unicode = 0; - int32_t iDstNum = 0; - const wchar_t* pSrc = src.data(); - while (iSrcLen-- > 0) { - unicode = *pSrc++; - int nbytes = 0; - if ((uint32_t)unicode < 0x80) { - nbytes = 1; - } else if ((uint32_t)unicode < 0x800) { - nbytes = 2; - } else if ((uint32_t)unicode < 0x10000) { - nbytes = 3; - } else if ((uint32_t)unicode < 0x200000) { - nbytes = 4; - } else if ((uint32_t)unicode < 0x4000000) { - nbytes = 5; - } else { - nbytes = 6; - } - iDstNum += nbytes; - } - return iDstNum; -} - -} // namespace - -// static -bool CFX_XMLSyntaxParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) { - auto* it = std::lower_bound( - std::begin(g_XMLNameChars), std::end(g_XMLNameChars), ch, - [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; }); - return it != std::end(g_XMLNameChars) && ch >= it->wStart && - (!bFirstChar || it->bStartChar); -} - -CFX_XMLSyntaxParser::CFX_XMLSyntaxParser( - const RetainPtr<CFX_SeekableStreamProxy>& pStream) - : m_pStream(pStream), - m_iXMLPlaneSize(1024), - m_iCurrentPos(0), - m_iCurrentNodeNum(-1), - m_iLastNodeNum(-1), - m_iParsedBytes(0), - m_ParsedChars(0), - m_iBufferChars(0), - m_bEOS(false), - m_Start(0), - m_End(0), - m_iAllocStep(m_BlockBuffer.GetAllocStep()), - m_pCurrentBlock(nullptr), - m_iIndexInBlock(0), - m_iTextDataLength(0), - m_syntaxParserResult(FX_XmlSyntaxResult::None), - m_syntaxParserState(FDE_XmlSyntaxState::Text), - m_wQuotationMark(0), - m_iEntityStart(-1) { - ASSERT(pStream); - - m_CurNode.iNodeNum = -1; - m_CurNode.eNodeType = FX_XMLNODE_Unknown; - - m_iXMLPlaneSize = - std::min(m_iXMLPlaneSize, - pdfium::base::checked_cast<size_t>(m_pStream->GetLength())); - m_iCurrentPos = m_pStream->GetBOMLength(); - - FX_SAFE_SIZE_T alloc_size_safe = m_iXMLPlaneSize; - alloc_size_safe += 1; // For NUL. - if (!alloc_size_safe.IsValid() || alloc_size_safe.ValueOrDie() <= 0) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return; - } - - m_Buffer.resize(pdfium::base::ValueOrDieForType<size_t>(alloc_size_safe)); - - m_BlockBuffer.InitBuffer(); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); -} - -CFX_XMLSyntaxParser::~CFX_XMLSyntaxParser() {} - -FX_XmlSyntaxResult CFX_XMLSyntaxParser::DoSyntaxParse() { - if (m_syntaxParserResult == FX_XmlSyntaxResult::Error || - m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) { - return m_syntaxParserResult; - } - - FX_FILESIZE iStreamLength = m_pStream->GetLength(); - FX_FILESIZE iPos; - - FX_XmlSyntaxResult syntaxParserResult = FX_XmlSyntaxResult::None; - while (true) { - if (m_Start >= m_End) { - if (m_bEOS || m_iCurrentPos >= iStreamLength) { - m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; - return m_syntaxParserResult; - } - m_ParsedChars += m_End; - m_iParsedBytes = m_iCurrentPos; - if (m_pStream->GetPosition() != m_iCurrentPos) - m_pStream->Seek(CFX_SeekableStreamProxy::From::Begin, m_iCurrentPos); - - m_iBufferChars = - m_pStream->ReadString(m_Buffer.data(), m_iXMLPlaneSize, &m_bEOS); - iPos = m_pStream->GetPosition(); - if (m_iBufferChars < 1) { - m_iCurrentPos = iStreamLength; - m_syntaxParserResult = FX_XmlSyntaxResult::EndOfString; - return m_syntaxParserResult; - } - m_iCurrentPos = iPos; - m_Start = 0; - m_End = m_iBufferChars; - } - - while (m_Start < m_End) { - wchar_t ch = m_Buffer[m_Start]; - switch (m_syntaxParserState) { - case FDE_XmlSyntaxState::Text: - if (ch == L'<') { - if (!m_BlockBuffer.IsEmpty()) { - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_iEntityStart = -1; - syntaxParserResult = FX_XmlSyntaxResult::Text; - } else { - m_Start++; - m_syntaxParserState = FDE_XmlSyntaxState::Node; - } - } else { - ParseTextChar(ch); - } - break; - case FDE_XmlSyntaxState::Node: - if (ch == L'!') { - m_Start++; - m_syntaxParserState = FDE_XmlSyntaxState::SkipCommentOrDecl; - } else if (ch == L'/') { - m_Start++; - m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; - } else if (ch == L'?') { - m_iLastNodeNum++; - m_iCurrentNodeNum = m_iLastNodeNum; - m_CurNode.iNodeNum = m_iLastNodeNum; - m_CurNode.eNodeType = FX_XMLNODE_Instruction; - m_XMLNodeStack.push(m_CurNode); - m_Start++; - m_syntaxParserState = FDE_XmlSyntaxState::Target; - syntaxParserResult = FX_XmlSyntaxResult::InstructionOpen; - } else { - m_iLastNodeNum++; - m_iCurrentNodeNum = m_iLastNodeNum; - m_CurNode.iNodeNum = m_iLastNodeNum; - m_CurNode.eNodeType = FX_XMLNODE_Element; - m_XMLNodeStack.push(m_CurNode); - m_syntaxParserState = FDE_XmlSyntaxState::Tag; - syntaxParserResult = FX_XmlSyntaxResult::ElementOpen; - } - break; - case FDE_XmlSyntaxState::Target: - case FDE_XmlSyntaxState::Tag: - if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { - if (m_BlockBuffer.IsEmpty()) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (m_syntaxParserState != FDE_XmlSyntaxState::Target) - syntaxParserResult = FX_XmlSyntaxResult::TagName; - else - syntaxParserResult = FX_XmlSyntaxResult::TargetName; - - m_syntaxParserState = FDE_XmlSyntaxState::AttriName; - } else { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - m_Start++; - } - break; - case FDE_XmlSyntaxState::AttriName: - if (m_BlockBuffer.IsEmpty() && IsXMLWhiteSpace(ch)) { - m_Start++; - break; - } - if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { - if (m_BlockBuffer.IsEmpty()) { - if (m_CurNode.eNodeType == FX_XMLNODE_Element) { - if (ch == L'>' || ch == L'/') { - m_syntaxParserState = FDE_XmlSyntaxState::BreakElement; - break; - } - } else if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { - if (ch == L'?') { - m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; - m_Start++; - } else { - m_syntaxParserState = FDE_XmlSyntaxState::TargetData; - } - break; - } - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } else { - if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { - if (ch != '=' && !IsXMLWhiteSpace(ch)) { - m_syntaxParserState = FDE_XmlSyntaxState::TargetData; - break; - } - } - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_syntaxParserState = FDE_XmlSyntaxState::AttriEqualSign; - syntaxParserResult = FX_XmlSyntaxResult::AttriName; - } - } else { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - m_Start++; - } - break; - case FDE_XmlSyntaxState::AttriEqualSign: - if (IsXMLWhiteSpace(ch)) { - m_Start++; - break; - } - if (ch != L'=') { - if (m_CurNode.eNodeType == FX_XMLNODE_Instruction) { - m_syntaxParserState = FDE_XmlSyntaxState::TargetData; - break; - } - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } else { - m_syntaxParserState = FDE_XmlSyntaxState::AttriQuotation; - m_Start++; - } - break; - case FDE_XmlSyntaxState::AttriQuotation: - if (IsXMLWhiteSpace(ch)) { - m_Start++; - break; - } - if (ch != L'\"' && ch != L'\'') { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } else { - m_wQuotationMark = ch; - m_syntaxParserState = FDE_XmlSyntaxState::AttriValue; - m_Start++; - } - break; - case FDE_XmlSyntaxState::AttriValue: - if (ch == m_wQuotationMark) { - if (m_iEntityStart > -1) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_wQuotationMark = 0; - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_Start++; - m_syntaxParserState = FDE_XmlSyntaxState::AttriName; - syntaxParserResult = FX_XmlSyntaxResult::AttriValue; - } else { - ParseTextChar(ch); - } - break; - case FDE_XmlSyntaxState::CloseInstruction: - if (ch != L'>') { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - m_syntaxParserState = FDE_XmlSyntaxState::TargetData; - } else if (!m_BlockBuffer.IsEmpty()) { - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - syntaxParserResult = FX_XmlSyntaxResult::TargetData; - } else { - m_Start++; - if (m_XMLNodeStack.empty()) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - m_XMLNodeStack.pop(); - if (!m_XMLNodeStack.empty()) { - m_CurNode = m_XMLNodeStack.top(); - } else { - m_CurNode.iNodeNum = -1; - m_CurNode.eNodeType = FX_XMLNODE_Unknown; - } - m_iCurrentNodeNum = m_CurNode.iNodeNum; - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_syntaxParserState = FDE_XmlSyntaxState::Text; - syntaxParserResult = FX_XmlSyntaxResult::InstructionClose; - } - break; - case FDE_XmlSyntaxState::BreakElement: - if (ch == L'>') { - m_syntaxParserState = FDE_XmlSyntaxState::Text; - syntaxParserResult = FX_XmlSyntaxResult::ElementBreak; - } else if (ch == L'/') { - m_syntaxParserState = FDE_XmlSyntaxState::CloseElement; - } else { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - m_Start++; - break; - case FDE_XmlSyntaxState::CloseElement: - if (!IsXMLNameChar(ch, m_BlockBuffer.IsEmpty())) { - if (ch == L'>') { - if (m_XMLNodeStack.empty()) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - m_XMLNodeStack.pop(); - if (!m_XMLNodeStack.empty()) { - m_CurNode = m_XMLNodeStack.top(); - } else { - m_CurNode.iNodeNum = -1; - m_CurNode.eNodeType = FX_XMLNODE_Unknown; - } - m_iCurrentNodeNum = m_CurNode.iNodeNum; - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_syntaxParserState = FDE_XmlSyntaxState::Text; - syntaxParserResult = FX_XmlSyntaxResult::ElementClose; - } else if (!IsXMLWhiteSpace(ch)) { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - } else { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - } - m_Start++; - break; - case FDE_XmlSyntaxState::SkipCommentOrDecl: - if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"--", 2) == 0) { - m_Start += 2; - m_syntaxParserState = FDE_XmlSyntaxState::SkipComment; - } else if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"[CDATA[", 7) == - 0) { - m_Start += 7; - m_syntaxParserState = FDE_XmlSyntaxState::SkipCData; - } else { - m_syntaxParserState = FDE_XmlSyntaxState::SkipDeclNode; - m_SkipChar = L'>'; - m_SkipStack.push(L'>'); - } - break; - case FDE_XmlSyntaxState::SkipCData: { - if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"]]>", 3) == 0) { - m_Start += 3; - syntaxParserResult = FX_XmlSyntaxResult::CData; - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_syntaxParserState = FDE_XmlSyntaxState::Text; - } else { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) - return FX_XmlSyntaxResult::Error; - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - m_Start++; - } - break; - } - case FDE_XmlSyntaxState::SkipDeclNode: - if (m_SkipChar == L'\'' || m_SkipChar == L'\"') { - m_Start++; - if (ch != m_SkipChar) - break; - - m_SkipStack.pop(); - if (m_SkipStack.empty()) - m_syntaxParserState = FDE_XmlSyntaxState::Text; - else - m_SkipChar = m_SkipStack.top(); - } else { - switch (ch) { - case L'<': - m_SkipChar = L'>'; - m_SkipStack.push(L'>'); - break; - case L'[': - m_SkipChar = L']'; - m_SkipStack.push(L']'); - break; - case L'(': - m_SkipChar = L')'; - m_SkipStack.push(L')'); - break; - case L'\'': - m_SkipChar = L'\''; - m_SkipStack.push(L'\''); - break; - case L'\"': - m_SkipChar = L'\"'; - m_SkipStack.push(L'\"'); - break; - default: - if (ch == m_SkipChar) { - m_SkipStack.pop(); - if (m_SkipStack.empty()) { - if (m_BlockBuffer.GetDataLength() >= 9) - (void)m_BlockBuffer.GetTextData(0, 7); - - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_syntaxParserState = FDE_XmlSyntaxState::Text; - } else { - m_SkipChar = m_SkipStack.top(); - } - } - break; - } - if (!m_SkipStack.empty()) { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - } - m_Start++; - } - break; - case FDE_XmlSyntaxState::SkipComment: - if (FXSYS_wcsnicmp(m_Buffer.data() + m_Start, L"-->", 3) == 0) { - m_Start += 2; - m_syntaxParserState = FDE_XmlSyntaxState::Text; - } - - m_Start++; - break; - case FDE_XmlSyntaxState::TargetData: - if (IsXMLWhiteSpace(ch)) { - if (m_BlockBuffer.IsEmpty()) { - m_Start++; - break; - } - if (m_wQuotationMark == 0) { - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_wQuotationMark = 0; - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_Start++; - syntaxParserResult = FX_XmlSyntaxResult::TargetData; - break; - } - } - if (ch == '?') { - m_syntaxParserState = FDE_XmlSyntaxState::CloseInstruction; - m_Start++; - } else if (ch == '\"') { - if (m_wQuotationMark == 0) { - m_wQuotationMark = ch; - m_Start++; - } else if (ch == m_wQuotationMark) { - m_iTextDataLength = m_BlockBuffer.GetDataLength(); - m_wQuotationMark = 0; - m_BlockBuffer.Reset(true); - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_Start++; - syntaxParserResult = FX_XmlSyntaxResult::TargetData; - } else { - m_syntaxParserResult = FX_XmlSyntaxResult::Error; - return m_syntaxParserResult; - } - } else { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) { - return FX_XmlSyntaxResult::Error; - } - } - m_pCurrentBlock[m_iIndexInBlock++] = ch; - m_BlockBuffer.IncrementDataLength(); - m_Start++; - } - break; - default: - break; - } - if (syntaxParserResult != FX_XmlSyntaxResult::None) - return syntaxParserResult; - } - } - return FX_XmlSyntaxResult::Text; -} - -int32_t CFX_XMLSyntaxParser::GetStatus() const { - if (!m_pStream) - return -1; - - int32_t iStreamLength = m_pStream->GetLength(); - if (iStreamLength < 1) - return 100; - - if (m_syntaxParserResult == FX_XmlSyntaxResult::Error) - return -1; - - if (m_syntaxParserResult == FX_XmlSyntaxResult::EndOfString) - return 100; - return m_iParsedBytes * 100 / iStreamLength; -} - -FX_FILESIZE CFX_XMLSyntaxParser::GetCurrentBinaryPos() const { - if (!m_pStream) - return 0; - - int32_t nDstLen = GetUTF8EncodeLength(m_Buffer, m_Start); - return m_iParsedBytes + nDstLen; -} - -void CFX_XMLSyntaxParser::ParseTextChar(wchar_t character) { - if (m_iIndexInBlock == m_iAllocStep) { - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - if (!m_pCurrentBlock) - return; - } - - m_pCurrentBlock[m_iIndexInBlock++] = character; - m_BlockBuffer.IncrementDataLength(); - if (m_iEntityStart > -1 && character == L';') { - WideString csEntity = m_BlockBuffer.GetTextData( - m_iEntityStart + 1, - m_BlockBuffer.GetDataLength() - 1 - m_iEntityStart - 1); - int32_t iLen = csEntity.GetLength(); - if (iLen > 0) { - if (csEntity[0] == L'#') { - uint32_t ch = 0; - wchar_t w; - if (iLen > 1 && csEntity[1] == L'x') { - for (int32_t i = 2; i < iLen; i++) { - w = csEntity[i]; - if (std::iswdigit(w)) - ch = (ch << 4) + w - L'0'; - else if (w >= L'A' && w <= L'F') - ch = (ch << 4) + w - 55; - else if (w >= L'a' && w <= L'f') - ch = (ch << 4) + w - 87; - else - break; - } - } else { - for (int32_t i = 1; i < iLen; i++) { - w = csEntity[i]; - if (!std::iswdigit(w)) - break; - ch = ch * 10 + w - L'0'; - } - } - if (ch > kMaxCharRange) - ch = ' '; - - character = static_cast<wchar_t>(ch); - if (character != 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, character); - m_iEntityStart++; - } - } else { - if (csEntity.Compare(L"amp") == 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, L'&'); - m_iEntityStart++; - } else if (csEntity.Compare(L"lt") == 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, L'<'); - m_iEntityStart++; - } else if (csEntity.Compare(L"gt") == 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, L'>'); - m_iEntityStart++; - } else if (csEntity.Compare(L"apos") == 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, L'\''); - m_iEntityStart++; - } else if (csEntity.Compare(L"quot") == 0) { - m_BlockBuffer.SetTextChar(m_iEntityStart, L'\"'); - m_iEntityStart++; - } - } - } - if (m_iEntityStart >= 0 && - m_BlockBuffer.GetDataLength() > static_cast<size_t>(m_iEntityStart)) { - m_BlockBuffer.DeleteTextChars(m_BlockBuffer.GetDataLength() - - m_iEntityStart); - } - std::tie(m_pCurrentBlock, m_iIndexInBlock) = - m_BlockBuffer.GetAvailableBlock(); - m_iEntityStart = -1; - } else if (m_iEntityStart < 0 && character == L'&') { - m_iEntityStart = m_BlockBuffer.GetDataLength() - 1; - } - m_Start++; -} diff --git a/core/fxcrt/xml/cfx_xmlsyntaxparser.h b/core/fxcrt/xml/cfx_xmlsyntaxparser.h deleted file mode 100644 index b93bbb6801..0000000000 --- a/core/fxcrt/xml/cfx_xmlsyntaxparser.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2017 PDFium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com - -#ifndef CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ -#define CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ - -#include <stack> -#include <vector> - -#include "core/fxcrt/cfx_blockbuffer.h" -#include "core/fxcrt/cfx_seekablestreamproxy.h" -#include "core/fxcrt/fx_string.h" -#include "core/fxcrt/retain_ptr.h" -#include "core/fxcrt/xml/cfx_xmlnode.h" - -enum class FX_XmlSyntaxResult { - None, - InstructionOpen, - InstructionClose, - ElementOpen, - ElementBreak, - ElementClose, - TargetName, - TagName, - AttriName, - AttriValue, - Text, - CData, - TargetData, - Error, - EndOfString -}; - -class CFX_XMLSyntaxParser { - public: - static bool IsXMLNameChar(wchar_t ch, bool bFirstChar); - - explicit CFX_XMLSyntaxParser( - const RetainPtr<CFX_SeekableStreamProxy>& pStream); - ~CFX_XMLSyntaxParser(); - - FX_XmlSyntaxResult DoSyntaxParse(); - - int32_t GetStatus() const; - FX_FILESIZE GetCurrentPos() const { return m_ParsedChars + m_Start; } - FX_FILESIZE GetCurrentBinaryPos() const; - int32_t GetCurrentNodeNumber() const { return m_iCurrentNodeNum; } - int32_t GetLastNodeNumber() const { return m_iLastNodeNum; } - - WideString GetTargetName() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - WideString GetTagName() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - WideString GetAttributeName() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - WideString GetAttributeValue() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - WideString GetTextData() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - WideString GetTargetData() const { - return m_BlockBuffer.GetTextData(0, m_iTextDataLength); - } - - protected: - enum class FDE_XmlSyntaxState { - Text, - Node, - Target, - Tag, - AttriName, - AttriEqualSign, - AttriQuotation, - AttriValue, - Entity, - EntityDecimal, - EntityHex, - CloseInstruction, - BreakElement, - CloseElement, - SkipDeclNode, - DeclCharData, - SkipComment, - SkipCommentOrDecl, - SkipCData, - TargetData - }; - - void ParseTextChar(wchar_t ch); - - RetainPtr<CFX_SeekableStreamProxy> m_pStream; - size_t m_iXMLPlaneSize; - FX_FILESIZE m_iCurrentPos; - int32_t m_iCurrentNodeNum; - int32_t m_iLastNodeNum; - int32_t m_iParsedBytes; - FX_FILESIZE m_ParsedChars; - std::vector<wchar_t> m_Buffer; - size_t m_iBufferChars; - bool m_bEOS; - FX_FILESIZE m_Start; // Start position in m_Buffer - FX_FILESIZE m_End; // End position in m_Buffer - FX_XMLNODE m_CurNode; - std::stack<FX_XMLNODE> m_XMLNodeStack; - CFX_BlockBuffer m_BlockBuffer; - int32_t m_iAllocStep; - wchar_t* m_pCurrentBlock; // Pointer into CFX_BlockBuffer - int32_t m_iIndexInBlock; - int32_t m_iTextDataLength; - FX_XmlSyntaxResult m_syntaxParserResult; - FDE_XmlSyntaxState m_syntaxParserState; - wchar_t m_wQuotationMark; - int32_t m_iEntityStart; - std::stack<wchar_t> m_SkipStack; - wchar_t m_SkipChar; -}; - -#endif // CORE_FXCRT_XML_CFX_XMLSYNTAXPARSER_H_ |