diff options
Diffstat (limited to 'xfa/fxfa/cxfa_textparser.cpp')
-rw-r--r-- | xfa/fxfa/cxfa_textparser.cpp | 625 |
1 files changed, 625 insertions, 0 deletions
diff --git a/xfa/fxfa/cxfa_textparser.cpp b/xfa/fxfa/cxfa_textparser.cpp new file mode 100644 index 0000000000..2bc0096962 --- /dev/null +++ b/xfa/fxfa/cxfa_textparser.cpp @@ -0,0 +1,625 @@ +// Copyright 2017 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "xfa/fxfa/cxfa_textparser.h" + +#include <algorithm> +#include <utility> +#include <vector> + +#include "core/fxcrt/fx_codepage.h" +#include "core/fxcrt/xml/cfx_xmlelement.h" +#include "core/fxcrt/xml/cfx_xmlnode.h" +#include "third_party/base/ptr_util.h" +#include "xfa/fde/css/cfde_csscomputedstyle.h" +#include "xfa/fde/css/cfde_cssstyleselector.h" +#include "xfa/fde/css/cfde_cssstylesheet.h" +#include "xfa/fde/css/fde_css.h" +#include "xfa/fgas/font/cfgas_fontmgr.h" +#include "xfa/fxfa/cxfa_csstagprovider.h" +#include "xfa/fxfa/cxfa_ffapp.h" +#include "xfa/fxfa/cxfa_ffdoc.h" +#include "xfa/fxfa/cxfa_fontmgr.h" +#include "xfa/fxfa/cxfa_textparsecontext.h" +#include "xfa/fxfa/cxfa_textprovider.h" +#include "xfa/fxfa/cxfa_texttabstopscontext.h" +#include "xfa/fxfa/parser/cxfa_measurement.h" + +namespace { + +enum class TabStopStatus { + Error, + EOS, + None, + Alignment, + StartLeader, + Leader, + Location, +}; + +} // namespace + +CXFA_TextParser::CXFA_TextParser() + : m_bParsed(false), m_cssInitialized(false) {} + +CXFA_TextParser::~CXFA_TextParser() {} + +void CXFA_TextParser::Reset() { + m_mapXMLNodeToParseContext.clear(); + m_bParsed = false; +} + +void CXFA_TextParser::InitCSSData(CXFA_TextProvider* pTextProvider) { + if (!pTextProvider) + return; + + if (!m_pSelector) { + CXFA_FFDoc* pDoc = pTextProvider->GetDocNode(); + CFGAS_FontMgr* pFontMgr = pDoc->GetApp()->GetFDEFontMgr(); + ASSERT(pFontMgr); + m_pSelector = pdfium::MakeUnique<CFDE_CSSStyleSelector>(pFontMgr); + CXFA_Font font = pTextProvider->GetFontNode(); + m_pSelector->SetDefFontSize(font ? font.GetFontSize() : 10.0f); + } + + if (m_cssInitialized) + return; + + m_cssInitialized = true; + auto uaSheet = LoadDefaultSheetStyle(); + m_pSelector->SetUAStyleSheet(std::move(uaSheet)); + m_pSelector->UpdateStyleIndex(); +} + +std::unique_ptr<CFDE_CSSStyleSheet> CXFA_TextParser::LoadDefaultSheetStyle() { + static const wchar_t s_pStyle[] = + L"html,body,ol,p,ul{display:block}" + L"li{display:list-item}" + L"ol,ul{padding-left:33px;margin:1.12em 0}" + L"ol{list-style-type:decimal}" + L"a{color:#0000ff;text-decoration:underline}" + L"b{font-weight:bolder}" + L"i{font-style:italic}" + L"sup{vertical-align:+15em;font-size:.66em}" + L"sub{vertical-align:-15em;font-size:.66em}"; + + auto sheet = pdfium::MakeUnique<CFDE_CSSStyleSheet>(); + return sheet->LoadBuffer(s_pStyle, FXSYS_wcslen(s_pStyle)) ? std::move(sheet) + : nullptr; +} + +CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::CreateRootStyle( + CXFA_TextProvider* pTextProvider) { + CXFA_Font font = pTextProvider->GetFontNode(); + CXFA_Para para = pTextProvider->GetParaNode(); + auto pStyle = m_pSelector->CreateComputedStyle(nullptr); + float fLineHeight = 0; + float fFontSize = 10; + + if (para) { + fLineHeight = para.GetLineHeight(); + FDE_CSSLength indent; + indent.Set(FDE_CSSLengthUnit::Point, para.GetTextIndent()); + pStyle->SetTextIndent(indent); + FDE_CSSTextAlign hAlign = FDE_CSSTextAlign::Left; + switch (para.GetHorizontalAlign()) { + case XFA_ATTRIBUTEENUM_Center: + hAlign = FDE_CSSTextAlign::Center; + break; + case XFA_ATTRIBUTEENUM_Right: + hAlign = FDE_CSSTextAlign::Right; + break; + case XFA_ATTRIBUTEENUM_Justify: + hAlign = FDE_CSSTextAlign::Justify; + break; + case XFA_ATTRIBUTEENUM_JustifyAll: + hAlign = FDE_CSSTextAlign::JustifyAll; + break; + } + pStyle->SetTextAlign(hAlign); + FDE_CSSRect rtMarginWidth; + rtMarginWidth.left.Set(FDE_CSSLengthUnit::Point, para.GetMarginLeft()); + rtMarginWidth.top.Set(FDE_CSSLengthUnit::Point, para.GetSpaceAbove()); + rtMarginWidth.right.Set(FDE_CSSLengthUnit::Point, para.GetMarginRight()); + rtMarginWidth.bottom.Set(FDE_CSSLengthUnit::Point, para.GetSpaceBelow()); + pStyle->SetMarginWidth(rtMarginWidth); + } + + if (font) { + pStyle->SetColor(font.GetColor()); + pStyle->SetFontStyle(font.IsItalic() ? FDE_CSSFontStyle::Italic + : FDE_CSSFontStyle::Normal); + pStyle->SetFontWeight(font.IsBold() ? FXFONT_FW_BOLD : FXFONT_FW_NORMAL); + pStyle->SetNumberVerticalAlign(-font.GetBaselineShift()); + fFontSize = font.GetFontSize(); + FDE_CSSLength letterSpacing; + letterSpacing.Set(FDE_CSSLengthUnit::Point, font.GetLetterSpacing()); + pStyle->SetLetterSpacing(letterSpacing); + uint32_t dwDecoration = 0; + if (font.GetLineThrough() > 0) + dwDecoration |= FDE_CSSTEXTDECORATION_LineThrough; + if (font.GetUnderline() > 1) + dwDecoration |= FDE_CSSTEXTDECORATION_Double; + else if (font.GetUnderline() > 0) + dwDecoration |= FDE_CSSTEXTDECORATION_Underline; + + pStyle->SetTextDecoration(dwDecoration); + } + pStyle->SetLineHeight(fLineHeight); + pStyle->SetFontSize(fFontSize); + return pStyle; +} + +CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::CreateStyle( + CFDE_CSSComputedStyle* pParentStyle) { + auto pNewStyle = m_pSelector->CreateComputedStyle(pParentStyle); + ASSERT(pNewStyle); + if (!pParentStyle) + return pNewStyle; + + uint32_t dwDecoration = pParentStyle->GetTextDecoration(); + float fBaseLine = 0; + if (pParentStyle->GetVerticalAlign() == FDE_CSSVerticalAlign::Number) + fBaseLine = pParentStyle->GetNumberVerticalAlign(); + + pNewStyle->SetTextDecoration(dwDecoration); + pNewStyle->SetNumberVerticalAlign(fBaseLine); + + const FDE_CSSRect* pRect = pParentStyle->GetMarginWidth(); + if (pRect) + pNewStyle->SetMarginWidth(*pRect); + return pNewStyle; +} + +CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::ComputeStyle( + CFX_XMLNode* pXMLNode, + CFDE_CSSComputedStyle* pParentStyle) { + auto it = m_mapXMLNodeToParseContext.find(pXMLNode); + if (it == m_mapXMLNodeToParseContext.end()) + return nullptr; + + CXFA_TextParseContext* pContext = it->second.get(); + if (!pContext) + return nullptr; + + pContext->m_pParentStyle.Reset(pParentStyle); + + auto tagProvider = ParseTagInfo(pXMLNode); + if (tagProvider->m_bContent) + return nullptr; + + auto pStyle = CreateStyle(pParentStyle); + m_pSelector->ComputeStyle(pContext->GetDecls(), + tagProvider->GetAttribute(L"style"), + tagProvider->GetAttribute(L"align"), pStyle.Get()); + return pStyle; +} + +void CXFA_TextParser::DoParse(CFX_XMLNode* pXMLContainer, + CXFA_TextProvider* pTextProvider) { + if (!pXMLContainer || !pTextProvider || m_bParsed) + return; + + m_bParsed = true; + InitCSSData(pTextProvider); + auto pRootStyle = CreateRootStyle(pTextProvider); + ParseRichText(pXMLContainer, pRootStyle.Get()); +} + +void CXFA_TextParser::ParseRichText(CFX_XMLNode* pXMLNode, + CFDE_CSSComputedStyle* pParentStyle) { + if (!pXMLNode) + return; + + auto tagProvider = ParseTagInfo(pXMLNode); + if (!tagProvider->m_bTagAvailable) + return; + + CFX_RetainPtr<CFDE_CSSComputedStyle> pNewStyle; + if ((tagProvider->GetTagName() != L"body") || + (tagProvider->GetTagName() != L"html")) { + auto pTextContext = pdfium::MakeUnique<CXFA_TextParseContext>(); + FDE_CSSDisplay eDisplay = FDE_CSSDisplay::Inline; + if (!tagProvider->m_bContent) { + auto declArray = + m_pSelector->MatchDeclarations(tagProvider->GetTagName()); + pNewStyle = CreateStyle(pParentStyle); + m_pSelector->ComputeStyle(declArray, tagProvider->GetAttribute(L"style"), + tagProvider->GetAttribute(L"align"), + pNewStyle.Get()); + + if (!declArray.empty()) + pTextContext->SetDecls(std::move(declArray)); + + eDisplay = pNewStyle->GetDisplay(); + } + pTextContext->SetDisplay(eDisplay); + m_mapXMLNodeToParseContext[pXMLNode] = std::move(pTextContext); + } + + for (CFX_XMLNode* pXMLChild = pXMLNode->GetNodeItem(CFX_XMLNode::FirstChild); + pXMLChild; + pXMLChild = pXMLChild->GetNodeItem(CFX_XMLNode::NextSibling)) { + ParseRichText(pXMLChild, pNewStyle.Get()); + } +} + +bool CXFA_TextParser::TagValidate(const CFX_WideString& wsName) const { + static const uint32_t s_XFATagName[] = { + 0x61, // a + 0x62, // b + 0x69, // i + 0x70, // p + 0x0001f714, // br + 0x00022a55, // li + 0x000239bb, // ol + 0x00025881, // ul + 0x0bd37faa, // sub + 0x0bd37fb8, // sup + 0xa73e3af2, // span + 0xb182eaae, // body + 0xdb8ac455, // html + }; + static const int32_t s_iCount = FX_ArraySize(s_XFATagName); + + return std::binary_search(s_XFATagName, s_XFATagName + s_iCount, + FX_HashCode_GetW(wsName.AsStringC(), true)); +} + +std::unique_ptr<CXFA_CSSTagProvider> CXFA_TextParser::ParseTagInfo( + CFX_XMLNode* pXMLNode) { + auto tagProvider = pdfium::MakeUnique<CXFA_CSSTagProvider>(); + + CFX_WideString wsName; + if (pXMLNode->GetType() == FX_XMLNODE_Element) { + CFX_XMLElement* pXMLElement = static_cast<CFX_XMLElement*>(pXMLNode); + wsName = pXMLElement->GetLocalTagName(); + tagProvider->SetTagName(wsName); + tagProvider->m_bTagAvailable = TagValidate(wsName); + + CFX_WideString wsValue = pXMLElement->GetString(L"style"); + if (!wsValue.IsEmpty()) + tagProvider->SetAttribute(L"style", wsValue); + } else if (pXMLNode->GetType() == FX_XMLNODE_Text) { + tagProvider->m_bTagAvailable = true; + tagProvider->m_bContent = true; + } + return tagProvider; +} + +int32_t CXFA_TextParser::GetVAlign(CXFA_TextProvider* pTextProvider) const { + CXFA_Para para = pTextProvider->GetParaNode(); + return para ? para.GetVerticalAlign() : XFA_ATTRIBUTEENUM_Top; +} + +float CXFA_TextParser::GetTabInterval(CFDE_CSSComputedStyle* pStyle) const { + CFX_WideString wsValue; + if (pStyle && pStyle->GetCustomStyle(L"tab-interval", wsValue)) + return CXFA_Measurement(wsValue.AsStringC()).ToUnit(XFA_UNIT_Pt); + return 36; +} + +int32_t CXFA_TextParser::CountTabs(CFDE_CSSComputedStyle* pStyle) const { + CFX_WideString wsValue; + if (pStyle && pStyle->GetCustomStyle(L"xfa-tab-count", wsValue)) + return wsValue.GetInteger(); + return 0; +} + +bool CXFA_TextParser::IsSpaceRun(CFDE_CSSComputedStyle* pStyle) const { + CFX_WideString wsValue; + if (pStyle && pStyle->GetCustomStyle(L"xfa-spacerun", wsValue)) { + wsValue.MakeLower(); + return wsValue == L"yes"; + } + return false; +} + +CFX_RetainPtr<CFGAS_GEFont> CXFA_TextParser::GetFont( + CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle) const { + CFX_WideStringC wsFamily = L"Courier"; + uint32_t dwStyle = 0; + CXFA_Font font = pTextProvider->GetFontNode(); + if (font) { + font.GetTypeface(wsFamily); + if (font.IsBold()) + dwStyle |= FX_FONTSTYLE_Bold; + if (font.IsItalic()) + dwStyle |= FX_FONTSTYLE_Italic; + } + + if (pStyle) { + int32_t iCount = pStyle->CountFontFamilies(); + if (iCount > 0) + wsFamily = pStyle->GetFontFamily(iCount - 1).AsStringC(); + + dwStyle = 0; + if (pStyle->GetFontWeight() > FXFONT_FW_NORMAL) + dwStyle |= FX_FONTSTYLE_Bold; + if (pStyle->GetFontStyle() == FDE_CSSFontStyle::Italic) + dwStyle |= FX_FONTSTYLE_Italic; + } + + CXFA_FFDoc* pDoc = pTextProvider->GetDocNode(); + CXFA_FontMgr* pFontMgr = pDoc->GetApp()->GetXFAFontMgr(); + return pFontMgr->GetFont(pDoc, wsFamily, dwStyle); +} + +float CXFA_TextParser::GetFontSize(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle) const { + if (pStyle) + return pStyle->GetFontSize(); + + CXFA_Font font = pTextProvider->GetFontNode(); + if (font) + return font.GetFontSize(); + return 10; +} + +int32_t CXFA_TextParser::GetHorScale(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle, + CFX_XMLNode* pXMLNode) const { + if (pStyle) { + CFX_WideString wsValue; + if (pStyle->GetCustomStyle(L"xfa-font-horizontal-scale", wsValue)) + return wsValue.GetInteger(); + + while (pXMLNode) { + auto it = m_mapXMLNodeToParseContext.find(pXMLNode); + if (it != m_mapXMLNodeToParseContext.end()) { + CXFA_TextParseContext* pContext = it->second.get(); + if (pContext && pContext->m_pParentStyle && + pContext->m_pParentStyle->GetCustomStyle( + L"xfa-font-horizontal-scale", wsValue)) { + return wsValue.GetInteger(); + } + } + pXMLNode = pXMLNode->GetNodeItem(CFX_XMLNode::Parent); + } + } + + if (CXFA_Font font = pTextProvider->GetFontNode()) + return static_cast<int32_t>(font.GetHorizontalScale()); + return 100; +} + +int32_t CXFA_TextParser::GetVerScale(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle) const { + if (pStyle) { + CFX_WideString wsValue; + if (pStyle->GetCustomStyle(L"xfa-font-vertical-scale", wsValue)) + return wsValue.GetInteger(); + } + + if (CXFA_Font font = pTextProvider->GetFontNode()) + return (int32_t)font.GetVerticalScale(); + return 100; +} + +void CXFA_TextParser::GetUnderline(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle, + int32_t& iUnderline, + int32_t& iPeriod) const { + iUnderline = 0; + iPeriod = XFA_ATTRIBUTEENUM_All; + if (!pStyle) { + CXFA_Font font = pTextProvider->GetFontNode(); + if (font) { + iUnderline = font.GetUnderline(); + iPeriod = font.GetUnderlinePeriod(); + } + return; + } + + uint32_t dwDecoration = pStyle->GetTextDecoration(); + if (dwDecoration & FDE_CSSTEXTDECORATION_Double) + iUnderline = 2; + else if (dwDecoration & FDE_CSSTEXTDECORATION_Underline) + iUnderline = 1; + + CFX_WideString wsValue; + if (pStyle->GetCustomStyle(L"underlinePeriod", wsValue)) { + if (wsValue == L"word") + iPeriod = XFA_ATTRIBUTEENUM_Word; + } else if (CXFA_Font font = pTextProvider->GetFontNode()) { + iPeriod = font.GetUnderlinePeriod(); + } +} + +void CXFA_TextParser::GetLinethrough(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle, + int32_t& iLinethrough) const { + if (pStyle) { + uint32_t dwDecoration = pStyle->GetTextDecoration(); + iLinethrough = (dwDecoration & FDE_CSSTEXTDECORATION_LineThrough) ? 1 : 0; + return; + } + + CXFA_Font font = pTextProvider->GetFontNode(); + if (font) + iLinethrough = font.GetLineThrough(); +} + +FX_ARGB CXFA_TextParser::GetColor(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle) const { + if (pStyle) + return pStyle->GetColor(); + if (CXFA_Font font = pTextProvider->GetFontNode()) + return font.GetColor(); + + return 0xFF000000; +} + +float CXFA_TextParser::GetBaseline(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle) const { + if (pStyle) { + if (pStyle->GetVerticalAlign() == FDE_CSSVerticalAlign::Number) + return pStyle->GetNumberVerticalAlign(); + } else if (CXFA_Font font = pTextProvider->GetFontNode()) { + return font.GetBaselineShift(); + } + return 0; +} + +float CXFA_TextParser::GetLineHeight(CXFA_TextProvider* pTextProvider, + CFDE_CSSComputedStyle* pStyle, + bool bFirst, + float fVerScale) const { + float fLineHeight = 0; + if (pStyle) + fLineHeight = pStyle->GetLineHeight(); + else if (CXFA_Para para = pTextProvider->GetParaNode()) + fLineHeight = para.GetLineHeight(); + + if (bFirst) { + float fFontSize = GetFontSize(pTextProvider, pStyle); + if (fLineHeight < 0.1f) + fLineHeight = fFontSize; + else + fLineHeight = std::min(fLineHeight, fFontSize); + } else if (fLineHeight < 0.1f) { + fLineHeight = GetFontSize(pTextProvider, pStyle) * 1.2f; + } + fLineHeight *= fVerScale; + return fLineHeight; +} + +bool CXFA_TextParser::GetEmbbedObj(CXFA_TextProvider* pTextProvider, + CFX_XMLNode* pXMLNode, + CFX_WideString& wsValue) { + wsValue.clear(); + if (!pXMLNode) + return false; + + bool bRet = false; + if (pXMLNode->GetType() == FX_XMLNODE_Element) { + CFX_XMLElement* pElement = static_cast<CFX_XMLElement*>(pXMLNode); + CFX_WideString wsAttr = pElement->GetString(L"xfa:embed"); + if (wsAttr.IsEmpty()) + return false; + if (wsAttr.GetAt(0) == L'#') + wsAttr.Delete(0); + + CFX_WideString ws = pElement->GetString(L"xfa:embedType"); + if (ws.IsEmpty()) + ws = L"som"; + else + ws.MakeLower(); + + bool bURI = (ws == L"uri"); + if (!bURI && ws != L"som") + return false; + + ws = pElement->GetString(L"xfa:embedMode"); + if (ws.IsEmpty()) + ws = L"formatted"; + else + ws.MakeLower(); + + bool bRaw = (ws == L"raw"); + if (!bRaw && ws != L"formatted") + return false; + + bRet = pTextProvider->GetEmbbedObj(bURI, bRaw, wsAttr, wsValue); + } + return bRet; +} + +CXFA_TextParseContext* CXFA_TextParser::GetParseContextFromMap( + CFX_XMLNode* pXMLNode) { + auto it = m_mapXMLNodeToParseContext.find(pXMLNode); + return it != m_mapXMLNodeToParseContext.end() ? it->second.get() : nullptr; +} + +bool CXFA_TextParser::GetTabstops(CFDE_CSSComputedStyle* pStyle, + CXFA_TextTabstopsContext* pTabstopContext) { + if (!pStyle || !pTabstopContext) + return false; + + CFX_WideString wsValue; + if (!pStyle->GetCustomStyle(L"xfa-tab-stops", wsValue) && + !pStyle->GetCustomStyle(L"tab-stops", wsValue)) { + return false; + } + + int32_t iLength = wsValue.GetLength(); + const wchar_t* pTabStops = wsValue.c_str(); + int32_t iCur = 0; + int32_t iLast = 0; + CFX_WideString wsAlign; + TabStopStatus eStatus = TabStopStatus::None; + wchar_t ch; + while (iCur < iLength) { + ch = pTabStops[iCur]; + switch (eStatus) { + case TabStopStatus::None: + if (ch <= ' ') { + iCur++; + } else { + eStatus = TabStopStatus::Alignment; + iLast = iCur; + } + break; + case TabStopStatus::Alignment: + if (ch == ' ') { + wsAlign = CFX_WideStringC(pTabStops + iLast, iCur - iLast); + eStatus = TabStopStatus::StartLeader; + iCur++; + while (iCur < iLength && pTabStops[iCur] <= ' ') + iCur++; + iLast = iCur; + } else { + iCur++; + } + break; + case TabStopStatus::StartLeader: + if (ch != 'l') { + eStatus = TabStopStatus::Location; + } else { + int32_t iCount = 0; + while (iCur < iLength) { + ch = pTabStops[iCur]; + iCur++; + if (ch == '(') { + iCount++; + } else if (ch == ')') { + iCount--; + if (iCount == 0) + break; + } + } + while (iCur < iLength && pTabStops[iCur] <= ' ') + iCur++; + + iLast = iCur; + eStatus = TabStopStatus::Location; + } + break; + case TabStopStatus::Location: + if (ch == ' ') { + uint32_t dwHashCode = FX_HashCode_GetW(wsAlign.AsStringC(), true); + CXFA_Measurement ms(CFX_WideStringC(pTabStops + iLast, iCur - iLast)); + float fPos = ms.ToUnit(XFA_UNIT_Pt); + pTabstopContext->Append(dwHashCode, fPos); + wsAlign.clear(); + eStatus = TabStopStatus::None; + } + iCur++; + break; + default: + break; + } + } + + if (!wsAlign.IsEmpty()) { + uint32_t dwHashCode = FX_HashCode_GetW(wsAlign.AsStringC(), true); + CXFA_Measurement ms(CFX_WideStringC(pTabStops + iLast, iCur - iLast)); + float fPos = ms.ToUnit(XFA_UNIT_Pt); + pTabstopContext->Append(dwHashCode, fPos); + } + return true; +} |