From d9dad3a1915973d113f1f8685474a5a8c1f4faac Mon Sep 17 00:00:00 2001 From: dan sinclair Date: Thu, 6 Apr 2017 14:44:02 -0400 Subject: Add title (/T) extraction for PDF tagged structures This CL adds the ability to extract the title from a tagged structure element if one exists. Bug: pdfium:672 Change-Id: I22e2a8371db4f08b8a70dd77002f1befab97f530 Reviewed-on: https://pdfium-review.googlesource.com/3819 Reviewed-by: Lei Zhang Reviewed-by: Tom Sepez Commit-Queue: dsinclair --- core/fpdfdoc/cpdf_structelement.cpp | 3 ++- core/fpdfdoc/cpdf_structelement.h | 2 ++ fpdfsdk/fpdf_structtree.cpp | 10 ++++++++++ public/fpdf_structtree.h | 20 ++++++++++++++++++++ samples/pdfium_test.cc | 5 +++++ 5 files changed, 39 insertions(+), 1 deletion(-) diff --git a/core/fpdfdoc/cpdf_structelement.cpp b/core/fpdfdoc/cpdf_structelement.cpp index 137d5b32e4..c85ae0dd42 100644 --- a/core/fpdfdoc/cpdf_structelement.cpp +++ b/core/fpdfdoc/cpdf_structelement.cpp @@ -33,7 +33,8 @@ CPDF_StructElement::CPDF_StructElement(CPDF_StructTree* pTree, : m_pTree(pTree), m_pParent(pParent), m_pDict(pDict), - m_Type(pDict->GetStringFor("S")) { + m_Type(pDict->GetStringFor("S")), + m_Title(pDict->GetStringFor("T")) { if (pTree->GetRoleMap()) { CFX_ByteString mapped = pTree->GetRoleMap()->GetStringFor(m_Type); if (!mapped.IsEmpty()) diff --git a/core/fpdfdoc/cpdf_structelement.h b/core/fpdfdoc/cpdf_structelement.h index ba0685e895..c65363db53 100644 --- a/core/fpdfdoc/cpdf_structelement.h +++ b/core/fpdfdoc/cpdf_structelement.h @@ -39,6 +39,7 @@ class CPDF_StructElement : public CFX_Retainable { friend CFX_RetainPtr pdfium::MakeRetain(Args&&... 
args); const CFX_ByteString& GetType() const { return m_Type; } + const CFX_ByteString& GetTitle() const { return m_Title; } CPDF_Dictionary* GetDict() const { return m_pDict; } int CountKids() const; @@ -58,6 +59,7 @@ class CPDF_StructElement : public CFX_Retainable { CPDF_StructElement* const m_pParent; CPDF_Dictionary* const m_pDict; CFX_ByteString m_Type; + CFX_ByteString m_Title; std::vector m_Kids; }; diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp index 96d40b41c2..74c44f8083 100644 --- a/fpdfsdk/fpdf_structtree.cpp +++ b/fpdfsdk/fpdf_structtree.cpp @@ -83,6 +83,16 @@ FPDF_StructElement_GetType(FPDF_STRUCTELEMENT struct_element, : 0; } +DLLEXPORT unsigned long STDCALL +FPDF_StructElement_GetTitle(FPDF_STRUCTELEMENT struct_element, + void* buffer, + unsigned long buflen) { + CPDF_StructElement* elem = ToStructTreeElement(struct_element); + return elem + ? WideStringToBuffer(elem->GetTitle().UTF8Decode(), buffer, buflen) + : 0; +} + DLLEXPORT int STDCALL FPDF_StructElement_CountChildren(FPDF_STRUCTELEMENT struct_element) { CPDF_StructElement* elem = ToStructTreeElement(struct_element); diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h index 6f85d4222e..9cf46cc306 100644 --- a/public/fpdf_structtree.h +++ b/public/fpdf_structtree.h @@ -93,6 +93,26 @@ FPDF_StructElement_GetType(FPDF_STRUCTELEMENT struct_element, void* buffer, unsigned long buflen); +// Function: FPDF_StructElement_GetTitle +// Get the title (/T) for a given element. +// Parameters: +// struct_element - Handle to the struct element. +// buffer - A buffer for output. May be NULL. +// buflen - The length of the buffer, in bytes. May be 0. +// Return value: +// The number of bytes in the title, including the terminating NUL +// character. The number of bytes is returned regardless of the +// |buffer| and |buflen| parameters. +// Comments: +// Regardless of the platform, the |buffer| is always in UTF-16LE +// encoding. 
The string is terminated by a UTF-16 NUL character. If +// |buflen| is less than the required length, or |buffer| is NULL, +// |buffer| will not be modified. +DLLEXPORT unsigned long STDCALL +FPDF_StructElement_GetTitle(FPDF_STRUCTELEMENT struct_element, + void* buffer, + unsigned long buflen); + // Function: FPDF_StructElement_CountChildren // Count the number of children for the structure element. // Parameters: diff --git a/samples/pdfium_test.cc b/samples/pdfium_test.cc index 1dc76fee5b..d2b3c01196 100644 --- a/samples/pdfium_test.cc +++ b/samples/pdfium_test.cc @@ -640,6 +640,11 @@ void DumpChildStructure(FPDF_STRUCTELEMENT child, int indent) { unsigned long len = FPDF_StructElement_GetType(child, buf, kBufSize); printf("%*s%ls", indent * 2, "", ConvertToWString(buf, len).c_str()); + memset(buf, 0, sizeof(buf)); + len = FPDF_StructElement_GetTitle(child, buf, kBufSize); + if (len > 0) + printf(": '%ls'", ConvertToWString(buf, len).c_str()); + memset(buf, 0, sizeof(buf)); len = FPDF_StructElement_GetAltText(child, buf, kBufSize); if (len > 0) -- cgit v1.2.3