From 9067fd683ebf8d6467f8cc5aa7daf5e1f950f846 Mon Sep 17 00:00:00 2001
From: thestig
Date: Wed, 23 Nov 2016 14:10:06 -0800
Subject: Add APIs for limited use of document tagged code.

BUG=pdfium:568

Review-Url: https://codereview.chromium.org/2519343002
---
 BUILD.gn                                 |   3 +
 fpdfsdk/fpdf_structtree.cpp              |  95 ++++++++++++++++++++++++++
 fpdfsdk/fpdf_structtree_embeddertest.cpp |  70 +++++++++++++++++++++
 fpdfsdk/fpdfdoc.cpp                      |   6 +-
 fpdfsdk/fpdfview.cpp                     |   2 +-
 fpdfsdk/fpdfview_c_api_test.c            |  10 +++
 public/fpdf_doc.h                        |   6 +-
 public/fpdf_formfill.h                   |   6 +-
 public/fpdf_structtree.h                 | 103 +++++++++++++++++++++++++++++++
 public/fpdfview.h                        |   2 +
 testing/resources/tagged_alt_text.pdf    | Bin 0 -> 2659 bytes
 11 files changed, 292 insertions(+), 11 deletions(-)
 create mode 100644 fpdfsdk/fpdf_structtree.cpp
 create mode 100644 fpdfsdk/fpdf_structtree_embeddertest.cpp
 create mode 100644 public/fpdf_structtree.h
 create mode 100644 testing/resources/tagged_alt_text.pdf

diff --git a/BUILD.gn b/BUILD.gn
index 84a415cd4d..29572e72c5 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -97,6 +97,7 @@ static_library("pdfium") {
     "fpdfsdk/fpdf_flatten.cpp",
     "fpdfsdk/fpdf_progressive.cpp",
     "fpdfsdk/fpdf_searchex.cpp",
+    "fpdfsdk/fpdf_structtree.cpp",
     "fpdfsdk/fpdf_sysfontinfo.cpp",
     "fpdfsdk/fpdf_transformpage.cpp",
     "fpdfsdk/fpdfdoc.cpp",
@@ -124,6 +125,7 @@ static_library("pdfium") {
     "public/fpdf_progressive.h",
     "public/fpdf_save.h",
     "public/fpdf_searchex.h",
+    "public/fpdf_structtree.h",
     "public/fpdf_sysfontinfo.h",
     "public/fpdf_text.h",
     "public/fpdf_transformpage.h",
@@ -1797,6 +1799,7 @@ test("pdfium_embeddertests") {
     "core/fxge/ge/fx_ge_text_embeddertest.cpp",
     "fpdfsdk/fpdf_dataavail_embeddertest.cpp",
     "fpdfsdk/fpdf_flatten_embeddertest.cpp",
+    "fpdfsdk/fpdf_structtree_embeddertest.cpp",
     "fpdfsdk/fpdfdoc_embeddertest.cpp",
     "fpdfsdk/fpdfedit_embeddertest.cpp",
     "fpdfsdk/fpdfext_embeddertest.cpp",
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
new file mode 100644
index 0000000000..541c46b378
--- /dev/null
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -0,0 +1,95 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "public/fpdf_structtree.h"
+
+#include "core/fpdfapi/page/cpdf_page.h"
+#include "core/fpdfapi/parser/cpdf_dictionary.h"
+#include "core/fpdfdoc/fpdf_tagged.h"
+#include "fpdfsdk/fsdk_define.h"
+
+namespace {
+
+// The public handle types are opaque void* typedefs; these helpers convert
+// them back to the internal structure tree types.
+IPDF_StructTree* ToStructTree(FPDF_STRUCTTREE struct_tree) {
+  return static_cast<IPDF_StructTree*>(struct_tree);
+}
+
+IPDF_StructElement* ToStructTreeElement(FPDF_STRUCTELEMENT struct_element) {
+  return static_cast<IPDF_StructElement*>(struct_element);
+}
+
+}  // namespace
+
+DLLEXPORT FPDF_STRUCTTREE STDCALL FPDF_StructTree_GetForPage(FPDF_PAGE page) {
+  CPDF_Page* pPage = CPDFPageFromFPDFPage(page);
+  if (!pPage)
+    return nullptr;
+  // The returned tree is released by the caller via
+  // FPDF_StructTree_Close().
+  return IPDF_StructTree::LoadPage(pPage->m_pDocument, pPage->m_pFormDict);
+}
+
+DLLEXPORT void STDCALL FPDF_StructTree_Close(FPDF_STRUCTTREE struct_tree) {
+  delete ToStructTree(struct_tree);
+}
+
+DLLEXPORT int STDCALL
+FPDF_StructTree_CountChildren(FPDF_STRUCTTREE struct_tree) {
+  IPDF_StructTree* tree = ToStructTree(struct_tree);
+  return tree ? tree->CountTopElements() : -1;
+}
+
+DLLEXPORT FPDF_STRUCTELEMENT STDCALL
+FPDF_StructTree_GetChildAtIndex(FPDF_STRUCTTREE struct_tree, int index) {
+  IPDF_StructTree* tree = ToStructTree(struct_tree);
+  if (!tree || index < 0 || index >= tree->CountTopElements())
+    return nullptr;
+  return tree->GetTopElement(index);
+}
+
+DLLEXPORT unsigned long STDCALL
+FPDF_StructElement_GetAltText(FPDF_STRUCTELEMENT struct_element,
+                              void* buffer,
+                              unsigned long buflen) {
+  IPDF_StructElement* elem = ToStructTreeElement(struct_element);
+  if (!elem)
+    return 0;
+
+  CPDF_Dictionary* dict = elem->GetDict();
+  if (!dict)
+    return 0;
+
+  CFX_WideString str = dict->GetUnicodeTextFor("Alt");
+  if (str.IsEmpty())
+    return 0;
+
+  CFX_ByteString encodedStr = str.UTF16LE_Encode();
+  const unsigned long len = encodedStr.GetLength();
+  // Always report the required size, but copy only when the whole encoded
+  // string fits so that a too-small |buffer| is left untouched.
+  if (buffer && len <= buflen)
+    FXSYS_memcpy(buffer, encodedStr.c_str(), len);
+  return len;
+}
+
+DLLEXPORT int STDCALL
+FPDF_StructElement_CountChildren(FPDF_STRUCTELEMENT struct_element) {
+  IPDF_StructElement* elem = ToStructTreeElement(struct_element);
+  return elem ? elem->CountKids() : -1;
+}
+
+DLLEXPORT FPDF_STRUCTELEMENT STDCALL
+FPDF_StructElement_GetChildAtIndex(FPDF_STRUCTELEMENT struct_element,
+                                   int index) {
+  IPDF_StructElement* elem = ToStructTreeElement(struct_element);
+  if (!elem || index < 0 || index >= elem->CountKids())
+    return nullptr;
+
+  // Only kids that are themselves structure elements are exposed.
+  CPDF_StructKid kid = elem->GetKid(index);
+  return kid.m_Type == CPDF_StructKid::Element ? kid.m_Element.m_pElement
+                                               : nullptr;
+}
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
new file mode 100644
index 0000000000..58b3172057
--- /dev/null
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -0,0 +1,70 @@
+// Copyright 2016 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fxcrt/fx_string.h"
+#include "public/fpdf_structtree.h"
+#include "testing/embedder_test.h"
+#include "testing/test_support.h"
+
+class FPDFStructTreeEmbeddertest : public EmbedderTest, public TestSaver {};
+
+TEST_F(FPDFStructTreeEmbeddertest, GetAltText) {
+  ASSERT_TRUE(OpenDocument("tagged_alt_text.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_STRUCTTREE struct_tree = FPDF_StructTree_GetForPage(page);
+  ASSERT_TRUE(struct_tree);
+  ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree));
+
+  FPDF_STRUCTELEMENT element = FPDF_StructTree_GetChildAtIndex(struct_tree, -1);
+  EXPECT_EQ(nullptr, element);
+  element = FPDF_StructTree_GetChildAtIndex(struct_tree, 1);
+  EXPECT_EQ(nullptr, element);
+  element = FPDF_StructTree_GetChildAtIndex(struct_tree, 0);
+  ASSERT_NE(nullptr, element);
+  EXPECT_EQ(0U, FPDF_StructElement_GetAltText(element, nullptr, 0));
+
+  ASSERT_EQ(1, FPDF_StructElement_CountChildren(element));
+  FPDF_STRUCTELEMENT child_element =
+      FPDF_StructElement_GetChildAtIndex(element, -1);
+  EXPECT_EQ(nullptr, child_element);
+  child_element = FPDF_StructElement_GetChildAtIndex(element, 1);
+  EXPECT_EQ(nullptr, child_element);
+  child_element = FPDF_StructElement_GetChildAtIndex(element, 0);
+  ASSERT_NE(nullptr, child_element);
+  EXPECT_EQ(0U, FPDF_StructElement_GetAltText(child_element, nullptr, 0));
+
+  ASSERT_EQ(1, FPDF_StructElement_CountChildren(child_element));
+  FPDF_STRUCTELEMENT gchild_element =
+      FPDF_StructElement_GetChildAtIndex(child_element, -1);
+  EXPECT_EQ(nullptr, gchild_element);
+  gchild_element = FPDF_StructElement_GetChildAtIndex(child_element, 1);
+  EXPECT_EQ(nullptr, gchild_element);
+  gchild_element = FPDF_StructElement_GetChildAtIndex(child_element, 0);
+  ASSERT_NE(nullptr, gchild_element);
+  ASSERT_EQ(24U, FPDF_StructElement_GetAltText(gchild_element, nullptr, 0));
+
+  unsigned short buffer[12];
+  memset(buffer, 0, sizeof(buffer));
+  // Deliberately pass in a small buffer size to make sure |buffer| remains
+  // untouched.
+  ASSERT_EQ(24U, FPDF_StructElement_GetAltText(gchild_element, buffer, 1));
+  for (size_t i = 0; i < FX_ArraySize(buffer); ++i)
+    EXPECT_EQ(0U, buffer[i]);
+
+  ASSERT_EQ(24U, FPDF_StructElement_GetAltText(gchild_element, buffer,
+                                               sizeof(buffer)));
+  const FX_WCHAR kExpected[] = L"Black Image";
+  EXPECT_EQ(CFX_WideString(kExpected),
+            CFX_WideString::FromUTF16LE(buffer, FXSYS_len(kExpected)));
+
+  ASSERT_EQ(1, FPDF_StructElement_CountChildren(gchild_element));
+  FPDF_STRUCTELEMENT ggchild_element =
+      FPDF_StructElement_GetChildAtIndex(gchild_element, 0);
+  EXPECT_EQ(nullptr, ggchild_element);
+
+  FPDF_StructTree_Close(struct_tree);
+  FPDF_ClosePage(page);
+}
diff --git a/fpdfsdk/fpdfdoc.cpp b/fpdfsdk/fpdfdoc.cpp
index 254be3f883..2dcf606a7c 100644
--- a/fpdfsdk/fpdfdoc.cpp
+++ b/fpdfsdk/fpdfdoc.cpp
@@ -64,7 +64,7 @@ unsigned long Utf16EncodeMaybeCopyAndReturnLength(const CFX_WideString& text,
                                                   unsigned long buflen) {
   CFX_ByteString encodedText = text.UTF16LE_Encode();
   unsigned long len = encodedText.GetLength();
-  if (buffer && buflen >= len)
+  if (buffer && len <= buflen)
     FXSYS_memcpy(buffer, encodedText.c_str(), len);
   return len;
 }
@@ -186,7 +186,7 @@ DLLEXPORT unsigned long STDCALL FPDFAction_GetFilePath(FPDF_ACTION pDict,
   CPDF_Action action(ToDictionary(static_cast<CPDF_Object*>(pDict)));
   CFX_ByteString path = action.GetFilePath().UTF8Encode();
   unsigned long len = path.GetLength() + 1;
-  if (buffer && buflen >= len)
+  if (buffer && len <= buflen)
     FXSYS_memcpy(buffer, path.c_str(), len);
   return len;
 }
@@ -203,7 +203,7 @@ DLLEXPORT unsigned long STDCALL FPDFAction_GetURIPath(FPDF_DOCUMENT document,
   CPDF_Action action(ToDictionary(static_cast<CPDF_Object*>(pDict)));
   CFX_ByteString path = action.GetURI(pDoc);
   unsigned long len = path.GetLength() + 1;
-  if (buffer && buflen >= len)
+  if (buffer && len <= buflen)
     FXSYS_memcpy(buffer, path.c_str(), len);
   return len;
 }
diff --git a/fpdfsdk/fpdfview.cpp b/fpdfsdk/fpdfview.cpp
index 959bf14390..3f5115afd6 100644
--- a/fpdfsdk/fpdfview.cpp
+++ b/fpdfsdk/fpdfview.cpp
@@ -1110,7 +1110,7 @@ DLLEXPORT FPDF_DEST STDCALL FPDF_GetNamedDest(FPDF_DOCUMENT document,
   int len = utf16Name.GetLength();
   if (!buffer) {
     *buflen = len;
-  } else if (*buflen >= len) {
+  } else if (len <= *buflen) {
     memcpy(buffer, utf16Name.c_str(), len);
     *buflen = len;
   } else {
diff --git a/fpdfsdk/fpdfview_c_api_test.c b/fpdfsdk/fpdfview_c_api_test.c
index 5e6c36f2b1..ed9a3fafe2 100644
--- a/fpdfsdk/fpdfview_c_api_test.c
+++ b/fpdfsdk/fpdfview_c_api_test.c
@@ -20,6 +20,7 @@
 #include "public/fpdf_progressive.h"
 #include "public/fpdf_save.h"
 #include "public/fpdf_searchex.h"
+#include "public/fpdf_structtree.h"
 #include "public/fpdf_sysfontinfo.h"
 #include "public/fpdf_text.h"
 #include "public/fpdf_transformpage.h"
@@ -154,6 +155,15 @@ int CheckPDFiumCApi() {
     // fpdf_searchex.h
     CHK(FPDFText_GetCharIndexFromTextIndex);
 
+    // fpdf_structtree.h
+    CHK(FPDF_StructTree_GetForPage);
+    CHK(FPDF_StructTree_Close);
+    CHK(FPDF_StructTree_CountChildren);
+    CHK(FPDF_StructTree_GetChildAtIndex);
+    CHK(FPDF_StructElement_GetAltText);
+    CHK(FPDF_StructElement_CountChildren);
+    CHK(FPDF_StructElement_GetChildAtIndex);
+
     // fpdf_sysfontinfo.h
     CHK(FPDF_GetDefaultTTFMap);
     CHK(FPDF_AddInstalledFont);
diff --git a/public/fpdf_doc.h b/public/fpdf_doc.h
index b245d46900..10f899549b 100644
--- a/public/fpdf_doc.h
+++ b/public/fpdf_doc.h
@@ -68,7 +68,7 @@ FPDFBookmark_GetNextSibling(FPDF_DOCUMENT document, FPDF_BOOKMARK bookmark);
 // |buflen| parameters.
 //
 // Regardless of the platform, the |buffer| is always in UTF-16LE encoding. The
-// string is terminated by a UTF16 NUL character. If |buflen| is less then the
+// string is terminated by a UTF16 NUL character. If |buflen| is less than the
 // required length, or |buffer| is NULL, |buffer| will not be modified.
DLLEXPORT unsigned long STDCALL FPDFBookmark_GetTitle(FPDF_BOOKMARK bookmark, void* buffer, @@ -142,7 +142,7 @@ DLLEXPORT FPDF_DEST STDCALL FPDFAction_GetDest(FPDF_DOCUMENT document, // NUL character. // // Regardless of the platform, the |buffer| is always in UTF-16LE encoding. -// If |buflen| is less then the returned length, or |buffer| is NULL, |buffer| +// If |buflen| is less than the returned length, or |buffer| is NULL, |buffer| // will not be modified. DLLEXPORT unsigned long STDCALL FPDFAction_GetFilePath(FPDF_ACTION action, void* buffer, unsigned long buflen); @@ -156,7 +156,7 @@ FPDFAction_GetFilePath(FPDF_ACTION action, void* buffer, unsigned long buflen); // // Returns the number of bytes in the URI path, including trailing zeros. // -// The |buffer| is always encoded in 7-bit ASCII. If |buflen| is less then the +// The |buffer| is always encoded in 7-bit ASCII. If |buflen| is less than the // returned length, or |buffer| is NULL, |buffer| will not be modified. DLLEXPORT unsigned long STDCALL FPDFAction_GetURIPath(FPDF_DOCUMENT document, FPDF_ACTION action, diff --git a/public/fpdf_formfill.h b/public/fpdf_formfill.h index ada87d3158..09b80eaf90 100644 --- a/public/fpdf_formfill.h +++ b/public/fpdf_formfill.h @@ -149,10 +149,8 @@ typedef struct _IPDF_JsPlatform { * The filePath should be always input in local encoding. * * The return value always indicated number of bytes required for the - * buffer, even when there is - * no buffer specified, or the buffer size is less then required. In this - * case, the buffer will not - * be modified. + * buffer , even when there is no buffer specified, or the buffer size is + * less than required. In this case, the buffer will not be modified. 
*/ int (*Doc_getFilePath)(struct _IPDF_JsPlatform* pThis, void* filePath, diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h new file mode 100644 index 0000000000..3d4da402aa --- /dev/null +++ b/public/fpdf_structtree.h @@ -0,0 +1,103 @@ +// Copyright 2016 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef PUBLIC_FPDF_STRUCTTREE_H_ +#define PUBLIC_FPDF_STRUCTTREE_H_ + +// NOLINTNEXTLINE(build/include) +#include "fpdfview.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Function: FPDF_StructTree_GetForPage +// Get the structure tree for a page. +// Parameters: +// page - Handle to the page. Returned by FPDF_LoadPage +// function. +// Return value: +// A handle to the structure tree or NULL on error. +DLLEXPORT FPDF_STRUCTTREE STDCALL FPDF_StructTree_GetForPage(FPDF_PAGE page); + +// Function: FPDF_StructTree_Close +// Release the resource allocate by FPDF_StructTree_GetForPage. +// Parameters: +// struct_tree - Handle to the struct tree. Returned by +// FPDF_StructTree_LoadPage function. +// Return value: +// NULL +DLLEXPORT void STDCALL FPDF_StructTree_Close(FPDF_STRUCTTREE struct_tree); + +// Function: FPDF_StructTree_CountChildren +// Count the number of children for the structure tree. +// Parameters: +// struct_tree - Handle to the struct tree. Returned by +// FPDF_StructTree_LoadPage function. +// Return value: +// The number of children, or -1 on error. +DLLEXPORT int STDCALL +FPDF_StructTree_CountChildren(FPDF_STRUCTTREE struct_tree); + +// Function: FPDF_StructTree_GetChildAtIndex +// Get a child in the structure tree. +// Parameters: +// struct_tree - Handle to the struct tree. Returned by +// FPDF_StructTree_LoadPage function. +// index - The index for the child, 0-based. +// Return value: +// The child at the n-th index or NULL on error. 
+DLLEXPORT FPDF_STRUCTELEMENT STDCALL
+FPDF_StructTree_GetChildAtIndex(FPDF_STRUCTTREE struct_tree, int index);
+
+// Function: FPDF_StructElement_GetAltText
+//          Get the alt text for a given element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   A buffer to receive the alt text. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+//          The number of bytes in the alt text, including the terminating NUL
+//          character, or 0 if there is no alt text. The number of bytes is
+//          returned regardless of the |buffer| and |buflen| parameters.
+// Comments:
+//          Regardless of the platform, the |buffer| is always in UTF-16LE
+//          encoding. The string is terminated by a UTF16 NUL character. If
+//          |buflen| is less than the required length, or |buffer| is NULL,
+//          |buffer| will not be modified.
+DLLEXPORT unsigned long STDCALL
+FPDF_StructElement_GetAltText(FPDF_STRUCTELEMENT struct_element,
+                              void* buffer,
+                              unsigned long buflen);
+
+// Function: FPDF_StructElement_CountChildren
+//          Count the number of children for the structure element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+// Return value:
+//          The number of children, or -1 on error.
+DLLEXPORT int STDCALL
+FPDF_StructElement_CountChildren(FPDF_STRUCTELEMENT struct_element);
+
+// Function: FPDF_StructElement_GetChildAtIndex
+//          Get a child in the structure element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          index          -   The index for the child, 0-based.
+// Return value:
+//          The child at the n-th index or NULL on error.
+// Comments:
+//          If the child exists but is not an element, then this function will
+//          return NULL. This will also return NULL for out of bounds indices.
+DLLEXPORT FPDF_STRUCTELEMENT STDCALL
+FPDF_StructElement_GetChildAtIndex(FPDF_STRUCTELEMENT struct_element,
+                                   int index);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // PUBLIC_FPDF_STRUCTTREE_H_
diff --git a/public/fpdfview.h b/public/fpdfview.h
index 469053c4e8..581951c7e2 100644
--- a/public/fpdfview.h
+++ b/public/fpdfview.h
@@ -39,6 +39,8 @@ typedef void* FPDF_PAGERANGE;
 typedef void* FPDF_PATH;
 typedef void* FPDF_RECORDER;
 typedef void* FPDF_SCHHANDLE;
+typedef void* FPDF_STRUCTELEMENT;
+typedef void* FPDF_STRUCTTREE;
 typedef void* FPDF_TEXTPAGE;
 
 #ifdef PDF_ENABLE_XFA
diff --git a/testing/resources/tagged_alt_text.pdf b/testing/resources/tagged_alt_text.pdf
new file mode 100644
index 0000000000..a899ce11af
Binary files /dev/null and b/testing/resources/tagged_alt_text.pdf differ
-- 
cgit v1.2.3