From 7c29e27dae139a205755c1a29b7f3ac8b36ec0da Mon Sep 17 00:00:00 2001 From: npm Date: Tue, 18 Oct 2016 10:38:20 -0700 Subject: Traverse PDF page tree only once in CPDF_Document In our current implementation of CPDF_Document::GetPage, we traverse the PDF page tree until we find the index we are looking for. This is slow when we do calls GetPage(0), GetPage(1), ... since in this case the page tree will be traversed n times if there are n pages. This CL makes sure the page tree is only traversed once. Time to load the PDF from the bug below in chrome official build: Before this CL: 1 minute 40 seconds After this CL: 5 seconds BUG=chromium:638513 Review-Url: https://codereview.chromium.org/2414423002 --- core/fpdfapi/parser/cpdf_document.cpp | 245 +++++++++++++++++++--------------- core/fpdfapi/parser/cpdf_document.h | 17 ++- 2 files changed, 147 insertions(+), 115 deletions(-) (limited to 'core/fpdfapi/parser') diff --git a/core/fpdfapi/parser/cpdf_document.cpp b/core/fpdfapi/parser/cpdf_document.cpp index c5f64a790c..ba20ff6223 100644 --- a/core/fpdfapi/parser/cpdf_document.cpp +++ b/core/fpdfapi/parser/cpdf_document.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include "core/fpdfapi/cpdf_modulemgr.h" @@ -240,83 +241,6 @@ void InsertWidthArray1(CFX_Font* pFont, InsertWidthArrayImpl(widths, size, pWidthArray); } -int InsertDeletePDFPage(CPDF_Document* pDoc, - CPDF_Dictionary* pPages, - int nPagesToGo, - CPDF_Dictionary* pPage, - FX_BOOL bInsert, - std::set* pVisited) { - CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); - if (!pKidList) - return -1; - - for (size_t i = 0; i < pKidList->GetCount(); i++) { - CPDF_Dictionary* pKid = pKidList->GetDictAt(i); - if (pKid->GetStringFor("Type") == "Page") { - if (nPagesToGo == 0) { - if (bInsert) { - pKidList->InsertAt(i, new CPDF_Reference(pDoc, pPage->GetObjNum())); - pPage->SetReferenceFor("Parent", pDoc, pPages->GetObjNum()); - } else { - pKidList->RemoveAt(i); - } - pPages->SetIntegerFor( - "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); - return 1; - } - nPagesToGo--; - } else { - int nPages = pKid->GetIntegerFor("Count"); - if (nPagesToGo < nPages) { - if (pdfium::ContainsKey(*pVisited, pKid)) - return -1; - - pdfium::ScopedSetInsertion insertion(pVisited, pKid); - if (InsertDeletePDFPage(pDoc, pKid, nPagesToGo, pPage, bInsert, - pVisited) < 0) { - return -1; - } - pPages->SetIntegerFor( - "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); - return 1; - } - nPagesToGo -= nPages; - } - } - return 0; -} - -int InsertNewPage(CPDF_Document* pDoc, - int iPage, - CPDF_Dictionary* pPageDict, - CFX_ArrayTemplate& pageList) { - CPDF_Dictionary* pRoot = pDoc->GetRoot(); - CPDF_Dictionary* pPages = pRoot ? pRoot->GetDictFor("Pages") : nullptr; - if (!pPages) - return -1; - - int nPages = pDoc->GetPageCount(); - if (iPage < 0 || iPage > nPages) - return -1; - - if (iPage == nPages) { - CPDF_Array* pPagesList = pPages->GetArrayFor("Kids"); - if (!pPagesList) { - pPagesList = new CPDF_Array; - pPages->SetFor("Kids", pPagesList); - } - pPagesList->Add(new CPDF_Reference(pDoc, pPageDict->GetObjNum())); - pPages->SetIntegerFor("Count", nPages + 1); - pPageDict->SetReferenceFor("Parent", pDoc, pPages->GetObjNum()); - } else { - std::set stack = {pPages}; - if (InsertDeletePDFPage(pDoc, pPages, iPage, pPageDict, TRUE, &stack) < 0) - return -1; - } - pageList.InsertAt(iPage, pPageDict->GetObjNum()); - return iPage; -} - int CountPages(CPDF_Dictionary* pPages, std::set* visited_pages) { int count = pPages->GetIntegerFor("Count"); @@ -413,6 +337,7 @@ CPDF_Document::CPDF_Document(std::unique_ptr pParser) m_pParser(std::move(pParser)), m_pRootDict(nullptr), m_pInfoDict(nullptr), + m_iLastPageTraversed(-1), m_bLinearized(false), m_iFirstPageNo(0), m_dwFirstPageObjNum(0), @@ -477,40 +402,63 @@ void CPDF_Document::LoadPages() { m_PageList.SetSize(RetrievePageCount()); } -CPDF_Dictionary* CPDF_Document::FindPDFPage(CPDF_Dictionary* pPages, - int iPage, - int nPagesToGo, - int level) { +CPDF_Dictionary* CPDF_Document::TraversePDFPages(int iPage, int nPagesToGo) { + std::pair* lastProc = &m_pTreeTraversal.top(); + CPDF_Dictionary* pPages = lastProc->first; CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); - if (!pKidList) - return nPagesToGo == 0 ? pPages : nullptr; + if (!pKidList) { + m_pTreeTraversal.pop(); + if (nPagesToGo != 1) + return nullptr; + m_PageList.SetAt(iPage, pPages->GetObjNum()); + return pPages; + } - if (level >= FX_MAX_PAGE_LEVEL) + if (m_pTreeTraversal.size() >= FX_MAX_PAGE_LEVEL) { + m_pTreeTraversal.pop(); return nullptr; + } - for (size_t i = 0; i < pKidList->GetCount(); i++) { + bool shouldFinish = pPages->GetIntegerFor("Count") <= nPagesToGo; + CPDF_Dictionary* page = nullptr; + for (size_t i = lastProc->second + 1; i < pKidList->GetCount(); i++) { CPDF_Dictionary* pKid = pKidList->GetDictAt(i); if (!pKid) { nPagesToGo--; + lastProc->second++; continue; } - if (pKid == pPages) + if (pKid == pPages) { + lastProc->second++; continue; + } if (!pKid->KeyExist("Kids")) { - if (nPagesToGo == 0) - return pKid; - - m_PageList.SetAt(iPage - nPagesToGo, pKid->GetObjNum()); + m_PageList.SetAt(iPage - nPagesToGo + 1, pKid->GetObjNum()); nPagesToGo--; + lastProc->second++; + if (nPagesToGo == 0) { + page = pKid; + break; + } } else { int nPages = pKid->GetIntegerFor("Count"); - if (nPagesToGo < nPages) - return FindPDFPage(pKid, iPage, nPagesToGo, level + 1); - + m_pTreeTraversal.push(std::make_pair(pKid, -1)); + CPDF_Dictionary* pageKid = TraversePDFPages(iPage, nPagesToGo); + // If the stack top is current element, kid was completely processed. + if (lastProc == &m_pTreeTraversal.top()) + lastProc->second++; + if (nPagesToGo <= nPages) { + page = pageKid; + break; + } nPagesToGo -= nPages; } } - return nullptr; + if (shouldFinish || + lastProc->second == static_cast(pKidList->GetCount() - 1)) { + m_pTreeTraversal.pop(); + } + return page; } CPDF_Dictionary* CPDF_Document::GetPagesDict() const { @@ -534,21 +482,20 @@ CPDF_Dictionary* CPDF_Document::GetPage(int iPage) { } int objnum = m_PageList.GetAt(iPage); - if (objnum) { - if (CPDF_Dictionary* pDict = ToDictionary(GetOrParseIndirectObject(objnum))) - return pDict; + if (!objnum || m_iLastPageTraversed < iPage) { + CPDF_Dictionary* pPages = GetPagesDict(); + if (!pPages) + return nullptr; + if (m_pTreeTraversal.empty()) + m_pTreeTraversal.push(std::make_pair(pPages, -1)); + CPDF_Dictionary* page = TraversePDFPages(m_iLastPageTraversed + 1, + iPage - m_iLastPageTraversed); + m_iLastPageTraversed = iPage; + return page; } - - CPDF_Dictionary* pPages = GetPagesDict(); - if (!pPages) - return nullptr; - - CPDF_Dictionary* pPage = FindPDFPage(pPages, iPage, iPage, 0); - if (!pPage) - return nullptr; - - m_PageList.SetAt(iPage, pPage->GetObjNum()); - return pPage; + if (CPDF_Dictionary* pDict = ToDictionary(GetOrParseIndirectObject(objnum))) + return pDict; + return nullptr; } void CPDF_Document::SetPageObjNum(int iPage, uint32_t objNum) { @@ -710,13 +657,91 @@ CPDF_Dictionary* CPDF_Document::CreateNewPage(int iPage) { CPDF_Dictionary* pDict = new CPDF_Dictionary(m_pByteStringPool); pDict->SetNameFor("Type", "Page"); uint32_t dwObjNum = AddIndirectObject(pDict); - if (InsertNewPage(this, iPage, pDict, m_PageList) < 0) { + if (InsertNewPage(iPage, pDict, m_PageList) < 0) { ReleaseIndirectObject(dwObjNum); return nullptr; } return pDict; } +int CPDF_Document::InsertDeletePDFPage(CPDF_Dictionary* pPages, + int nPagesToGo, + CPDF_Dictionary* pPage, + FX_BOOL bInsert, + std::set* pVisited) { + CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); + if (!pKidList) + return -1; + + for (size_t i = 0; i < pKidList->GetCount(); i++) { + CPDF_Dictionary* pKid = pKidList->GetDictAt(i); + if (pKid->GetStringFor("Type") == "Page") { + if (nPagesToGo == 0) { + if (bInsert) { + pKidList->InsertAt(i, new CPDF_Reference(this, pPage->GetObjNum())); + pPage->SetReferenceFor("Parent", this, pPages->GetObjNum()); + } else { + pKidList->RemoveAt(i); + } + pPages->SetIntegerFor( + "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); + // Tree will change, so reset tree transversal variables + m_iLastPageTraversed = -1; + m_pTreeTraversal = std::stack>(); + return 1; + } + nPagesToGo--; + } else { + int nPages = pKid->GetIntegerFor("Count"); + if (nPagesToGo < nPages) { + if (pdfium::ContainsKey(*pVisited, pKid)) + return -1; + + pdfium::ScopedSetInsertion insertion(pVisited, pKid); + if (InsertDeletePDFPage(pKid, nPagesToGo, pPage, bInsert, pVisited) < + 0) { + return -1; + } + pPages->SetIntegerFor( + "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); + return 1; + } + nPagesToGo -= nPages; + } + } + return 0; +} + +int CPDF_Document::InsertNewPage(int iPage, + CPDF_Dictionary* pPageDict, + CFX_ArrayTemplate& pageList) { + CPDF_Dictionary* pRoot = GetRoot(); + CPDF_Dictionary* pPages = pRoot ? pRoot->GetDictFor("Pages") : nullptr; + if (!pPages) + return -1; + + int nPages = GetPageCount(); + if (iPage < 0 || iPage > nPages) + return -1; + + if (iPage == nPages) { + CPDF_Array* pPagesList = pPages->GetArrayFor("Kids"); + if (!pPagesList) { + pPagesList = new CPDF_Array; + pPages->SetFor("Kids", pPagesList); + } + pPagesList->Add(new CPDF_Reference(this, pPageDict->GetObjNum())); + pPages->SetIntegerFor("Count", nPages + 1); + pPageDict->SetReferenceFor("Parent", this, pPages->GetObjNum()); + } else { + std::set stack = {pPages}; + if (InsertDeletePDFPage(pPages, iPage, pPageDict, TRUE, &stack) < 0) + return -1; + } + pageList.InsertAt(iPage, pPageDict->GetObjNum()); + return iPage; +} + void CPDF_Document::DeletePage(int iPage) { CPDF_Dictionary* pPages = GetPagesDict(); if (!pPages) @@ -727,7 +752,7 @@ void CPDF_Document::DeletePage(int iPage) { return; std::set stack = {pPages}; - if (InsertDeletePDFPage(this, pPages, iPage, nullptr, FALSE, &stack) < 0) + if (InsertDeletePDFPage(pPages, iPage, nullptr, FALSE, &stack) < 0) return; m_PageList.RemoveAt(iPage); diff --git a/core/fpdfapi/parser/cpdf_document.h b/core/fpdfapi/parser/cpdf_document.h index c557a56013..e845ea8eef 100644 --- a/core/fpdfapi/parser/cpdf_document.h +++ b/core/fpdfapi/parser/cpdf_document.h @@ -9,6 +9,7 @@ #include #include +#include #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h" #include "core/fpdfapi/parser/cpdf_object.h" @@ -107,10 +108,7 @@ class CPDF_Document : public CPDF_IndirectObjectHolder { // Retrieve page count information by getting count value from the tree nodes int RetrievePageCount() const; - CPDF_Dictionary* FindPDFPage(CPDF_Dictionary* pPages, - int iPage, - int nPagesToGo, - int level); + CPDF_Dictionary* TraversePDFPages(int iPage, int nPagesToGo); int FindPageIndex(CPDF_Dictionary* pNode, uint32_t& skip_count, uint32_t objnum, @@ -126,10 +124,19 @@ class CPDF_Document : public CPDF_IndirectObjectHolder { FX_BOOL bVert, CFX_ByteString basefont, std::function Insert); - + int InsertDeletePDFPage(CPDF_Dictionary* pPages, + int nPagesToGo, + CPDF_Dictionary* pPage, + FX_BOOL bInsert, + std::set* pVisited); + int InsertNewPage(int iPage, + CPDF_Dictionary* pPageDict, + CFX_ArrayTemplate& pageList); std::unique_ptr m_pParser; CPDF_Dictionary* m_pRootDict; CPDF_Dictionary* m_pInfoDict; + std::stack> m_pTreeTraversal; + int m_iLastPageTraversed; bool m_bLinearized; int m_iFirstPageNo; uint32_t m_dwFirstPageObjNum; -- cgit v1.2.3