From 900f421e29daf2ab62de3ae8dc821f031bc7bdb3 Mon Sep 17 00:00:00 2001 From: npm Date: Fri, 28 Oct 2016 14:30:44 -0700 Subject: Revert of Traverse PDF page tree only once in CPDF_Document Try 2 (patchset #3 id:40001 of https://codereview.chromium.org/2442403002/ ) Reason for revert: Not quite right yet. Original issue's description: > Traverse PDF page tree only once in CPDF_Document > > Try 2: main fix was recursively popping elements from the stack. Since > the Traverse method can be called on non-root nodes from GetPage(), we > have to make sure to properly update the parents. > > Try 1 at https://codereview.chromium.org/2414423002/ > > In our current implementation of CPDF_Document::GetPage, we traverse > the PDF page tree until we find the index we are looking for. This is > slow when we do calls GetPage(0), GetPage(1), ... since in this case > the page tree will be traversed n times if there are n pages. This CL > makes sure the page tree is only traversed once. > > Time to load the PDF from the bug below in chrome official build: > Before this CL: around 1 minute 25 seconds > After this CL: around 4 seconds > > BUG=chromium:638513 > > Committed: https://pdfium.googlesource.com/pdfium/+/d3a2009d75eac3cda442f545ef0865afae7b35cf TBR=tsepez@chromium.org,weili@chromium.org,thestig@chromium.org # Not skipping CQ checks because original CL landed more than 1 days ago. BUG=chromium:638513 Review-Url: https://codereview.chromium.org/2461063003 --- core/fpdfapi/parser/cpdf_document.cpp | 256 +++++++++++++++------------------- core/fpdfapi/parser/cpdf_document.h | 21 +-- 2 files changed, 115 insertions(+), 162 deletions(-) diff --git a/core/fpdfapi/parser/cpdf_document.cpp b/core/fpdfapi/parser/cpdf_document.cpp index f548a26038..c5f64a790c 100644 --- a/core/fpdfapi/parser/cpdf_document.cpp +++ b/core/fpdfapi/parser/cpdf_document.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include "core/fpdfapi/cpdf_modulemgr.h" @@ -241,6 +240,83 @@ void InsertWidthArray1(CFX_Font* pFont, InsertWidthArrayImpl(widths, size, pWidthArray); } +int InsertDeletePDFPage(CPDF_Document* pDoc, + CPDF_Dictionary* pPages, + int nPagesToGo, + CPDF_Dictionary* pPage, + FX_BOOL bInsert, + std::set* pVisited) { + CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); + if (!pKidList) + return -1; + + for (size_t i = 0; i < pKidList->GetCount(); i++) { + CPDF_Dictionary* pKid = pKidList->GetDictAt(i); + if (pKid->GetStringFor("Type") == "Page") { + if (nPagesToGo == 0) { + if (bInsert) { + pKidList->InsertAt(i, new CPDF_Reference(pDoc, pPage->GetObjNum())); + pPage->SetReferenceFor("Parent", pDoc, pPages->GetObjNum()); + } else { + pKidList->RemoveAt(i); + } + pPages->SetIntegerFor( + "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); + return 1; + } + nPagesToGo--; + } else { + int nPages = pKid->GetIntegerFor("Count"); + if (nPagesToGo < nPages) { + if (pdfium::ContainsKey(*pVisited, pKid)) + return -1; + + pdfium::ScopedSetInsertion insertion(pVisited, pKid); + if (InsertDeletePDFPage(pDoc, pKid, nPagesToGo, pPage, bInsert, + pVisited) < 0) { + return -1; + } + pPages->SetIntegerFor( + "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); + return 1; + } + nPagesToGo -= nPages; + } + } + return 0; +} + +int InsertNewPage(CPDF_Document* pDoc, + int iPage, + CPDF_Dictionary* pPageDict, + CFX_ArrayTemplate& pageList) { + CPDF_Dictionary* pRoot = pDoc->GetRoot(); + CPDF_Dictionary* pPages = pRoot ? pRoot->GetDictFor("Pages") : nullptr; + if (!pPages) + return -1; + + int nPages = pDoc->GetPageCount(); + if (iPage < 0 || iPage > nPages) + return -1; + + if (iPage == nPages) { + CPDF_Array* pPagesList = pPages->GetArrayFor("Kids"); + if (!pPagesList) { + pPagesList = new CPDF_Array; + pPages->SetFor("Kids", pPagesList); + } + pPagesList->Add(new CPDF_Reference(pDoc, pPageDict->GetObjNum())); + pPages->SetIntegerFor("Count", nPages + 1); + pPageDict->SetReferenceFor("Parent", pDoc, pPages->GetObjNum()); + } else { + std::set stack = {pPages}; + if (InsertDeletePDFPage(pDoc, pPages, iPage, pPageDict, TRUE, &stack) < 0) + return -1; + } + pageList.InsertAt(iPage, pPageDict->GetObjNum()); + return iPage; +} + int CountPages(CPDF_Dictionary* pPages, std::set* visited_pages) { int count = pPages->GetIntegerFor("Count"); @@ -337,7 +413,6 @@ CPDF_Document::CPDF_Document(std::unique_ptr pParser) m_pParser(std::move(pParser)), m_pRootDict(nullptr), m_pInfoDict(nullptr), - m_iLastPageTraversed(-1), m_bLinearized(false), m_iFirstPageNo(0), m_dwFirstPageObjNum(0), @@ -402,71 +477,40 @@ void CPDF_Document::LoadPages() { m_PageList.SetSize(RetrievePageCount()); } -void CPDF_Document::PopAndPropagate() { - m_pTreeTraversal.pop(); - if (m_pTreeTraversal.empty()) - return; - std::pair* top = &m_pTreeTraversal.top(); - top->second++; - if (top->second == - static_cast(top->first->GetArrayFor("Kids")->GetCount() - 1)) { - PopAndPropagate(); - } -} - -CPDF_Dictionary* CPDF_Document::TraversePDFPages(int iPage, int nPagesToGo) { - std::pair* lastProc = &m_pTreeTraversal.top(); - CPDF_Dictionary* pPages = lastProc->first; +CPDF_Dictionary* CPDF_Document::FindPDFPage(CPDF_Dictionary* pPages, + int iPage, + int nPagesToGo, + int level) { CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); - if (!pKidList) { - PopAndPropagate(); - if (nPagesToGo != 1) - return nullptr; - m_PageList.SetAt(iPage, pPages->GetObjNum()); - return pPages; - } + if (!pKidList) + return nPagesToGo == 0 ? pPages : nullptr; - if (m_pTreeTraversal.size() >= FX_MAX_PAGE_LEVEL) { - PopAndPropagate(); + if (level >= FX_MAX_PAGE_LEVEL) return nullptr; - } - CPDF_Dictionary* page = nullptr; - for (size_t i = lastProc->second + 1; i < pKidList->GetCount(); i++) { + for (size_t i = 0; i < pKidList->GetCount(); i++) { CPDF_Dictionary* pKid = pKidList->GetDictAt(i); if (!pKid) { nPagesToGo--; - lastProc->second++; continue; } - if (pKid == pPages) { - lastProc->second++; + if (pKid == pPages) continue; - } if (!pKid->KeyExist("Kids")) { - m_PageList.SetAt(iPage - nPagesToGo + 1, pKid->GetObjNum()); + if (nPagesToGo == 0) + return pKid; + + m_PageList.SetAt(iPage - nPagesToGo, pKid->GetObjNum()); nPagesToGo--; - lastProc->second++; - if (nPagesToGo == 0) { - page = pKid; - break; - } } else { int nPages = pKid->GetIntegerFor("Count"); - m_pTreeTraversal.push(std::make_pair(pKid, -1)); - CPDF_Dictionary* pageKid = TraversePDFPages(iPage, nPagesToGo); - if (nPagesToGo <= nPages) { - page = pageKid; - break; - } + if (nPagesToGo < nPages) + return FindPDFPage(pKid, iPage, nPagesToGo, level + 1); + nPagesToGo -= nPages; } } - if (!m_pTreeTraversal.empty() && lastProc == &m_pTreeTraversal.top() && - lastProc->second == static_cast(pKidList->GetCount() - 1)) { - PopAndPropagate(); - } - return page; + return nullptr; } CPDF_Dictionary* CPDF_Document::GetPagesDict() const { @@ -490,20 +534,21 @@ CPDF_Dictionary* CPDF_Document::GetPage(int iPage) { } int objnum = m_PageList.GetAt(iPage); - if (!objnum) { - CPDF_Dictionary* pPages = GetPagesDict(); - if (!pPages) - return nullptr; - if (m_pTreeTraversal.empty()) - m_pTreeTraversal.push(std::make_pair(pPages, -1)); - CPDF_Dictionary* page = - TraversePDFPages(iPage, iPage - m_iLastPageTraversed); - m_iLastPageTraversed = iPage; - return page; + if (objnum) { + if (CPDF_Dictionary* pDict = ToDictionary(GetOrParseIndirectObject(objnum))) + return pDict; } - if (CPDF_Dictionary* pDict = ToDictionary(GetOrParseIndirectObject(objnum))) - return pDict; - return nullptr; + + CPDF_Dictionary* pPages = GetPagesDict(); + if (!pPages) + return nullptr; + + CPDF_Dictionary* pPage = FindPDFPage(pPages, iPage, iPage, 0); + if (!pPage) + return nullptr; + + m_PageList.SetAt(iPage, pPage->GetObjNum()); + return pPage; } void CPDF_Document::SetPageObjNum(int iPage, uint32_t objNum) { @@ -665,94 +710,13 @@ CPDF_Dictionary* CPDF_Document::CreateNewPage(int iPage) { CPDF_Dictionary* pDict = new CPDF_Dictionary(m_pByteStringPool); pDict->SetNameFor("Type", "Page"); uint32_t dwObjNum = AddIndirectObject(pDict); - if (InsertNewPage(iPage, pDict, m_PageList) < 0) { + if (InsertNewPage(this, iPage, pDict, m_PageList) < 0) { ReleaseIndirectObject(dwObjNum); return nullptr; } return pDict; } -int CPDF_Document::InsertDeletePDFPage(CPDF_Dictionary* pPages, - int nPagesToGo, - CPDF_Dictionary* pPage, - FX_BOOL bInsert, - std::set* pVisited) { - CPDF_Array* pKidList = pPages->GetArrayFor("Kids"); - if (!pKidList) - return -1; - - for (size_t i = 0; i < pKidList->GetCount(); i++) { - CPDF_Dictionary* pKid = pKidList->GetDictAt(i); - if (pKid->GetStringFor("Type") == "Page") { - if (nPagesToGo == 0) { - if (bInsert) { - pKidList->InsertAt(i, new CPDF_Reference(this, pPage->GetObjNum())); - pPage->SetReferenceFor("Parent", this, pPages->GetObjNum()); - } else { - pKidList->RemoveAt(i); - } - pPages->SetIntegerFor( - "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); - // Tree will change, so reset tree transversal variables - m_iLastPageTraversed = -1; - m_pTreeTraversal = std::stack>(); - return 1; - } - nPagesToGo--; - } else { - int nPages = pKid->GetIntegerFor("Count"); - if (nPagesToGo < nPages) { - if (pdfium::ContainsKey(*pVisited, pKid)) - return -1; - - pdfium::ScopedSetInsertion insertion(pVisited, pKid); - if (InsertDeletePDFPage(pKid, nPagesToGo, pPage, bInsert, pVisited) < - 0) { - return -1; - } - pPages->SetIntegerFor( - "Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1)); - return 1; - } - nPagesToGo -= nPages; - } - } - return 0; -} - -int CPDF_Document::InsertNewPage(int iPage, - CPDF_Dictionary* pPageDict, - CFX_ArrayTemplate& pageList) { - CPDF_Dictionary* pRoot = GetRoot(); - CPDF_Dictionary* pPages = pRoot ? pRoot->GetDictFor("Pages") : nullptr; - if (!pPages) - return -1; - - int nPages = GetPageCount(); - if (iPage < 0 || iPage > nPages) - return -1; - - if (iPage == nPages) { - CPDF_Array* pPagesList = pPages->GetArrayFor("Kids"); - if (!pPagesList) { - pPagesList = new CPDF_Array; - pPages->SetFor("Kids", pPagesList); - } - pPagesList->Add(new CPDF_Reference(this, pPageDict->GetObjNum())); - pPages->SetIntegerFor("Count", nPages + 1); - pPageDict->SetReferenceFor("Parent", this, pPages->GetObjNum()); - // Reset tree transversal variables - m_iLastPageTraversed = -1; - m_pTreeTraversal = std::stack>(); - } else { - std::set stack = {pPages}; - if (InsertDeletePDFPage(pPages, iPage, pPageDict, TRUE, &stack) < 0) - return -1; - } - pageList.InsertAt(iPage, pPageDict->GetObjNum()); - return iPage; -} - void CPDF_Document::DeletePage(int iPage) { CPDF_Dictionary* pPages = GetPagesDict(); if (!pPages) @@ -763,7 +727,7 @@ void CPDF_Document::DeletePage(int iPage) { return; std::set stack = {pPages}; - if (InsertDeletePDFPage(pPages, iPage, nullptr, FALSE, &stack) < 0) + if (InsertDeletePDFPage(this, pPages, iPage, nullptr, FALSE, &stack) < 0) return; m_PageList.RemoveAt(iPage); diff --git a/core/fpdfapi/parser/cpdf_document.h b/core/fpdfapi/parser/cpdf_document.h index ef9f663c3b..ea7bd328aa 100644 --- a/core/fpdfapi/parser/cpdf_document.h +++ b/core/fpdfapi/parser/cpdf_document.h @@ -9,7 +9,6 @@ #include #include -#include #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h" #include "core/fpdfapi/parser/cpdf_object.h" @@ -106,7 +105,10 @@ class CPDF_Document : public CPDF_IndirectObjectHolder { protected: // Retrieve page count information by getting count value from the tree nodes int RetrievePageCount() const; - CPDF_Dictionary* TraversePDFPages(int iPage, int nPagesToGo); + CPDF_Dictionary* FindPDFPage(CPDF_Dictionary* pPages, + int iPage, + int nPagesToGo, + int level); int FindPageIndex(CPDF_Dictionary* pNode, uint32_t& skip_count, uint32_t objnum, @@ -122,23 +124,10 @@ class CPDF_Document : public CPDF_IndirectObjectHolder { FX_BOOL bVert, CFX_ByteString basefont, std::function Insert); - int InsertDeletePDFPage(CPDF_Dictionary* pPages, - int nPagesToGo, - CPDF_Dictionary* pPage, - FX_BOOL bInsert, - std::set* pVisited); - int InsertNewPage(int iPage, - CPDF_Dictionary* pPageDict, - CFX_ArrayTemplate& pageList); - void PopAndPropagate(); + std::unique_ptr m_pParser; CPDF_Dictionary* m_pRootDict; CPDF_Dictionary* m_pInfoDict; - // Stack of page nodes to know current position in page tree. Int is the index - // of last processed child. - std::stack> m_pTreeTraversal; - // Index of last page (leaf) processed from page tree. - int m_iLastPageTraversed; bool m_bLinearized; int m_iFirstPageNo; uint32_t m_dwFirstPageObjNum; -- cgit v1.2.3