Unify parsing of cross ref table v4.

We can use the 'Prev' value of the first-page cross-reference table trailer to load the main cross-reference table, instead of the 'T' value of the Linearized header (the offset of the first entry in the main cross-reference table). This is a better solution because it allows us to check the entry count in the main cross-reference table and to unify loading of the main cross-reference table with the loading methods used for non-linearized documents. See PDF specification: http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf#678 (page 678, Example 3/part 3) Change-Id: I59dcf3c73a0fb561221ded78e827e40535dbd717 Reviewed-on: https://pdfium-review.googlesource.com/13810 Commit-Queue: Art Snake <art-snake@yandex-team.ru> Reviewed-by: dsinclair <dsinclair@chromium.org>
author: Artem Strygin <art-snake@yandex-team.ru> 2017-10-02 22:08:44 +0300
committer: Chromium commit bot <commit-bot@chromium.org> 2017-10-04 15:41:16 +0000
commit: a5fc8975c865dc3cc90de8ff46ca13fb46c13391 (patch)
tree: 36b38e781140fc31eeec8a55d85299911117ac2b
parent: 4db6e37b18648dfe2c94b672276c7bf6554fd9d4 (diff)
download: pdfium-a5fc8975c865dc3cc90de8ff46ca13fb46c13391.tar.xz
4 files changed, 43 insertions, 48 deletions
diff --git a/core/fpdfapi/parser/cpdf_data_avail.cpp b/core/fpdfapi/parser/cpdf_data_avail.cpp
index 38e857f22f..91fc8c1087 100644
--- a/core/fpdfapi/parser/cpdf_data_avail.cpp
+++ b/core/fpdfapi/parser/cpdf_data_avail.cpp
@@ -1171,13 +1171,22 @@ CPDF_DataAvail::DocAvailStatus CPDF_DataAvail::CheckLinearizedData() {
     return DataError;
 
   if (!m_bMainXRefLoadTried) {
+    ASSERT(m_pDocument->GetParser()->GetTrailer());
+    const FX_SAFE_FILESIZE main_xref_offset =
+        m_pDocument->GetParser()->GetTrailer()->GetIntegerFor("Prev");
+    if (!main_xref_offset.IsValid())
+      return DataError;
+
+    if (main_xref_offset.ValueOrDie() == 0)
+      return DataAvailable;
+
     FX_SAFE_SIZE_T data_size = m_dwFileLen;
-    data_size -= m_pLinearized->GetLastXRefOffset();
+    data_size -= main_xref_offset.ValueOrDie();
     if (!data_size.IsValid())
       return DataError;
 
     if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
-            m_pLinearized->GetLastXRefOffset(), data_size.ValueOrDie()))
+            main_xref_offset.ValueOrDie(), data_size.ValueOrDie()))
       return DataNotAvailable;
 
     CPDF_Parser::Error eRet =
diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp
index c443c8baf2..b6231a7a6a 100644
--- a/core/fpdfapi/parser/cpdf_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_parser.cpp
@@ -102,8 +102,7 @@ CPDF_Parser::CPDF_Parser()
       m_bXRefStream(false),
       m_FileVersion(0),
       m_pEncryptDict(nullptr),
-      m_TrailerData(pdfium::MakeUnique<TrailerData>()),
-      m_dwLinearizedFirstPageXRefStartObjNum(0) {}
+      m_TrailerData(pdfium::MakeUnique<TrailerData>()) {}
 
 CPDF_Parser::~CPDF_Parser() {
   ReleaseEncryptHandler();
@@ -421,9 +420,8 @@ bool CPDF_Parser::LoadAllCrossRefV4(FX_FILESIZE xrefpos) {
   return true;
 }
 
-bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos,
-                                              uint32_t dwObjCount) {
-  if (!LoadLinearizedCrossRefV4(xrefpos, dwObjCount))
+bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos) {
+  if (!LoadCrossRefV4(xrefpos, false))
     return false;
 
   std::unique_ptr<CPDF_Dictionary> trailer = LoadTrailerV4();
@@ -477,18 +475,6 @@ bool CPDF_Parser::LoadLinearizedAllCrossRefV4(FX_FILESIZE xrefpos,
   return true;
 }
 
-bool CPDF_Parser::LoadLinearizedCrossRefV4(FX_FILESIZE pos,
-                                           uint32_t dwObjCount) {
-  FX_FILESIZE dwStartPos = pos - m_pSyntax->m_HeaderOffset;
-
-  m_pSyntax->SetPos(dwStartPos);
-  std::vector<CrossRefObjData> objects;
-  if (!ParseAndAppendCrossRefSubsectionData(0, dwObjCount, &objects))
-    return false;
-  MergeCrossRefObjectsData(objects);
-  return true;
-}
-
 bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData(
     uint32_t start_objnum,
     uint32_t count,
@@ -1318,9 +1304,10 @@ bool CPDF_Parser::ParseLinearizedHeader() {
   if (!m_pLinearized)
     return false;
 
-  m_LastXRefOffset = m_pLinearized->GetLastXRefOffset();
   // Move parser onto first page xref table start.
   m_pSyntax->GetNextWord(nullptr);
+
+  m_LastXRefOffset = m_pSyntax->GetPos();
   return true;
 }
 
@@ -1340,7 +1327,7 @@ CPDF_Parser::Error CPDF_Parser::StartLinearizedParse(
   m_bHasParsed = true;
   m_pDocument = pDocument;
 
-  FX_FILESIZE dwFirstXRefOffset = m_pSyntax->GetPos();
+  FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset;
   bool bXRefRebuilt = false;
   bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, false);
   if (!bLoadV4 && !LoadCrossRefV5(&dwFirstXRefOffset, true)) {
@@ -1350,8 +1337,6 @@ CPDF_Parser::Error CPDF_Parser::StartLinearizedParse(
     bXRefRebuilt = true;
     m_LastXRefOffset = 0;
   }
-  m_dwLinearizedFirstPageXRefStartObjNum =
-      m_ObjectInfo.empty() ? 0 : m_ObjectInfo.begin()->first;
   if (bLoadV4) {
     std::unique_ptr<CPDF_Dictionary> trailer = LoadTrailerV4();
     if (!trailer)
@@ -1423,33 +1408,20 @@ bool CPDF_Parser::LoadLinearizedAllCrossRefV5(FX_FILESIZE xrefpos) {
 }
 
 CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() {
+  const FX_SAFE_FILESIZE main_xref_offset = GetTrailer()->GetIntegerFor("Prev");
+  if (!main_xref_offset.IsValid())
+    return FORMAT_ERROR;
+
+  if (main_xref_offset.ValueOrDie() == 0)
+    return SUCCESS;
+
   const AutoRestorer<uint32_t> save_metadata_objnum(&m_MetadataObjnum);
   m_MetadataObjnum = 0;
-  m_pSyntax->SetPos(m_LastXRefOffset - m_pSyntax->m_HeaderOffset);
-
-  uint8_t ch = 0;
-  uint32_t dwCount = 0;
-  m_pSyntax->GetNextChar(ch);
-  while (PDFCharIsWhitespace(ch)) {
-    ++dwCount;
-    if (m_pSyntax->m_FileLen <=
-        (FX_FILESIZE)(m_pSyntax->GetPos() + m_pSyntax->m_HeaderOffset)) {
-      break;
-    }
-    if (!m_pSyntax->GetNextChar(ch))
-      return HANDLER_ERROR;
-  }
-  m_LastXRefOffset += dwCount;
   m_ObjectStreamMap.clear();
   m_ObjCache.clear();
 
-  // In linearized document, the main cross ref always should start from 0
-  // objnum.
-  // And should have count equals to first obj number of first page cross ref
-  // table.
-  if (!LoadLinearizedAllCrossRefV4(m_LastXRefOffset,
-                                   m_dwLinearizedFirstPageXRefStartObjNum) &&
-      !LoadLinearizedAllCrossRefV5(m_LastXRefOffset)) {
+  if (!LoadLinearizedAllCrossRefV4(main_xref_offset.ValueOrDie()) &&
+      !LoadLinearizedAllCrossRefV5(main_xref_offset.ValueOrDie())) {
     m_LastXRefOffset = 0;
     return FORMAT_ERROR;
   }
diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h
index 7c6a75d8c5..96bd6f43a9 100644
--- a/core/fpdfapi/parser/cpdf_parser.h
+++ b/core/fpdfapi/parser/cpdf_parser.h
@@ -162,8 +162,7 @@ class CPDF_Parser {
   std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
   Error SetEncryptHandler();
   void ReleaseEncryptHandler();
-  bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
-  bool LoadLinearizedCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
+  bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos);
   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
   Error LoadLinearizedMainXRefTable();
   RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number);
@@ -207,7 +206,6 @@ class CPDF_Parser {
   ByteString m_Password;
   std::unique_ptr<TrailerData> m_TrailerData;
   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
-  uint32_t m_dwLinearizedFirstPageXRefStartObjNum;
 
   // A map of object numbers to indirect streams.
   std::map<uint32_t, RetainPtr<CPDF_StreamAcc>> m_ObjectStreamMap;
diff --git a/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp b/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp
index 99bc2c2d42..6aa3e2785a 100644
--- a/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp
+++ b/core/fpdfapi/parser/cpdf_parser_embeddertest.cpp
@@ -57,3 +57,19 @@ TEST_F(CPDFParserEmbeddertest, Bug_602650) {
 TEST_F(CPDFParserEmbeddertest, Bug_757705) {
   EXPECT_TRUE(OpenDocument("bug_757705.pdf"));
 }
+
+TEST_F(CPDFParserEmbeddertest, LoadMainCrossRefTable) {
+  EXPECT_TRUE(OpenDocument("feature_linearized_loading.pdf", nullptr, true));
+  // To check that the main cross-reference table is loaded correctly, it is
+  // enough to check that the second page loads correctly, because the main
+  // table contains the cross-references for the second page.
+  EXPECT_EQ(2, GetPageCount());
+  FPDF_PAGE page = LoadPage(1);
+  EXPECT_NE(nullptr, page);
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+  EXPECT_NE(nullptr, text_page);
+  // The page should not be blank.
+  EXPECT_LT(0, FPDFText_CountChars(text_page));
+  FPDFText_ClosePage(text_page);
+  UnloadPage(page);
+}
author	Artem Strygin <art-snake@yandex-team.ru>	2017-10-02 22:08:44 +0300
committer	Chromium commit bot <commit-bot@chromium.org>	2017-10-04 15:41:16 +0000
commit	a5fc8975c865dc3cc90de8ff46ca13fb46c13391 (patch)
tree	36b38e781140fc31eeec8a55d85299911117ac2b
parent	4db6e37b18648dfe2c94b672276c7bf6554fd9d4 (diff)
download	pdfium-a5fc8975c865dc3cc90de8ff46ca13fb46c13391.tar.xz