Unify parsing of cross refs v4

Change-Id: I7e3d45263a0bae61fd86fd4c3710de7fc0b9347d Reviewed-on: https://pdfium-review.googlesource.com/9290 Reviewed-by: Wei Li <weili@chromium.org> Commit-Queue: Art Snake <art-snake@yandex-team.ru>
author: Artem Strygin <art-snake@yandex-team.ru> 2017-08-02 14:27:22 +0300
committer: Chromium commit bot <commit-bot@chromium.org> 2017-08-02 21:20:33 +0000
commit: 17b1c191da26e477c6898a8b06f2ff624f9e4c6b (patch)
tree: 6c383224fc0c8c1d9ef2c8eb9df6fe0352fe4b9a
parent: 17e54528fe0a2203074f4d086677d14c33cf7253 (diff)
download: pdfium-17b1c191da26e477c6898a8b06f2ff624f9e4c6b.tar.xz
2 files changed, 111 insertions, 77 deletions
diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp
index 2ca820eb98..e33cec0165 100644
--- a/core/fpdfapi/parser/cpdf_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_parser.cpp
@@ -440,76 +440,93 @@ bool CPDF_Parser::LoadLinearizedCrossRefV4(FX_FILESIZE pos,
 
   m_pSyntax->SetPos(dwStartPos);
   m_SortedOffset.insert(pos);
+  std::vector<CrossRefObjData> objects;
+  if (!ParseAndAppendCrossRefSubsectionData(0, dwObjCount, &objects))
+    return false;
+  MergeCrossRefObjectsData(objects);
+  return true;
+}
 
-  uint32_t start_objnum = 0;
-  uint32_t count = dwObjCount;
-  FX_FILESIZE SavedPos = m_pSyntax->GetPos();
+bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData(
+    uint32_t start_objnum,
+    uint32_t count,
+    std::vector<CrossRefObjData>* out_objects) {
+  // Each entry shall be exactly 20 bytes.
+  // A sample entry looks like:
+  // "0000000000 00007 f\r\n"
+  static constexpr int32_t kEntryConstSize = 20;
+
+  if (!out_objects) {
+    m_pSyntax->SetPos(m_pSyntax->GetPos() + count * kEntryConstSize);
+    return true;
+  }
+  const size_t start_obj_index = out_objects->size();
+  out_objects->resize(start_obj_index + count);
 
-  const int32_t recordsize = 20;
-  std::vector<char> buf(1024 * recordsize + 1);
-  buf[1024 * recordsize] = '\0';
+  std::vector<char> buf(1024 * kEntryConstSize + 1);
+  buf[1024 * kEntryConstSize] = '\0';
 
   int32_t nBlocks = count / 1024 + 1;
   for (int32_t block = 0; block < nBlocks; block++) {
     int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024;
-    uint32_t dwReadSize = block_size * recordsize;
-    if ((FX_FILESIZE)(dwStartPos + dwReadSize) > m_pSyntax->m_FileLen)
-      return false;
-
     if (!m_pSyntax->ReadBlock(reinterpret_cast<uint8_t*>(buf.data()),
-                              dwReadSize)) {
+                              block_size * kEntryConstSize))
       return false;
-    }
 
     for (int32_t i = 0; i < block_size; i++) {
-      uint32_t objnum = start_objnum + block * 1024 + i;
-      char* pEntry = &buf[i * recordsize];
+      CrossRefObjData& obj_data =
+          (*out_objects)[start_obj_index + block * 1024 + i];
+
+      const uint32_t objnum = start_objnum + block * 1024 + i;
+
+      obj_data.obj_num = objnum;
+
+      ObjectInfo& info = obj_data.info;
+
+      char* pEntry = &buf[i * kEntryConstSize];
       if (pEntry[17] == 'f') {
-        m_ObjectInfo[objnum].pos = 0;
-        m_ObjectInfo[objnum].type = ObjectType::kFree;
+        info.pos = 0;
+        info.type = ObjectType::kFree;
       } else {
-        int32_t offset = FXSYS_atoi(pEntry);
-        if (offset == 0) {
+        const FX_SAFE_FILESIZE offset = FXSYS_atoi64(pEntry);
+        if (!offset.IsValid())
+          return false;
+
+        if (offset.ValueOrDie() == 0) {
           for (int32_t c = 0; c < 10; c++) {
             if (!std::isdigit(pEntry[c]))
               return false;
           }
         }
 
-        m_ObjectInfo[objnum].pos = offset;
-        int32_t version = FXSYS_atoi(pEntry + 11);
-        if (version >= 1)
-          m_bVersionUpdated = true;
+        info.pos = offset.ValueOrDie();
 
-        m_ObjectInfo[objnum].gennum = version;
-        if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen)
-          m_SortedOffset.insert(m_ObjectInfo[objnum].pos);
-
-        m_ObjectInfo[objnum].type = ObjectType::kNotCompressed;
+        // TODO(art-snake): The info.gennum is uint16_t, but version may be
+        // greater than max<uint16_t>. This issue needs to be solved.
+        const int32_t version = FXSYS_atoi(pEntry + 11);
+        info.gennum = version;
+        info.type = ObjectType::kNotCompressed;
       }
     }
   }
-  m_pSyntax->SetPos(SavedPos + count * recordsize);
   return true;
 }
 
-bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos,
-                                 FX_FILESIZE streampos,
-                                 bool bSkip) {
-  m_pSyntax->SetPos(pos);
+bool CPDF_Parser::ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects,
+                                  uint32_t* start_obj_num_at_last_block) {
+  if (out_objects)
+    out_objects->clear();
+
   if (m_pSyntax->GetKeyword() != "xref")
     return false;
-
-  m_SortedOffset.insert(pos);
-  if (streampos)
-    m_SortedOffset.insert(streampos);
-
+  std::vector<CrossRefObjData> result_objects;
   while (1) {
     FX_FILESIZE SavedPos = m_pSyntax->GetPos();
     bool bIsNumber;
     CFX_ByteString word = m_pSyntax->GetNextWord(&bIsNumber);
-    if (word.IsEmpty())
+    if (word.IsEmpty()) {
       return false;
+    }
 
     if (!bIsNumber) {
       m_pSyntax->SetPos(SavedPos);
@@ -519,55 +536,57 @@ bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos,
     uint32_t start_objnum = FXSYS_atoui(word.c_str());
     if (start_objnum >= kMaxObjectNumber)
       return false;
+    if (start_obj_num_at_last_block)
+      *start_obj_num_at_last_block = start_objnum;
 
     uint32_t count = m_pSyntax->GetDirectNum();
     m_pSyntax->ToNextWord();
     SavedPos = m_pSyntax->GetPos();
-    const int32_t recordsize = 20;
-
-    m_dwXrefStartObjNum = start_objnum;
-    if (!bSkip) {
-      std::vector<char> buf(1024 * recordsize + 1);
-      buf[1024 * recordsize] = '\0';
-
-      int32_t nBlocks = count / 1024 + 1;
-      for (int32_t block = 0; block < nBlocks; block++) {
-        int32_t block_size = block == nBlocks - 1 ? count % 1024 : 1024;
-        m_pSyntax->ReadBlock(reinterpret_cast<uint8_t*>(buf.data()),
-                             block_size * recordsize);
-
-        for (int32_t i = 0; i < block_size; i++) {
-          uint32_t objnum = start_objnum + block * 1024 + i;
-          char* pEntry = &buf[i * recordsize];
-          if (pEntry[17] == 'f') {
-            m_ObjectInfo[objnum].pos = 0;
-            m_ObjectInfo[objnum].type = ObjectType::kFree;
-          } else {
-            FX_FILESIZE offset = (FX_FILESIZE)FXSYS_atoi64(pEntry);
-            if (offset == 0) {
-              for (int32_t c = 0; c < 10; c++) {
-                if (!std::isdigit(pEntry[c]))
-                  return false;
-              }
-            }
 
-            m_ObjectInfo[objnum].pos = offset;
-            int32_t version = FXSYS_atoi(pEntry + 11);
-            if (version >= 1)
-              m_bVersionUpdated = true;
+    if (!ParseAndAppendCrossRefSubsectionData(
+            start_objnum, count, out_objects ? &result_objects : nullptr)) {
+      return false;
+    }
+  }
+  if (out_objects)
+    *out_objects = std::move(result_objects);
+  return true;
+}
 
-            m_ObjectInfo[objnum].gennum = version;
-            if (m_ObjectInfo[objnum].pos < m_pSyntax->m_FileLen)
-              m_SortedOffset.insert(m_ObjectInfo[objnum].pos);
+bool CPDF_Parser::LoadCrossRefV4(FX_FILESIZE pos,
+                                 FX_FILESIZE streampos,
+                                 bool bSkip) {
+  m_pSyntax->SetPos(pos);
+  if (m_pSyntax->GetKeyword() != "xref")
+    return false;
 
-            m_ObjectInfo[objnum].type = ObjectType::kNotCompressed;
-          }
-        }
+  m_SortedOffset.insert(pos);
+  if (streampos)
+    m_SortedOffset.insert(streampos);
+
+  m_pSyntax->SetPos(pos);
+  std::vector<CrossRefObjData> objects;
+  if (!ParseCrossRefV4(bSkip ? nullptr : &objects, &m_dwXrefStartObjNum))
+    return false;
+
+  MergeCrossRefObjectsData(objects);
+
+  return !streampos || LoadCrossRefV5(&streampos, false);
+}
+
+void CPDF_Parser::MergeCrossRefObjectsData(
+    const std::vector<CrossRefObjData>& objects) {
+  for (const auto& obj : objects) {
+    m_ObjectInfo[obj.obj_num] = obj.info;
+    if (obj.info.type != ObjectType::kFree) {
+      if (obj.info.gennum > 0)
+        m_bVersionUpdated = true;
+      if (obj.info.type == ObjectType::kNotCompressed &&
+          obj.info.pos < m_pSyntax->m_FileLen) {
+        m_SortedOffset.insert(obj.info.pos);
       }
     }
-    m_pSyntax->SetPos(SavedPos + count * recordsize);
   }
-  return !streampos || LoadCrossRefV5(&streampos, false);
 }
 
 bool CPDF_Parser::LoadAllCrossRefV5(FX_FILESIZE xrefpos) {
diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h
index ece1e6a2d4..759d042360 100644
--- a/core/fpdfapi/parser/cpdf_parser.h
+++ b/core/fpdfapi/parser/cpdf_parser.h
@@ -148,6 +148,11 @@ class CPDF_Parser {
     kEndObj
   };
 
+  struct CrossRefObjData {
+    uint32_t obj_num = 0;
+    ObjectInfo info;
+  };
+
   CPDF_Object* ParseDirect(CPDF_Object* pObj);
   bool LoadAllCrossRefV4(FX_FILESIZE pos);
   bool LoadAllCrossRefV5(FX_FILESIZE pos);
@@ -169,6 +174,16 @@ class CPDF_Parser {
   // the objects.
   bool VerifyCrossRefV4();
 
+  // If out_objects is null, the parser position will be moved to the end of
+  // the subsection without additional validation.
+  bool ParseAndAppendCrossRefSubsectionData(
+      uint32_t start_objnum,
+      uint32_t count,
+      std::vector<CrossRefObjData>* out_objects);
+  bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects,
+                       uint32_t* start_obj_num_at_last_block);
+  void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
+
   CFX_UnownedPtr<CPDF_Document> m_pDocument;
   bool m_bHasParsed;
   bool m_bXRefStream;
author	Artem Strygin <art-snake@yandex-team.ru>	2017-08-02 14:27:22 +0300
committer	Chromium commit bot <commit-bot@chromium.org>	2017-08-02 21:20:33 +0000
commit	17b1c191da26e477c6898a8b06f2ff624f9e4c6b (patch)
tree	6c383224fc0c8c1d9ef2c8eb9df6fe0352fe4b9a
parent	17e54528fe0a2203074f4d086677d14c33cf7253 (diff)
download	pdfium-17b1c191da26e477c6898a8b06f2ff624f9e4c6b.tar.xz