Fix hint tables parsing.

Sample PDF: https://yadi.sk/d/oWLtAEfy3YbEb3 For offsets equal to the hint stream offset, add the hint stream length to determine the actual offset, because linearization inserts the hint stream at the original location of the object. Also, the number of bits needed to represent the numerator of the fractional position for each shared object reference may be zero if each shared group contains only one object, with the object number incremented by 1. Change-Id: I4754d603f388354821e8d0cac97ad99a7578fe4b Reviewed-on: https://pdfium-review.googlesource.com/36610 Commit-Queue: Art Snake <art-snake@yandex-team.ru> Reviewed-by: Lei Zhang <thestig@chromium.org>
author: Artem Strygin <art-snake@yandex-team.ru> 2018-07-25 02:47:25 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2018-07-25 02:47:25 +0000
commit: 84d3394d88c42b798eedc938e6295ad1bf28ac66 (patch)
tree: aa8eba2a6b9bc892fe5d676d55a29f90f596583b
parent: 70ddc1ca22ad44a77006491b604a75f6514a4aa8 (diff)
download: pdfium-84d3394d88c42b798eedc938e6295ad1bf28ac66.tar.xz
4 files changed, 76 insertions, 14 deletions
diff --git a/core/fpdfapi/parser/cpdf_hint_tables.cpp b/core/fpdfapi/parser/cpdf_hint_tables.cpp
index 71a6d3688e..04e673bc97 100644
--- a/core/fpdfapi/parser/cpdf_hint_tables.cpp
+++ b/core/fpdfapi/parser/cpdf_hint_tables.cpp
@@ -117,7 +117,7 @@ bool CPDF_HintTables::ReadPageHintTable(CFX_BitStream* hStream) {
   // shared object referenced from a page, there is an indication of
   // where in the page's content stream the object is first referenced.
   const uint32_t dwSharedNumeratorBits = hStream->GetBits(16);
-  if (!IsValidPageOffsetHintTableBitCount(dwSharedNumeratorBits))
+  if (dwSharedNumeratorBits > 32)
     return false;
 
   // Item 13: Skip Item 13 which has 16 bits.
@@ -193,15 +193,17 @@ bool CPDF_HintTables::ReadPageHintTable(CFX_BitStream* hStream) {
   }
   hStream->ByteAlign();
 
-  for (uint32_t i = 0; i < nPages; i++) {
-    FX_SAFE_UINT32 safeSize = dwNSharedObjsArray[i];
-    safeSize *= dwSharedNumeratorBits;
-    if (!CanReadFromBitStream(hStream, safeSize))
-      return false;
+  if (dwSharedNumeratorBits) {
+    for (uint32_t i = 0; i < nPages; i++) {
+      FX_SAFE_UINT32 safeSize = dwNSharedObjsArray[i];
+      safeSize *= dwSharedNumeratorBits;
+      if (!CanReadFromBitStream(hStream, safeSize))
+        return false;
 
-    hStream->SkipBits(safeSize.ValueOrDie());
+      hStream->SkipBits(safeSize.ValueOrDie());
+    }
+    hStream->ByteAlign();
   }
-  hStream->ByteAlign();
 
   FX_SAFE_UINT32 safeTotalPageLen = nPages;
   safeTotalPageLen *= dwDeltaPageLenBits;
@@ -403,7 +405,11 @@ FX_FILESIZE CPDF_HintTables::HintsOffsetToFileOffset(
   // offset shall have the hint stream length added to it to determine the
   // actual offset relative to the beginning of the file.
   // See specification PDF 32000-1:2008 Annex F.4 (Hint tables).
-  if (file_offset.ValueOrDie() > m_pLinearized->GetHintStart())
+  // Note: The PDF spec does not mention this, but positions equal to the hint
+  // stream offset also need to have the hint stream length added to it. e.g.
+  // There exists linearized PDFs generated by Adobe software that have this
+  // property.
+  if (file_offset.ValueOrDie() >= m_pLinearized->GetHintStart())
     file_offset += m_pLinearized->GetHintLength();
 
   return file_offset.ValueOrDefault(0);
diff --git a/core/fpdfapi/parser/cpdf_hint_tables.h b/core/fpdfapi/parser/cpdf_hint_tables.h
index a161dc68f0..5b978f99b2 100644
--- a/core/fpdfapi/parser/cpdf_hint_tables.h
+++ b/core/fpdfapi/parser/cpdf_hint_tables.h
@@ -83,6 +83,8 @@ class CPDF_HintTables {
     return m_SharedObjGroupInfos;
   }
 
+  FX_FILESIZE GetFirstPageObjOffset() const { return m_szFirstPageObjOffset; }
+
  protected:
   bool ReadPageHintTable(CFX_BitStream* hStream);
   bool ReadSharedObjHintTable(CFX_BitStream* hStream, uint32_t offset);
diff --git a/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp b/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp
index af0e9ff745..8a7331d29b 100644
--- a/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp
+++ b/core/fpdfapi/parser/cpdf_hint_tables_unittest.cpp
@@ -10,9 +10,14 @@
 
 #include "core/fpdfapi/cpdf_modulemgr.h"
 #include "core/fpdfapi/parser/cpdf_data_avail.h"
+#include "core/fpdfapi/parser/cpdf_dictionary.h"
+#include "core/fpdfapi/parser/cpdf_linearized_header.h"
 #include "core/fpdfapi/parser/cpdf_object.h"
+#include "core/fpdfapi/parser/cpdf_read_validator.h"
+#include "core/fpdfapi/parser/cpdf_stream.h"
 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
 #include "core/fxcrt/fx_stream.h"
+#include "testing/fx_string_testhelpers.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "testing/utils/path_service.h"
@@ -20,16 +25,42 @@
 
 namespace {
 
-std::unique_ptr<CPDF_DataAvail> MakeDataAvailFromFile(
+RetainPtr<CPDF_ReadValidator> MakeValidatorFromFile(
     const std::string& file_name) {
   std::string file_path;
-  if (!PathService::GetTestFilePath(file_name, &file_path))
-    return nullptr;
+  PathService::GetTestFilePath(file_name, &file_path);
+  ASSERT(!file_path.empty());
+  return pdfium::MakeRetain<CPDF_ReadValidator>(
+      IFX_SeekableReadStream::CreateFromFilename(file_path.c_str()), nullptr);
+}
+
+std::unique_ptr<CPDF_DataAvail> MakeDataAvailFromFile(
+    const std::string& file_name) {
   return pdfium::MakeUnique<CPDF_DataAvail>(
-      nullptr, IFX_SeekableReadStream::CreateFromFilename(file_path.c_str()),
-      true);
+      nullptr, MakeValidatorFromFile(file_name), true);
 }
 
+class TestLinearizedHeader : public CPDF_LinearizedHeader {
+ public:
+  TestLinearizedHeader(const CPDF_Dictionary* pDict,
+                       FX_FILESIZE szLastXRefOffset)
+      : CPDF_LinearizedHeader(pDict, szLastXRefOffset) {}
+
+  static std::unique_ptr<CPDF_LinearizedHeader> MakeHeader(
+      const std::string& inline_data) {
+    CPDF_SyntaxParser parser;
+    parser.InitParser(
+        pdfium::MakeRetain<CFX_BufferSeekableReadStream>(
+            reinterpret_cast<const unsigned char*>(inline_data.data()),
+            inline_data.size()),
+        0);
+    std::unique_ptr<CPDF_Dictionary> dict =
+        ToDictionary(parser.GetObjectBody(nullptr));
+    ASSERT(dict);
+    return pdfium::MakeUnique<TestLinearizedHeader>(dict.get(), 0);
+  }
+};
+
 }  // namespace
 
 class CPDF_HintTablesTest : public testing::Test {
@@ -119,3 +150,26 @@ TEST_F(CPDF_HintTablesTest, PageAndGroupInfos) {
   EXPECT_EQ(10939, hint_tables->SharedGroupInfos()[5].m_szOffset);
   EXPECT_EQ(544u, hint_tables->SharedGroupInfos()[5].m_dwLength);
 }
+
+TEST_F(CPDF_HintTablesTest, FirstPageOffset) {
+  // Test that valid hint table is loaded, and have correct offset of first page
+  // object.
+  const auto linearized_header = TestLinearizedHeader::MakeHeader(
+      "<< /Linearized 1 /L 19326762 /H [ 123730 3816 ] /O 5932 /E 639518 /N "
+      "102 /T 19220281 >>");
+  ASSERT_TRUE(linearized_header);
+  // This hint table is extracted from linearized file, generated by qpdf tool.
+  RetainPtr<CPDF_ReadValidator> validator =
+      MakeValidatorFromFile("hint_table_102p.bin");
+  CPDF_SyntaxParser parser;
+  parser.InitParserWithValidator(validator, 0);
+  std::unique_ptr<CPDF_Stream> stream = ToStream(parser.GetObjectBody(nullptr));
+  ASSERT_TRUE(stream);
+  auto hint_tables = pdfium::MakeUnique<CPDF_HintTables>(
+      validator.Get(), linearized_header.get());
+  // Check that hint table will load.
+  ASSERT_TRUE(hint_tables->LoadHintStream(stream.get()));
+  // Check that hint table have correct first page offset.
+  // 127546 is predefined real value from original file.
+  EXPECT_EQ(127546, hint_tables->GetFirstPageObjOffset());
+}
diff --git a/testing/resources/hint_table_102p.bin b/testing/resources/hint_table_102p.bin
new file mode 100644
index 0000000000..4008b0daca
--- /dev/null
+++ b/testing/resources/hint_table_102p.bin
author	Artem Strygin <art-snake@yandex-team.ru>	2018-07-25 02:47:25 +0000
committer	Chromium commit bot <commit-bot@chromium.org>	2018-07-25 02:47:25 +0000
commit	84d3394d88c42b798eedc938e6295ad1bf28ac66 (patch)
tree	aa8eba2a6b9bc892fe5d676d55a29f90f596583b
parent	70ddc1ca22ad44a77006491b604a75f6514a4aa8 (diff)
download	pdfium-84d3394d88c42b798eedc938e6295ad1bf28ac66.tar.xz