From 548334e57cae1039824d3db97bab5348fbe674e2 Mon Sep 17 00:00:00 2001
From: Jane Liu <janeliulwq@google.com>
Date: Thu, 3 Aug 2017 16:33:40 -0400
Subject: APIs and tests for retrieving raw/decoded data from image objects

Added FPDFImageObj_GetImageDataDecoded() for retrieving the uncompressed
data of an image, and FPDFImageObj_GetImageDataRaw() for retrieving the
raw data of an image.
    * Refactored out DecodeStreamMaybeCopyAndReturnLength(), which is
      used to decode both attachment data and image data.
    * Within DecodeStreamMaybeCopyAndReturnLength(), used a different
      decoder function that handles multiple filters when present. As a
      result, CPDF_StreamParser::DecodeInlineStream(), which was
      previously made a static member, is moved back into the anonymous
      namespace.

Bug=pdfium:677

Change-Id: I22a22c99acaca98ef8c15f88911f2646a2c854d5
Reviewed-on: https://pdfium-review.googlesource.com/9811
Commit-Queue: Jane Liu <janeliulwq@google.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
---
 core/fpdfapi/page/cpdf_streamparser.cpp | 21 +++++++-------
 core/fpdfapi/page/cpdf_streamparser.h   |  9 ------
 fpdfsdk/fpdfattachment.cpp              | 24 +---------------
 fpdfsdk/fpdfedit_embeddertest.cpp       | 51 +++++++++++++++++++++++++++++++++
 fpdfsdk/fpdfeditimg.cpp                 | 42 +++++++++++++++++++++++++++
 fpdfsdk/fpdfview.cpp                    | 44 ++++++++++++++++++++++++++--
 fpdfsdk/fpdfview_c_api_test.c           |  2 ++
 fpdfsdk/fsdk_define.h                   |  5 ++++
 public/fpdf_edit.h                      | 29 +++++++++++++++++++
 9 files changed, 181 insertions(+), 46 deletions(-)

diff --git a/core/fpdfapi/page/cpdf_streamparser.cpp b/core/fpdfapi/page/cpdf_streamparser.cpp
index 964d600b3d..9d7dd1ed55 100644
--- a/core/fpdfapi/page/cpdf_streamparser.cpp
+++ b/core/fpdfapi/page/cpdf_streamparser.cpp
@@ -60,17 +60,14 @@ uint32_t DecodeAllScanlines(std::unique_ptr<CCodec_ScanlineDecoder> pDecoder,
   return pDecoder->GetSrcOffset();
 }
 
-}  // namespace
-
-// Static
-uint32_t CPDF_StreamParser::DecodeInlineStream(const uint8_t* src_buf,
-                                               uint32_t limit,
-                                               int width,
-                                               int height,
-                                               const CFX_ByteString& decoder,
-                                               CPDF_Dictionary* pParam,
-                                               uint8_t** dest_buf,
-                                               uint32_t* dest_size) {
+uint32_t DecodeInlineStream(const uint8_t* src_buf,
+                            uint32_t limit,
+                            int width,
+                            int height,
+                            const CFX_ByteString& decoder,
+                            CPDF_Dictionary* pParam,
+                            uint8_t** dest_buf,
+                            uint32_t* dest_size) {
   if (decoder == "CCITTFaxDecode" || decoder == "CCF") {
     std::unique_ptr<CCodec_ScanlineDecoder> pDecoder =
         FPDFAPI_CreateFaxDecoder(src_buf, limit, width, height, pParam);
@@ -102,6 +99,8 @@ uint32_t CPDF_StreamParser::DecodeInlineStream(const uint8_t* src_buf,
   return 0xFFFFFFFF;
 }
 
+}  // namespace
+
 CPDF_StreamParser::CPDF_StreamParser(const uint8_t* pData, uint32_t dwSize)
     : m_pBuf(pData), m_Size(dwSize), m_Pos(0), m_pPool(nullptr) {}
 
diff --git a/core/fpdfapi/page/cpdf_streamparser.h b/core/fpdfapi/page/cpdf_streamparser.h
index 9f9a8eaf12..fdc418c634 100644
--- a/core/fpdfapi/page/cpdf_streamparser.h
+++ b/core/fpdfapi/page/cpdf_streamparser.h
@@ -21,15 +21,6 @@ class CPDF_StreamParser {
  public:
   enum SyntaxType { EndOfData, Number, Keyword, Name, Others };
 
-  static uint32_t DecodeInlineStream(const uint8_t* src_buf,
-                                     uint32_t limit,
-                                     int width,
-                                     int height,
-                                     const CFX_ByteString& decoder,
-                                     CPDF_Dictionary* pParam,
-                                     uint8_t** dest_buf,
-                                     uint32_t* dest_size);
-
   CPDF_StreamParser(const uint8_t* pData, uint32_t dwSize);
   CPDF_StreamParser(const uint8_t* pData,
                     uint32_t dwSize,
diff --git a/fpdfsdk/fpdfattachment.cpp b/fpdfsdk/fpdfattachment.cpp
index 0cb623f81c..5bdb3bd4a2 100644
--- a/fpdfsdk/fpdfattachment.cpp
+++ b/fpdfsdk/fpdfattachment.cpp
@@ -8,7 +8,6 @@
 #include <utility>
 
 #include "core/fdrm/crypto/fx_crypt.h"
-#include "core/fpdfapi/page/cpdf_streamparser.h"
 #include "core/fpdfapi/parser/cpdf_array.h"
 #include "core/fpdfapi/parser/cpdf_document.h"
 #include "core/fpdfapi/parser/cpdf_name.h"
@@ -273,26 +272,5 @@ FPDFAttachment_GetFile(FPDF_ATTACHMENT attachment,
   if (!pFileStream)
     return 0;
 
-  uint8_t* data = pFileStream->GetRawData();
-  uint32_t len = pFileStream->GetRawSize();
-  CPDF_Dictionary* pFileDict = pFileStream->GetDict();
-  if (!pFileDict || pFileDict->GetStringFor("Filter").IsEmpty()) {
-    if (buffer && buflen >= len)
-      memcpy(buffer, data, len);
-
-    return len;
-  }
-
-  // Decode the stream if a stream filter is specified.
-  uint8_t* decodedData = nullptr;
-  uint32_t decodedLen = 0;
-  CPDF_StreamParser::DecodeInlineStream(
-      data, len, pFileDict->GetIntegerFor("Width"),
-      pFileDict->GetIntegerFor("Height"), pFileDict->GetStringFor("Filter"),
-      pFileDict->GetDictFor("DecodeParms"), &decodedData, &decodedLen);
-  if (buffer && buflen >= decodedLen)
-    memcpy(buffer, decodedData, decodedLen);
-
-  FX_Free(decodedData);
-  return decodedLen;
+  return DecodeStreamMaybeCopyAndReturnLength(pFileStream, buffer, buflen);
 }
diff --git a/fpdfsdk/fpdfedit_embeddertest.cpp b/fpdfsdk/fpdfedit_embeddertest.cpp
index dcaeb945d7..f1bbb87422 100644
--- a/fpdfsdk/fpdfedit_embeddertest.cpp
+++ b/fpdfsdk/fpdfedit_embeddertest.cpp
@@ -5,6 +5,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "core/fpdfapi/font/cpdf_font.h"
 #include "core/fpdfapi/page/cpdf_page.h"
@@ -979,3 +980,53 @@ TEST_F(FPDFEditEmbeddertest, ExtractImageBitmap) {
   FPDFBitmap_Destroy(bitmap);
   UnloadPage(page);
 }
+
+TEST_F(FPDFEditEmbeddertest, GetImageData) {
+  ASSERT_TRUE(OpenDocument("embedded_images.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+  ASSERT_EQ(39, FPDFPage_CountObject(page));
+
+  // Retrieve an image object with flate-encoded data stream.
+  FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, 33);
+  ASSERT_EQ(FPDF_PAGEOBJ_IMAGE, FPDFPageObj_GetType(obj));
+
+  // Check that the raw image data has the correct length and hash value.
+  unsigned long len = FPDFImageObj_GetImageDataRaw(obj, nullptr, 0);
+  std::vector<char> buf(len);
+  EXPECT_EQ(4091u, FPDFImageObj_GetImageDataRaw(obj, buf.data(), len));
+  EXPECT_EQ("f73802327d2e88e890f653961bcda81a",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Check that the decoded image data has the correct length and hash value.
+  len = FPDFImageObj_GetImageDataDecoded(obj, nullptr, 0);
+  buf.clear();
+  buf.resize(len);
+  EXPECT_EQ(28776u, FPDFImageObj_GetImageDataDecoded(obj, buf.data(), len));
+  EXPECT_EQ("cb3637934bb3b95a6e4ae1ea9eb9e56e",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Retrieve an image object with DCTDecode-encoded data stream.
+  obj = FPDFPage_GetObject(page, 37);
+  ASSERT_EQ(FPDF_PAGEOBJ_IMAGE, FPDFPageObj_GetType(obj));
+
+  // Check that the raw image data has the correct length and hash value.
+  len = FPDFImageObj_GetImageDataRaw(obj, nullptr, 0);
+  buf.clear();
+  buf.resize(len);
+  EXPECT_EQ(4370u, FPDFImageObj_GetImageDataRaw(obj, buf.data(), len));
+  EXPECT_EQ("6aae1f3710335023a9e12191be66b64b",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Check that the decoded image data has the correct length and hash value,
+  // which should be the same as those of the raw data, since this image is
+  // encoded by a single DCTDecode filter and decoding is a noop.
+  len = FPDFImageObj_GetImageDataDecoded(obj, nullptr, 0);
+  buf.clear();
+  buf.resize(len);
+  EXPECT_EQ(4370u, FPDFImageObj_GetImageDataDecoded(obj, buf.data(), len));
+  EXPECT_EQ("6aae1f3710335023a9e12191be66b64b",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  UnloadPage(page);
+}
diff --git a/fpdfsdk/fpdfeditimg.cpp b/fpdfsdk/fpdfeditimg.cpp
index bfd12b2441..0d0c54604b 100644
--- a/fpdfsdk/fpdfeditimg.cpp
+++ b/fpdfsdk/fpdfeditimg.cpp
@@ -137,3 +137,45 @@ FPDFImageObj_GetBitmap(FPDF_PAGEOBJECT image_object) {
 
   return pBitmap.Leak();
 }
+
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataDecoded(FPDF_PAGEOBJECT image_object,
+                                 void* buffer,
+                                 unsigned long buflen) {
+  // Report a length of 0 unless |image_object| is an image backed by a stream.
+  CPDF_PageObject* page_obj = CPDFPageObjectFromFPDFPageObject(image_object);
+  if (!page_obj || !page_obj->IsImage())
+    return 0;
+  CFX_RetainPtr<CPDF_Image> image = page_obj->AsImage()->GetImage();
+  if (!image)
+    return 0;
+  CPDF_Stream* stream = image->GetStream();
+  if (!stream)
+    return 0;
+
+  // The shared helper decodes the stream and performs the size-checked copy.
+  return DecodeStreamMaybeCopyAndReturnLength(stream, buffer, buflen);
+}
+
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataRaw(FPDF_PAGEOBJECT image_object,
+                             void* buffer,
+                             unsigned long buflen) {
+  // Report a length of 0 unless |image_object| is an image backed by a stream.
+  CPDF_PageObject* page_obj = CPDFPageObjectFromFPDFPageObject(image_object);
+  if (!page_obj || !page_obj->IsImage())
+    return 0;
+  CFX_RetainPtr<CPDF_Image> image = page_obj->AsImage()->GetImage();
+  if (!image)
+    return 0;
+  CPDF_Stream* stream = image->GetStream();
+  if (!stream)
+    return 0;
+
+  // Copy the stream's raw bytes only when |buffer| is non-null and large
+  // enough; the raw size is returned either way.
+  const uint32_t raw_size = stream->GetRawSize();
+  if (buffer && buflen >= raw_size)
+    memcpy(buffer, stream->GetRawData(), raw_size);
+  return raw_size;
+}
diff --git a/fpdfsdk/fpdfview.cpp b/fpdfsdk/fpdfview.cpp
index 5aa80139ae..57e4806d39 100644
--- a/fpdfsdk/fpdfview.cpp
+++ b/fpdfsdk/fpdfview.cpp
@@ -357,10 +357,48 @@ CFX_DIBitmap* CFXBitmapFromFPDFBitmap(FPDF_BITMAP bitmap) {
 unsigned long Utf16EncodeMaybeCopyAndReturnLength(const CFX_WideString& text,
                                                   void* buffer,
                                                   unsigned long buflen) {
-  CFX_ByteString encodedText = text.UTF16LE_Encode();
-  unsigned long len = encodedText.GetLength();
+  CFX_ByteString encoded_text = text.UTF16LE_Encode();
+  unsigned long len = encoded_text.GetLength();
   if (buffer && len <= buflen)
-    memcpy(buffer, encodedText.c_str(), len);
+    memcpy(buffer, encoded_text.c_str(), len);
+  return len;
+}
+
+// Returns the length of |stream|'s data after decoding, or of its raw data if
+// no filter applies or decoding fails. Copies only when |buffer| can hold it.
+unsigned long DecodeStreamMaybeCopyAndReturnLength(const CPDF_Stream* stream,
+                                                   void* buffer,
+                                                   unsigned long buflen) {
+  ASSERT(stream);
+  uint8_t* data = stream->GetRawData();
+  uint32_t len = stream->GetRawSize();
+  CPDF_Dictionary* dict = stream->GetDict();
+  CPDF_Object* decoder = dict ? dict->GetDirectObjectFor("Filter") : nullptr;
+  if (decoder && (decoder->IsArray() || decoder->IsName())) {
+    // Decode the stream if one or more stream filters are specified.
+    uint8_t* decoded_data = nullptr;
+    uint32_t decoded_len = 0;
+    CFX_ByteString dummy_last_decoder;
+    CPDF_Dictionary* dummy_last_param = nullptr;
+    if (PDF_DataDecode(data, len, dict, dict->GetIntegerFor("DL"), false,
+                       &decoded_data, &decoded_len, &dummy_last_decoder,
+                       &dummy_last_param)) {
+      if (buffer && buflen >= decoded_len)
+        memcpy(buffer, decoded_data, decoded_len);
+      // Free the buffer for the decoded data if it was allocated by
+      // PDF_DataDecode(). Note that for images with a single image-specific
+      // filter, |decoded_data| is directly assigned to be |data|, so
+      // |decoded_data| does not need to be freed.
+      if (decoded_data != data)
+        FX_Free(decoded_data);
+      return decoded_len;
+    }
+  }
+  // Copy the raw data and return its length if there is no valid filter
+  // specified or if decoding failed.
+  if (buffer && buflen >= len)
+    memcpy(buffer, data, len);
+
   return len;
 }
 
diff --git a/fpdfsdk/fpdfview_c_api_test.c b/fpdfsdk/fpdfview_c_api_test.c
index e47f4d172c..d40437c278 100644
--- a/fpdfsdk/fpdfview_c_api_test.c
+++ b/fpdfsdk/fpdfview_c_api_test.c
@@ -133,6 +133,8 @@ int CheckPDFiumCApi() {
     CHK(FPDFImageObj_SetMatrix);
     CHK(FPDFImageObj_SetBitmap);
     CHK(FPDFImageObj_GetBitmap);
+    CHK(FPDFImageObj_GetImageDataDecoded);
+    CHK(FPDFImageObj_GetImageDataRaw);
     CHK(FPDFPageObj_CreateNewPath);
     CHK(FPDFPageObj_CreateNewRect);
     CHK(FPDFPath_SetStrokeColor);
diff --git a/fpdfsdk/fsdk_define.h b/fpdfsdk/fsdk_define.h
index 610b854a9d..91efc27f1a 100644
--- a/fpdfsdk/fsdk_define.h
+++ b/fpdfsdk/fsdk_define.h
@@ -26,6 +26,7 @@ class CPDF_Page;
 class CPDF_PageObject;
 class CPDF_PageRenderContext;
 class CPDF_PathObject;
+class CPDF_Stream;
 class IFSDK_PAUSE_Adapter;
 
 // Layering prevents fxcrt from knowing about FPDF_FILEACCESS, so this can't
@@ -77,6 +78,10 @@ unsigned long Utf16EncodeMaybeCopyAndReturnLength(const CFX_WideString& text,
                                                   void* buffer,
                                                   unsigned long buflen);
 
+unsigned long DecodeStreamMaybeCopyAndReturnLength(const CPDF_Stream* stream,
+                                                   void* buffer,
+                                                   unsigned long buflen);
+
 void FSDK_SetSandBoxPolicy(FPDF_DWORD policy, FPDF_BOOL enable);
 FPDF_BOOL FSDK_IsSandBoxPolicyEnabled(FPDF_DWORD policy);
 void FPDF_RenderPage_Retail(CPDF_PageRenderContext* pContext,
diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h
index e5607d1716..dc710b5f52 100644
--- a/public/fpdf_edit.h
+++ b/public/fpdf_edit.h
@@ -298,6 +298,35 @@ DLLEXPORT FPDF_BOOL STDCALL FPDFImageObj_SetBitmap(FPDF_PAGE* pages,
 DLLEXPORT FPDF_BITMAP STDCALL
 FPDFImageObj_GetBitmap(FPDF_PAGEOBJECT image_object);
 
+// Get the decoded image data of |image_object|. The decoded data is the
+// uncompressed image data, i.e. the raw image data after having all filters
+// applied. |buffer| is only modified if |buflen| is at least the length of
+// the decoded image data.
+//
+//   image_object - handle to an image object.
+//   buffer       - buffer for holding the decoded image data in raw bytes.
+//   buflen       - length of the buffer in bytes.
+//
+// Returns the length of the decoded image data in bytes, or 0 on error.
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataDecoded(FPDF_PAGEOBJECT image_object,
+                                 void* buffer,
+                                 unsigned long buflen);
+
+// Get the raw image data of |image_object|. The raw data is the image data as
+// stored in the PDF without applying any filters. |buffer| is only modified if
+// |buflen| is at least the length of the raw image data.
+//
+//   image_object - handle to an image object.
+//   buffer       - buffer for holding the raw image data in raw bytes.
+//   buflen       - length of the buffer in bytes.
+//
+// Returns the length of the raw image data in bytes, or 0 on error.
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataRaw(FPDF_PAGEOBJECT image_object,
+                             void* buffer,
+                             unsigned long buflen);
+
 // Create a new path object at an initial position.
 //
 //   x - initial horizontal position.
-- 
cgit v1.2.3