Properly handle language markers in decoded text

In text strings such as the document title, 0x001B is used as a marker for the beginning and end of a language metadata section. Currently PDFium does nothing with this metadata, but it needs to be stripped out when returning the 'decoded' text. The existing code assumed that the two bytes following a marker would be the data to be removed, and it did not track whether it was inside or outside one of these regions. As a result, it would always strip the two bytes following the region as well, since it assumed the end marker was the beginning of a new region.

This CL corrects the detection and handling of these regions, and adds a regression test for the reported bug.

BUG=pdfium:182
Change-Id: I92ddba5666274a8986fed03f502a0331f150f7ac
Reviewed-on: https://pdfium-review.googlesource.com/41070
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
author: Ryan Harrison <rharrison@chromium.org> 2018-08-22 20:50:14 +0000
committer: Chromium commit bot <commit-bot@chromium.org> 2018-08-22 20:50:14 +0000
commit: 15f1a88dece664ae7300d9a60fe124cec1f2b9de (patch)
tree: a13d363deadc618c919ba6cbf2b7982511a7db7c
parent: 7e7e0b8379c4bdcf3e16cd2298afe49f03fefdfb (diff)
download: pdfium-15f1a88dece664ae7300d9a60fe124cec1f2b9de.tar.xz
3 files changed, 37 insertions, 13 deletions
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 928d650103..063d21ae5f 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp
@@ -29,8 +29,12 @@ namespace {
 
 const uint32_t kMaxStreamSize = 20 * 1024 * 1024;
 
-uint16_t GetUnicodeFromBytes(const uint8_t* bytes, bool bBE) {
-  return bBE ? (bytes[0] << 8 | bytes[1]) : (bytes[1] << 8 | bytes[0]);
+uint16_t GetUnicodeFromBigEndianBytes(const uint8_t* bytes) {
+  return bytes[0] << 8 | bytes[1];
+}
+
+uint16_t GetUnicodeFromLittleEndianBytes(const uint8_t* bytes) {
+  return bytes[1] << 8 | bytes[0];
 }
 
 bool CheckFlateDecodeParams(int Colors, int BitsPerComponent, int Columns) {
@@ -436,21 +440,30 @@ WideString PDF_DecodeText(const uint8_t* src_data, uint32_t src_len) {
       return result;
 
     pdfium::span<wchar_t> dest_buf = result.GetBuffer(max_chars);
-    bool bBE = src_data[0] == 0xfe || (src_data[0] == 0xff && !src_data[2]);
-    const uint8_t* uni_str = src_data + 2;
+    uint16_t (*GetUnicodeFromBytes)(const uint8_t*) =
+        src_data[0] == 0xfe ? GetUnicodeFromBigEndianBytes
+                            : GetUnicodeFromLittleEndianBytes;
+    const uint8_t* unicode_str = src_data + 2;
     for (uint32_t i = 0; i < max_chars * 2; i += 2) {
-      uint16_t unicode = GetUnicodeFromBytes(uni_str + i, bBE);
-      if (unicode != 0x1b) {
-        dest_buf[dest_pos++] = unicode;
-        continue;
-      }
-      i += 2;
-      while (i < max_chars * 2) {
-        uint16_t unicode2 = GetUnicodeFromBytes(uni_str + i, bBE);
+      uint16_t unicode = GetUnicodeFromBytes(unicode_str + i);
+
+      // 0x001B is a begin/end marker for language metadata region that
+      // should not be in the decoded text.
+      if (unicode == 0x001B) {
         i += 2;
-        if (unicode2 == 0x1b)
+        for (; i < max_chars * 2; i += 2) {
+          unicode = GetUnicodeFromBytes(unicode_str + i);
+          if (unicode == 0x001B) {
+            i += 2;
+            unicode = GetUnicodeFromBytes(unicode_str + i);
+            break;
+          }
+        }
+        if (i >= max_chars * 2)
           break;
       }
+
+      dest_buf[dest_pos++] = unicode;
     }
   } else {
     pdfium::span<wchar_t> dest_buf = result.GetBuffer(src_len);
diff --git a/fpdfsdk/fpdf_doc_embeddertest.cpp b/fpdfsdk/fpdf_doc_embeddertest.cpp
index a9eb4b8cb7..5c0223ee46 100644
--- a/fpdfsdk/fpdf_doc_embeddertest.cpp
+++ b/fpdfsdk/fpdf_doc_embeddertest.cpp
@@ -358,6 +358,17 @@ TEST_F(FPDFDocEmbeddertest, GetMetaText) {
             WideString::FromUTF16LE(buf, FXSYS_len(kExpectedModDate)));
 }
 
+TEST_F(FPDFDocEmbeddertest, Bug_182) {
+  ASSERT_TRUE(OpenDocument("bug_182.pdf"));
+
+  unsigned short buf[128];
+  constexpr wchar_t kExpectedTitle[] = L"Super Visual Formade 印刷";
+
+  ASSERT_EQ(48u, FPDF_GetMetaText(document(), "Title", buf, sizeof(buf)));
+  EXPECT_EQ(WideString(kExpectedTitle),
+            WideString::FromUTF16LE(buf, FXSYS_len(kExpectedTitle)));
+}
+
 TEST_F(FPDFDocEmbeddertest, GetMetaTextSameObjectNumber) {
   ASSERT_TRUE(OpenDocument("annotation_highlight_square_with_ap.pdf"));
 
diff --git a/testing/resources/bug_182.pdf b/testing/resources/bug_182.pdf
new file mode 100644
index 0000000000..bf35cc918a
--- /dev/null
+++ b/testing/resources/bug_182.pdf
author	Ryan Harrison <rharrison@chromium.org>	2018-08-22 20:50:14 +0000
committer	Chromium commit bot <commit-bot@chromium.org>	2018-08-22 20:50:14 +0000
commit	15f1a88dece664ae7300d9a60fe124cec1f2b9de (patch)
tree	a13d363deadc618c919ba6cbf2b7982511a7db7c
parent	7e7e0b8379c4bdcf3e16cd2298afe49f03fefdfb (diff)
download	pdfium-15f1a88dece664ae7300d9a60fe124cec1f2b9de.tar.xz