summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tom Sepez <tsepez@chromium.org> 2018-08-09 21:32:47 +0000
committer Chromium commit bot <commit-bot@chromium.org> 2018-08-09 21:32:47 +0000
commit fd7cede17e027a83de2aff3bc0f5ee875271e444 (patch)
tree c1576d19a8e8c99ebbcf9dba4e75ef4665e631cc
parent 60627d6eafd025dde711e532eee6866840c04bef (diff)
download pdfium-fd7cede17e027a83de2aff3bc0f5ee875271e444.tar.xz
Move all utf8 decoding under fx_string.h
Put encoder in cfx_utf8encoder.{h,cpp} to parallel decoder. Add tests,
and fix one corner case involving 0xff.

Change-Id: Ib97540afdc708bcc6280a79c76734ec68ea72690
Reviewed-on: https://pdfium-review.googlesource.com/39770
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
-rw-r--r-- BUILD.gn 2
-rw-r--r-- core/fxcrt/bytestring.cpp 6
-rw-r--r-- core/fxcrt/cfx_utf8decoder.cpp 2
-rw-r--r-- core/fxcrt/cfx_utf8encoder.cpp 43
-rw-r--r-- core/fxcrt/cfx_utf8encoder.h 31
-rw-r--r-- core/fxcrt/fx_string.cpp 64
-rw-r--r-- core/fxcrt/fx_string.h 2
-rw-r--r-- core/fxcrt/fx_string_unittest.cpp 49
-rw-r--r-- core/fxcrt/widestring.cpp 10
9 files changed, 144 insertions, 65 deletions
diff --git a/BUILD.gn b/BUILD.gn
index afa83e358a..f08ce43225 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -858,6 +858,8 @@ jumbo_static_library("fxcrt") {
"core/fxcrt/cfx_seekablestreamproxy.h",
"core/fxcrt/cfx_utf8decoder.cpp",
"core/fxcrt/cfx_utf8decoder.h",
+ "core/fxcrt/cfx_utf8encoder.cpp",
+ "core/fxcrt/cfx_utf8encoder.h",
"core/fxcrt/cfx_widetextbuf.cpp",
"core/fxcrt/cfx_widetextbuf.h",
"core/fxcrt/fileaccess_iface.h",
diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp
index b6c1ce7bbd..3ff0e3537a 100644
--- a/core/fxcrt/bytestring.cpp
+++ b/core/fxcrt/bytestring.cpp
@@ -669,11 +669,7 @@ size_t ByteString::Replace(const ByteStringView& pOld,
}
WideString ByteString::UTF8Decode() const {
- CFX_UTF8Decoder decoder;
- for (size_t i = 0; i < GetLength(); i++) {
- decoder.Input(static_cast<uint8_t>(m_pData->m_String[i]));
- }
- return WideString(decoder.GetResult());
+ return WideString::FromUTF8(AsStringView());
}
int ByteString::Compare(const ByteStringView& str) const {
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index bee5e16da4..8adab5c602 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp
@@ -43,5 +43,7 @@ void CFX_UTF8Decoder::Input(uint8_t byte) {
} else if (byte < 0xfe) {
m_PendingBytes = 5;
m_PendingChar = (byte & 0x01) << 30;
+ } else {
+ m_PendingBytes = 0;
}
}
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
new file mode 100644
index 0000000000..9ed149f1ad
--- /dev/null
+++ b/core/fxcrt/cfx_utf8encoder.cpp
@@ -0,0 +1,43 @@
+// Copyright 2018 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/cfx_utf8encoder.h"
+
+CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
+
+CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
+
+void CFX_UTF8Encoder::Input(wchar_t unicodeAsWchar) {
+ uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
+ if (unicode < 0x80) {
+ m_Buffer.push_back(unicode);
+ } else {
+ if (unicode >= 0x80000000)
+ return;
+
+ int nbytes = 0;
+ if (unicode < 0x800)
+ nbytes = 2;
+ else if (unicode < 0x10000)
+ nbytes = 3;
+ else if (unicode < 0x200000)
+ nbytes = 4;
+ else if (unicode < 0x4000000)
+ nbytes = 5;
+ else
+ nbytes = 6;
+
+ static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+ int order = 1 << ((nbytes - 1) * 6);
+ int code = unicodeAsWchar;
+ m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
+ for (int i = 0; i < nbytes - 1; i++) {
+ code = code % order;
+ order >>= 6;
+ m_Buffer.push_back(0x80 | (code / order));
+ }
+ }
+}
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
new file mode 100644
index 0000000000..d44a829108
--- /dev/null
+++ b/core/fxcrt/cfx_utf8encoder.h
@@ -0,0 +1,31 @@
+// Copyright 2018 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
+#define CORE_FXCRT_CFX_UTF8ENCODER_H_
+
+#include <vector>
+
+#include "core/fxcrt/fx_string.h"
+
+class CFX_UTF8Encoder {
+ public:
+ CFX_UTF8Encoder();
+ ~CFX_UTF8Encoder();
+
+ void Input(wchar_t unicodeAsWchar);
+
+ // The data returned by GetResult() is invalidated when this is modified by
+ // appending any data.
+ ByteStringView GetResult() const {
+ return ByteStringView(m_Buffer.data(), m_Buffer.size());
+ }
+
+ private:
+ std::vector<uint8_t> m_Buffer;
+};
+
+#endif // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index daf995560b..c9993f9ab8 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp
@@ -7,61 +7,12 @@
#include <limits>
#include <vector>
+#include "core/fxcrt/cfx_utf8decoder.h"
+#include "core/fxcrt/cfx_utf8encoder.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_string.h"
#include "third_party/base/compiler_specific.h"
-namespace {
-
-class CFX_UTF8Encoder {
- public:
- CFX_UTF8Encoder() {}
- ~CFX_UTF8Encoder() {}
-
- void Input(wchar_t unicodeAsWchar) {
- uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
- if (unicode < 0x80) {
- m_Buffer.push_back(unicode);
- } else {
- if (unicode >= 0x80000000)
- return;
-
- int nbytes = 0;
- if (unicode < 0x800)
- nbytes = 2;
- else if (unicode < 0x10000)
- nbytes = 3;
- else if (unicode < 0x200000)
- nbytes = 4;
- else if (unicode < 0x4000000)
- nbytes = 5;
- else
- nbytes = 6;
-
- static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
- int order = 1 << ((nbytes - 1) * 6);
- int code = unicodeAsWchar;
- m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
- for (int i = 0; i < nbytes - 1; i++) {
- code = code % order;
- order >>= 6;
- m_Buffer.push_back(0x80 | (code / order));
- }
- }
- }
-
- // The data returned by GetResult() is invalidated when this is modified by
- // appending any data.
- ByteStringView GetResult() const {
- return ByteStringView(m_Buffer.data(), m_Buffer.size());
- }
-
- private:
- std::vector<uint8_t> m_Buffer;
-};
-
-} // namespace
-
ByteString FX_UTF8Encode(const WideStringView& wsStr) {
size_t len = wsStr.GetLength();
const wchar_t* pStr = wsStr.unterminated_c_str();
@@ -72,6 +23,17 @@ ByteString FX_UTF8Encode(const WideStringView& wsStr) {
return ByteString(encoder.GetResult());
}
+WideString FX_UTF8Decode(const ByteStringView& bsStr) {
+ if (bsStr.IsEmpty())
+ return WideString();
+
+ CFX_UTF8Decoder decoder;
+ for (size_t i = 0; i < bsStr.GetLength(); i++)
+ decoder.Input(bsStr[i]);
+
+ return WideString(decoder.GetResult());
+}
+
namespace {
const float fraction_scales[] = {0.1f, 0.01f, 0.001f,
diff --git a/core/fxcrt/fx_string.h b/core/fxcrt/fx_string.h
index 4c24181e37..2cf823738a 100644
--- a/core/fxcrt/fx_string.h
+++ b/core/fxcrt/fx_string.h
@@ -15,6 +15,8 @@
((uint32_t)c4))
ByteString FX_UTF8Encode(const WideStringView& wsStr);
+WideString FX_UTF8Decode(const ByteStringView& bsStr);
+
float FX_atof(const ByteStringView& str);
float FX_atof(const WideStringView& wsStr);
bool FX_atonum(const ByteStringView& str, void* pData);
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index b311165638..60e7f07523 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -51,3 +51,52 @@ TEST(fxstring, FX_atonum) {
EXPECT_FALSE(FX_atonum("3.24", &f));
EXPECT_FLOAT_EQ(3.24f, f);
}
+
+TEST(fxstring, FX_UTF8Encode) {
+ EXPECT_EQ("", FX_UTF8Encode(WideStringView()));
+ EXPECT_EQ(
+ "x"
+ "\xc2\x80"
+ "\xc3\xbf"
+ "\xef\xbc\xac"
+ "y",
+ FX_UTF8Encode(L"x"
+ L"\u0080"
+ L"\u00ff"
+ L"\uff2c"
+ L"y"));
+}
+
+TEST(fxstring, FX_UTF8Decode) {
+ EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
+ EXPECT_EQ(
+ L"x"
+ L"\u0080"
+ L"\u00ff"
+ L"\uff2c"
+ L"y",
+ FX_UTF8Decode("x"
+ "\xc2\x80"
+ "\xc3\xbf"
+ "\xef\xbc\xac"
+ "y"));
+ EXPECT_EQ(L"a(A) b() c() d() e().",
+ FX_UTF8Decode("a(\xc2\x41) " // Invalid continuation.
+ "b(\xc2\xc2) " // Invalid continuation.
+ "c(\xc2\xff\x80) " // Invalid continuation.
+ "d(\x80\x80) " // Invalid leading.
+ "e(\xff\x80\x80)" // Invalid leading.
+ "."));
+}
+
+TEST(fxstring, FX_UTF8EncodeDecodeConsistency) {
+ WideString wstr;
+ wstr.Reserve(0x10000);
+ for (int w = 0; w < 0x10000; ++w)
+ wstr += static_cast<wchar_t>(w);
+
+ ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
+ WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
+ EXPECT_EQ(0x10000u, wstr2.GetLength());
+ EXPECT_EQ(wstr, wstr2);
+}
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 97073f170d..e3c08d72e1 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -12,7 +12,6 @@
#include <cctype>
#include <cwctype>
-#include "core/fxcrt/cfx_utf8decoder.h"
#include "core/fxcrt/fx_codepage.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_safe_types.h"
@@ -885,14 +884,7 @@ WideString WideString::FromLocal(const ByteStringView& bstr) {
// static
WideString WideString::FromUTF8(const ByteStringView& str) {
- if (str.IsEmpty())
- return WideString();
-
- CFX_UTF8Decoder decoder;
- for (size_t i = 0; i < str.GetLength(); i++)
- decoder.Input(str[i]);
-
- return WideString(decoder.GetResult());
+ return FX_UTF8Decode(str);
}
// static