From fd7cede17e027a83de2aff3bc0f5ee875271e444 Mon Sep 17 00:00:00 2001 From: Tom Sepez Date: Thu, 9 Aug 2018 21:32:47 +0000 Subject: Move all utf8 decoding under fx_string.h Put encoder in cfx_utf8encoder.{h,cpp} to parallel decoder. Add tests, and fix one corner case involving 0xff. Change-Id: Ib97540afdc708bcc6280a79c76734ec68ea72690 Reviewed-on: https://pdfium-review.googlesource.com/39770 Commit-Queue: Lei Zhang Reviewed-by: Lei Zhang --- BUILD.gn | 2 ++ core/fxcrt/bytestring.cpp | 6 +--- core/fxcrt/cfx_utf8decoder.cpp | 2 ++ core/fxcrt/cfx_utf8encoder.cpp | 43 ++++++++++++++++++++++++++ core/fxcrt/cfx_utf8encoder.h | 31 +++++++++++++++++++ core/fxcrt/fx_string.cpp | 64 ++++++++------------------------------- core/fxcrt/fx_string.h | 2 ++ core/fxcrt/fx_string_unittest.cpp | 49 ++++++++++++++++++++++++++++++ core/fxcrt/widestring.cpp | 10 +----- 9 files changed, 144 insertions(+), 65 deletions(-) create mode 100644 core/fxcrt/cfx_utf8encoder.cpp create mode 100644 core/fxcrt/cfx_utf8encoder.h diff --git a/BUILD.gn b/BUILD.gn index afa83e358a..f08ce43225 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -858,6 +858,8 @@ jumbo_static_library("fxcrt") { "core/fxcrt/cfx_seekablestreamproxy.h", "core/fxcrt/cfx_utf8decoder.cpp", "core/fxcrt/cfx_utf8decoder.h", + "core/fxcrt/cfx_utf8encoder.cpp", + "core/fxcrt/cfx_utf8encoder.h", "core/fxcrt/cfx_widetextbuf.cpp", "core/fxcrt/cfx_widetextbuf.h", "core/fxcrt/fileaccess_iface.h", diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp index b6c1ce7bbd..3ff0e3537a 100644 --- a/core/fxcrt/bytestring.cpp +++ b/core/fxcrt/bytestring.cpp @@ -669,11 +669,7 @@ size_t ByteString::Replace(const ByteStringView& pOld, } WideString ByteString::UTF8Decode() const { - CFX_UTF8Decoder decoder; - for (size_t i = 0; i < GetLength(); i++) { - decoder.Input(static_cast<uint8_t>(m_pData->m_String[i])); - } - return WideString(decoder.GetResult()); + return WideString::FromUTF8(AsStringView()); } int ByteString::Compare(const ByteStringView& str) 
const { diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp index bee5e16da4..8adab5c602 100644 --- a/core/fxcrt/cfx_utf8decoder.cpp +++ b/core/fxcrt/cfx_utf8decoder.cpp @@ -43,5 +43,7 @@ void CFX_UTF8Decoder::Input(uint8_t byte) { } else if (byte < 0xfe) { m_PendingBytes = 5; m_PendingChar = (byte & 0x01) << 30; + } else { + m_PendingBytes = 0; } } diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp new file mode 100644 index 0000000000..9ed149f1ad --- /dev/null +++ b/core/fxcrt/cfx_utf8encoder.cpp @@ -0,0 +1,43 @@ +// Copyright 2018 PDFium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#include "core/fxcrt/cfx_utf8encoder.h" + +CFX_UTF8Encoder::CFX_UTF8Encoder() = default; + +CFX_UTF8Encoder::~CFX_UTF8Encoder() = default; + +void CFX_UTF8Encoder::Input(wchar_t unicodeAsWchar) { + uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar); + if (unicode < 0x80) { + m_Buffer.push_back(unicode); + } else { + if (unicode >= 0x80000000) + return; + + int nbytes = 0; + if (unicode < 0x800) + nbytes = 2; + else if (unicode < 0x10000) + nbytes = 3; + else if (unicode < 0x200000) + nbytes = 4; + else if (unicode < 0x4000000) + nbytes = 5; + else + nbytes = 6; + + static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + int order = 1 << ((nbytes - 1) * 6); + int code = unicodeAsWchar; + m_Buffer.push_back(prefix[nbytes - 2] | (code / order)); + for (int i = 0; i < nbytes - 1; i++) { + code = code % order; + order >>= 6; + m_Buffer.push_back(0x80 | (code / order)); + } + } +} diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h new file mode 100644 index 0000000000..d44a829108 --- /dev/null +++ b/core/fxcrt/cfx_utf8encoder.h @@ -0,0 +1,31 @@ +// Copyright 2018 PDFium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_ +#define CORE_FXCRT_CFX_UTF8ENCODER_H_ + +#include <vector> + +#include "core/fxcrt/fx_string.h" + +class CFX_UTF8Encoder { + public: + CFX_UTF8Encoder(); + ~CFX_UTF8Encoder(); + + void Input(wchar_t unicodeAsWchar); + + // The data returned by GetResult() is invalidated when this is modified by + // appending any data. + ByteStringView GetResult() const { + return ByteStringView(m_Buffer.data(), m_Buffer.size()); + } + + private: + std::vector<uint8_t> m_Buffer; +}; + +#endif // CORE_FXCRT_CFX_UTF8ENCODER_H_ diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp index daf995560b..c9993f9ab8 100644 --- a/core/fxcrt/fx_string.cpp +++ b/core/fxcrt/fx_string.cpp @@ -7,61 +7,12 @@ #include #include +#include "core/fxcrt/cfx_utf8decoder.h" +#include "core/fxcrt/cfx_utf8encoder.h" #include "core/fxcrt/fx_extension.h" #include "core/fxcrt/fx_string.h" #include "third_party/base/compiler_specific.h" -namespace { - -class CFX_UTF8Encoder { - public: - CFX_UTF8Encoder() {} - ~CFX_UTF8Encoder() {} - - void Input(wchar_t unicodeAsWchar) { - uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar); - if (unicode < 0x80) { - m_Buffer.push_back(unicode); - } else { - if (unicode >= 0x80000000) - return; - - int nbytes = 0; - if (unicode < 0x800) - nbytes = 2; - else if (unicode < 0x10000) - nbytes = 3; - else if (unicode < 0x200000) - nbytes = 4; - else if (unicode < 0x4000000) - nbytes = 5; - else - nbytes = 6; - - static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; - int order = 1 << ((nbytes - 1) * 6); - int code = unicodeAsWchar; - m_Buffer.push_back(prefix[nbytes - 2] | (code / order)); - for (int i = 0; i < nbytes - 1; i++) { - code = code % order; - order >>= 6; - m_Buffer.push_back(0x80 | (code / order)); - } - } - } - - // The data returned by 
GetResult() is invalidated when this is modified by - // appending any data. - ByteStringView GetResult() const { - return ByteStringView(m_Buffer.data(), m_Buffer.size()); - } - - private: - std::vector<uint8_t> m_Buffer; -}; - -} // namespace - ByteString FX_UTF8Encode(const WideStringView& wsStr) { size_t len = wsStr.GetLength(); const wchar_t* pStr = wsStr.unterminated_c_str(); @@ -72,6 +23,17 @@ ByteString FX_UTF8Encode(const WideStringView& wsStr) { return ByteString(encoder.GetResult()); } +WideString FX_UTF8Decode(const ByteStringView& bsStr) { + if (bsStr.IsEmpty()) + return WideString(); + + CFX_UTF8Decoder decoder; + for (size_t i = 0; i < bsStr.GetLength(); i++) + decoder.Input(bsStr[i]); + + return WideString(decoder.GetResult()); +} + namespace { const float fraction_scales[] = {0.1f, 0.01f, 0.001f, diff --git a/core/fxcrt/fx_string.h b/core/fxcrt/fx_string.h index 4c24181e37..2cf823738a 100644 --- a/core/fxcrt/fx_string.h +++ b/core/fxcrt/fx_string.h @@ -15,6 +15,8 @@ ((uint32_t)c4)) ByteString FX_UTF8Encode(const WideStringView& wsStr); +WideString FX_UTF8Decode(const ByteStringView& bsStr); + float FX_atof(const ByteStringView& str); float FX_atof(const WideStringView& wsStr); bool FX_atonum(const ByteStringView& str, void* pData); diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp index b311165638..60e7f07523 100644 --- a/core/fxcrt/fx_string_unittest.cpp +++ b/core/fxcrt/fx_string_unittest.cpp @@ -51,3 +51,52 @@ TEST(fxstring, FX_atonum) { EXPECT_FALSE(FX_atonum("3.24", &f)); EXPECT_FLOAT_EQ(3.24f, f); } + +TEST(fxstring, FX_UTF8Encode) { + EXPECT_EQ("", FX_UTF8Encode(WideStringView())); + EXPECT_EQ( + "x" + "\xc2\x80" + "\xc3\xbf" + "\xef\xbc\xac" + "y", + FX_UTF8Encode(L"x" + L"\u0080" + L"\u00ff" + L"\uff2c" + L"y")); +} + +TEST(fxstring, FX_UTF8Decode) { + EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView())); + EXPECT_EQ( + L"x" + L"\u0080" + L"\u00ff" + L"\uff2c" + L"y", + FX_UTF8Decode("x" + "\xc2\x80" + "\xc3\xbf" + 
"\xef\xbc\xac" + "y")); + EXPECT_EQ(L"a(A) b() c() d() e().", + FX_UTF8Decode("a(\xc2\x41) " // Invalid continuation. + "b(\xc2\xc2) " // Invalid continuation. + "c(\xc2\xff\x80) " // Invalid continuation. + "d(\x80\x80) " // Invalid leading. + "e(\xff\x80\x80)" // Invalid leading. + ".")); +} + +TEST(fxstring, FX_UTF8EncodeDecodeConsistency) { + WideString wstr; + wstr.Reserve(0x10000); + for (int w = 0; w < 0x10000; ++w) + wstr += static_cast<wchar_t>(w); + + ByteString bstr = FX_UTF8Encode(wstr.AsStringView()); + WideString wstr2 = FX_UTF8Decode(bstr.AsStringView()); + EXPECT_EQ(0x10000u, wstr2.GetLength()); + EXPECT_EQ(wstr, wstr2); +} diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp index 97073f170d..e3c08d72e1 100644 --- a/core/fxcrt/widestring.cpp +++ b/core/fxcrt/widestring.cpp @@ -12,7 +12,6 @@ #include #include -#include "core/fxcrt/cfx_utf8decoder.h" #include "core/fxcrt/fx_codepage.h" #include "core/fxcrt/fx_extension.h" #include "core/fxcrt/fx_safe_types.h" @@ -885,14 +884,7 @@ WideString WideString::FromLocal(const ByteStringView& bstr) { // static WideString WideString::FromUTF8(const ByteStringView& str) { - if (str.IsEmpty()) - return WideString(); - - CFX_UTF8Decoder decoder; - for (size_t i = 0; i < str.GetLength(); i++) - decoder.Input(str[i]); - - return WideString(decoder.GetResult()); + return FX_UTF8Decode(str); } // static -- cgit v1.2.3