From 34dab07ed6e826666fd0589069f2c9b5bd2ba4dc Mon Sep 17 00:00:00 2001 From: Tom Sepez Date: Wed, 8 Aug 2018 17:49:02 +0000 Subject: Move ByteString::FromUnicode() to WideString::ToDefANSI() Turns out that "FromUnicode" is misleading in that, on linux, it simply removes any characters beyond 0xFF and passes the rest unchanged, so no unicode decoding actually takes place. On Windows, it passes it into the system function specifying FX_CODEPAGE_DefANSI, converting it into the so-called "default ANSI code plane", passing some characters, converting others to '?' and still others to 'A'. Either way, nothing resembling UTF8 comes out of this, so pick a better name. These now immediately look suspicious, so a follow-up CL will see which ones should really be WideString::UTF8Encode() instead. Making this a normal method on a widestring rather than a static method on a bytestring feels more natural; this is parallel to the UTF8Encode and UTF16LE_Encode functions. Add a test that shows these conversions. Change-Id: Ia7551b47199eba61b5c328a97bfe9176ac8e583c Reviewed-on: https://pdfium-review.googlesource.com/39690 Reviewed-by: Lei Zhang Commit-Queue: Tom Sepez --- core/fpdfapi/parser/cpdf_document.cpp | 2 +- core/fpdfdoc/cpdf_filespec.cpp | 5 ++--- core/fpdfdoc/cpdf_interform.cpp | 4 ++-- core/fxcrt/bytestring.cpp | 20 -------------------- core/fxcrt/bytestring.h | 2 -- core/fxcrt/widestring.cpp | 19 +++++++++++++++++++ core/fxcrt/widestring.h | 1 + core/fxcrt/widestring_unittest.cpp | 26 ++++++++++++++++++++++++++ core/fxge/win32/fx_win32_print.cpp | 2 +- fpdfsdk/cpdfsdk_interform.cpp | 4 ++-- fxbarcode/BC_UtilCodingConvert.cpp | 8 +++----- fxjs/cfxjse_class.cpp | 3 +-- fxjs/cjs_field.cpp | 4 ++-- fxjs/cjs_global.cpp | 18 ++++++++---------- fxjs/cjs_publicmethods.cpp | 8 ++++---- fxjs/cjs_runtime.cpp | 2 +- xfa/fgas/font/cfgas_fontmgr.cpp | 3 +-- xfa/fgas/font/cfgas_gefont.cpp | 2 +- xfa/fgas/font/cfgas_pdffontmgr.cpp | 2 +- xfa/fxfa/parser/cxfa_node.cpp | 2 +- 20 files changed, 77 insertions(+), 60 deletions(-) diff --git a/core/fpdfapi/parser/cpdf_document.cpp b/core/fpdfapi/parser/cpdf_document.cpp index 7641dbd702..2a432623ff 100644 --- a/core/fpdfapi/parser/cpdf_document.cpp +++ b/core/fpdfapi/parser/cpdf_document.cpp @@ -814,7 +814,7 @@ CPDF_Font* CPDF_Document::AddWindowsFont(LOGFONTW* pLogFont, bool bTranslateName) { LOGFONTA lfa; memcpy(&lfa, pLogFont, (char*)lfa.lfFaceName - (char*)&lfa); - ByteString face = ByteString::FromUnicode(pLogFont->lfFaceName); + ByteString face = WideString(pLogFont->lfFaceName).ToDefANSI(); if (face.GetLength() >= LF_FACESIZE) return nullptr; diff --git a/core/fpdfdoc/cpdf_filespec.cpp b/core/fpdfdoc/cpdf_filespec.cpp index 2cc51e33d3..aa93b91a94 100644 --- a/core/fpdfdoc/cpdf_filespec.cpp +++ b/core/fpdfdoc/cpdf_filespec.cpp @@ -203,10 +203,9 @@ void CPDF_FileSpec::SetFileName(const WideString& wsFileName) { WideString wsStr = EncodeFileName(wsFileName); if (m_pObj->IsString()) { - m_pWritableObj->SetString(ByteString::FromUnicode(wsStr)); + m_pWritableObj->SetString(wsStr.ToDefANSI()); } else if (CPDF_Dictionary* pDict = m_pWritableObj->AsDictionary()) { - pDict->SetNewFor(pdfium::stream::kF, - ByteString::FromUnicode(wsStr), false); + pDict->SetNewFor(pdfium::stream::kF, wsStr.ToDefANSI(), false); pDict->SetNewFor("UF", PDF_EncodeText(wsStr), false); } } diff --git a/core/fpdfdoc/cpdf_interform.cpp b/core/fpdfdoc/cpdf_interform.cpp index d7cee35704..26dfbc17f0 100644 --- a/core/fpdfdoc/cpdf_interform.cpp +++ b/core/fpdfdoc/cpdf_interform.cpp @@ -1030,8 +1030,8 @@ std::unique_ptr CPDF_InterForm::ExportToFDF( if (!pdf_path.IsEmpty()) { if (bSimpleFileSpec) { WideString wsFilePath = CPDF_FileSpec::EncodeFileName(pdf_path); - pMainDict->SetNewFor( - pdfium::stream::kF, ByteString::FromUnicode(wsFilePath), false); + pMainDict->SetNewFor(pdfium::stream::kF, + wsFilePath.ToDefANSI(), false); pMainDict->SetNewFor("UF", PDF_EncodeText(wsFilePath), false); } else { diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp index 4d55c98912..b6c1ce7bbd 100644 --- a/core/fxcrt/bytestring.cpp +++ b/core/fxcrt/bytestring.cpp @@ -676,26 +676,6 @@ WideString ByteString::UTF8Decode() const { return WideString(decoder.GetResult()); } -// static -ByteString ByteString::FromUnicode(const WideString& wstr) { - int src_len = wstr.GetLength(); - int dest_len = - FXSYS_WideCharToMultiByte(FX_CODEPAGE_DefANSI, 0, wstr.c_str(), src_len, - nullptr, 0, nullptr, nullptr); - if (!dest_len) - return ByteString(); - - ByteString bstr; - { - // Span's lifetime must end before ReleaseBuffer() below. - pdfium::span dest_buf = bstr.GetBuffer(dest_len); - FXSYS_WideCharToMultiByte(FX_CODEPAGE_DefANSI, 0, wstr.c_str(), src_len, - dest_buf.data(), dest_len, nullptr, nullptr); - } - bstr.ReleaseBuffer(dest_len); - return bstr; -} - int ByteString::Compare(const ByteStringView& str) const { if (!m_pData) return str.IsEmpty() ? 0 : -1; diff --git a/core/fxcrt/bytestring.h b/core/fxcrt/bytestring.h index 5722c4925d..5975acbddd 100644 --- a/core/fxcrt/bytestring.h +++ b/core/fxcrt/bytestring.h @@ -66,8 +66,6 @@ class ByteString { void clear() { m_pData.Reset(); } - static ByteString FromUnicode(const WideString& str) WARN_UNUSED_RESULT; - // Explicit conversion to C-style string. // Note: Any subsequent modification of |this| will invalidate the result. const char* c_str() const { return m_pData ? m_pData->m_String : ""; } diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp index cde1973d26..7dd1c30eb0 100644 --- a/core/fxcrt/widestring.cpp +++ b/core/fxcrt/widestring.cpp @@ -673,6 +673,25 @@ intptr_t WideString::ReferenceCountForTesting() const { return m_pData ? m_pData->m_nRefs : 0; } +// static +ByteString WideString::ToDefANSI() const { + int src_len = GetLength(); + int dest_len = FXSYS_WideCharToMultiByte( + FX_CODEPAGE_DefANSI, 0, c_str(), src_len, nullptr, 0, nullptr, nullptr); + if (!dest_len) + return ByteString(); + + ByteString bstr; + { + // Span's lifetime must end before ReleaseBuffer() below. + pdfium::span dest_buf = bstr.GetBuffer(dest_len); + FXSYS_WideCharToMultiByte(FX_CODEPAGE_DefANSI, 0, c_str(), src_len, + dest_buf.data(), dest_len, nullptr, nullptr); + } + bstr.ReleaseBuffer(dest_len); + return bstr; +} + ByteString WideString::UTF8Encode() const { return FX_UTF8Encode(AsStringView()); } diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h index b531292c57..dc5dd23428 100644 --- a/core/fxcrt/widestring.h +++ b/core/fxcrt/widestring.h @@ -195,6 +195,7 @@ class WideString { size_t Replace(const WideStringView& pOld, const WideStringView& pNew); size_t Remove(wchar_t ch); + ByteString ToDefANSI() const; ByteString UTF8Encode() const; // This method will add \0\0 to the end of the string to represent the diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp index 9017fe0c54..9d38aa45e9 100644 --- a/core/fxcrt/widestring_unittest.cpp +++ b/core/fxcrt/widestring_unittest.cpp @@ -999,6 +999,32 @@ TEST(WideString, UTF16LE_Encode) { } } +TEST(WideString, ToDefANSI) { + EXPECT_EQ("", WideString().ToDefANSI()); +#if _FX_PLATFORM_ == _FX_PLATFORM_WINDOWS_ + const char* kResult = + "x" + "?" + "\xff" + "A" + "?" + "y"; +#else + const char* kResult = + "x" + "\x80" + "\xff" + "y"; +#endif + EXPECT_EQ(kResult, WideString(L"x" + L"\u0080" + L"\u00ff" + L"\u0100" + L"\u208c" + L"y") + .ToDefANSI()); +} + TEST(WideStringView, FromVector) { std::vector null_vec; WideStringView null_string(null_vec); diff --git a/core/fxge/win32/fx_win32_print.cpp b/core/fxge/win32/fx_win32_print.cpp index f36fa84364..1e2b2d4e9a 100644 --- a/core/fxge/win32/fx_win32_print.cpp +++ b/core/fxge/win32/fx_win32_print.cpp @@ -650,7 +650,7 @@ bool CTextOnlyPrinterDriver::DrawDeviceText(int nChars, wsText += charpos.m_Unicode; } size_t len = totalLength; - ByteString text = ByteString::FromUnicode(wsText); + ByteString text = wsText.ToDefANSI(); while (len > 0) { char buffer[1026]; size_t send_len = std::min(len, static_cast(1024)); diff --git a/fpdfsdk/cpdfsdk_interform.cpp b/fpdfsdk/cpdfsdk_interform.cpp index 3cfc84dfd9..6a4b591ed0 100644 --- a/fpdfsdk/cpdfsdk_interform.cpp +++ b/fpdfsdk/cpdfsdk_interform.cpp @@ -506,10 +506,10 @@ bool CPDFSDK_InterForm::FDFToURLEncodedData(uint8_t*& pBuf, size_t& nBufSize) { continue; WideString name; name = pField->GetUnicodeTextFor("T"); - ByteString name_b = ByteString::FromUnicode(name); + ByteString name_b = name.ToDefANSI(); ByteString csBValue = pField->GetStringFor("V"); WideString csWValue = PDF_DecodeText(csBValue); - ByteString csValue_b = ByteString::FromUnicode(csWValue); + ByteString csValue_b = csWValue.ToDefANSI(); fdfEncodedData << name_b << "=" << csValue_b; if (i != pFields->GetCount() - 1) fdfEncodedData << "&"; diff --git a/fxbarcode/BC_UtilCodingConvert.cpp b/fxbarcode/BC_UtilCodingConvert.cpp index 77c52f2074..96aeaddb7c 100644 --- a/fxbarcode/BC_UtilCodingConvert.cpp +++ b/fxbarcode/BC_UtilCodingConvert.cpp @@ -12,7 +12,7 @@ CBC_UtilCodingConvert::~CBC_UtilCodingConvert() {} void CBC_UtilCodingConvert::UnicodeToLocale(const WideString& src, ByteString& dst) { - dst = ByteString::FromUnicode(src); + dst = src.ToDefANSI(); } void CBC_UtilCodingConvert::LocaleToUtf8(const ByteString& src, @@ -34,15 +34,13 @@ void CBC_UtilCodingConvert::Utf8ToLocale(const std::vector& src, for (uint8_t value : src) utf8 += value; - WideString unicode = WideString::FromUTF8(utf8.AsStringView()); - dst = ByteString::FromUnicode(unicode); + dst = WideString::FromUTF8(utf8.AsStringView()).ToDefANSI(); } void CBC_UtilCodingConvert::Utf8ToLocale(const uint8_t* src, int32_t count, ByteString& dst) { - WideString unicode = WideString::FromUTF8(ByteStringView(src, count)); - dst = ByteString::FromUnicode(unicode); + dst = WideString::FromUTF8(ByteStringView(src, count)).ToDefANSI(); } void CBC_UtilCodingConvert::UnicodeToUTF8(const WideString& src, diff --git a/fxjs/cfxjse_class.cpp b/fxjs/cfxjse_class.cpp index e7e54c8867..9207fd5094 100644 --- a/fxjs/cfxjse_class.cpp +++ b/fxjs/cfxjse_class.cpp @@ -112,8 +112,7 @@ void DynPropGetterAdapter_MethodCallback( WideString err = JSFormatErrorString(pClassDescriptor->name, *szPropName, result.Error()); v8::MaybeLocal str = v8::String::NewFromUtf8( - info.GetIsolate(), ByteString::FromUnicode(err).c_str(), - v8::NewStringType::kNormal); + info.GetIsolate(), err.ToDefANSI().c_str(), v8::NewStringType::kNormal); info.GetIsolate()->ThrowException(str.ToLocalChecked()); return; } diff --git a/fxjs/cjs_field.cpp b/fxjs/cjs_field.cpp index 19e6712f2a..cd97372ad0 100644 --- a/fxjs/cjs_field.cpp +++ b/fxjs/cjs_field.cpp @@ -731,7 +731,7 @@ CJS_Return CJS_Field::set_border_style(CJS_Runtime* pRuntime, if (!m_bCanSet) return CJS_Return(JSMessage::kReadOnlyError); - ByteString byte_str = ByteString::FromUnicode(pRuntime->ToWideString(vp)); + ByteString byte_str = pRuntime->ToWideString(vp).ToDefANSI(); if (m_bDelay) { AddDelay_String(FP_BORDERSTYLE, byte_str); } else { @@ -2002,7 +2002,7 @@ CJS_Return CJS_Field::set_text_font(CJS_Runtime* pRuntime, if (!m_bCanSet) return CJS_Return(JSMessage::kReadOnlyError); - if (ByteString::FromUnicode(pRuntime->ToWideString(vp)).IsEmpty()) + if (pRuntime->ToWideString(vp).ToDefANSI().IsEmpty()) return CJS_Return(JSMessage::kValueError); return CJS_Return(); } diff --git a/fxjs/cjs_global.cpp b/fxjs/cjs_global.cpp index 567d54853c..1865f57ddb 100644 --- a/fxjs/cjs_global.cpp +++ b/fxjs/cjs_global.cpp @@ -232,7 +232,7 @@ CJS_Return CJS_Global::QueryProperty(const wchar_t* propname) { CJS_Return CJS_Global::DelProperty(CJS_Runtime* pRuntime, const wchar_t* propname) { - auto it = m_MapGlobal.find(ByteString::FromUnicode(propname)); + auto it = m_MapGlobal.find(WideString(propname).ToDefANSI()); if (it == m_MapGlobal.end()) return CJS_Return(JSMessage::kUnknownProperty); @@ -242,7 +242,7 @@ CJS_Return CJS_Global::DelProperty(CJS_Runtime* pRuntime, CJS_Return CJS_Global::GetProperty(CJS_Runtime* pRuntime, const wchar_t* propname) { - auto it = m_MapGlobal.find(ByteString::FromUnicode(propname)); + auto it = m_MapGlobal.find(WideString(propname).ToDefANSI()); if (it == m_MapGlobal.end()) return CJS_Return(); @@ -272,7 +272,7 @@ CJS_Return CJS_Global::GetProperty(CJS_Runtime* pRuntime, CJS_Return CJS_Global::SetProperty(CJS_Runtime* pRuntime, const wchar_t* propname, v8::Local vp) { - ByteString sPropName = ByteString::FromUnicode(propname); + ByteString sPropName = WideString(propname).ToDefANSI(); if (vp->IsNumber()) { return SetGlobalVariables(sPropName, JS_GlobalDataType::NUMBER, pRuntime->ToDouble(vp), false, "", @@ -284,10 +284,9 @@ CJS_Return CJS_Global::SetProperty(CJS_Runtime* pRuntime, v8::Local(), false); } if (vp->IsString()) { - return SetGlobalVariables( - sPropName, JS_GlobalDataType::STRING, 0, false, - ByteString::FromUnicode(pRuntime->ToWideString(vp)), - v8::Local(), false); + return SetGlobalVariables(sPropName, JS_GlobalDataType::STRING, 0, false, + pRuntime->ToWideString(vp).ToDefANSI(), + v8::Local(), false); } if (vp->IsObject()) { return SetGlobalVariables(sPropName, JS_GlobalDataType::OBJECT, 0, false, @@ -310,8 +309,7 @@ CJS_Return CJS_Global::setPersistent( if (params.size() != 2) return CJS_Return(JSMessage::kParamError); - auto it = m_MapGlobal.find( - ByteString::FromUnicode(pRuntime->ToWideString(params[0]))); + auto it = m_MapGlobal.find(pRuntime->ToWideString(params[0]).ToDefANSI()); if (it == m_MapGlobal.end() || it->second->bDeleted) return CJS_Return(JSMessage::kGlobalNotFoundError); @@ -432,7 +430,7 @@ void CJS_Global::ObjectToArray(CJS_Runtime* pRuntime, continue; } if (v->IsString()) { - ByteString sValue = ByteString::FromUnicode(pRuntime->ToWideString(v)); + ByteString sValue = pRuntime->ToWideString(v).ToDefANSI(); CJS_KeyValue* pObjElement = new CJS_KeyValue; pObjElement->nType = JS_GlobalDataType::STRING; pObjElement->sKey = sKey; diff --git a/fxjs/cjs_publicmethods.cpp b/fxjs/cjs_publicmethods.cpp index a86ece3461..f2cd3ff7c1 100644 --- a/fxjs/cjs_publicmethods.cpp +++ b/fxjs/cjs_publicmethods.cpp @@ -327,7 +327,7 @@ v8::Local CJS_PublicMethods::AF_MakeArrayFromList( return pRuntime->ToArray(val); WideString wsStr = pRuntime->ToWideString(val); - ByteString t = ByteString::FromUnicode(wsStr); + ByteString t = wsStr.ToDefANSI(); const char* p = t.c_str(); int nIndex = 0; @@ -882,7 +882,7 @@ CJS_Return CJS_PublicMethods::AFNumber_Format( return CJS_Return(L"No event handler"); WideString& Value = pEvent->Value(); - ByteString strValue = StrTrim(ByteString::FromUnicode(Value)); + ByteString strValue = StrTrim(Value.ToDefANSI()); if (strValue.IsEmpty()) return CJS_Return(); @@ -1078,7 +1078,7 @@ CJS_Return CJS_PublicMethods::AFPercent_Format( return CJS_Return(JSMessage::kBadObjectError); WideString& Value = pEvent->Value(); - ByteString strValue = StrTrim(ByteString::FromUnicode(Value)); + ByteString strValue = StrTrim(Value.ToDefANSI()); if (strValue.IsEmpty()) return CJS_Return(); @@ -1683,7 +1683,7 @@ CJS_Return CJS_PublicMethods::AFRange_Validate( if (pEvent->Value().IsEmpty()) return CJS_Return(); - double dEentValue = atof(ByteString::FromUnicode(pEvent->Value()).c_str()); + double dEentValue = atof(pEvent->Value().ToDefANSI().c_str()); bool bGreaterThan = pRuntime->ToBoolean(params[0]); double dGreaterThan = pRuntime->ToDouble(params[1]); bool bLessThan = pRuntime->ToBoolean(params[2]); diff --git a/fxjs/cjs_runtime.cpp b/fxjs/cjs_runtime.cpp index 9329a483f4..6042538e30 100644 --- a/fxjs/cjs_runtime.cpp +++ b/fxjs/cjs_runtime.cpp @@ -234,7 +234,7 @@ v8::Local CJS_Runtime::MaybeCoerceToNumber( v8::Local value) { bool bAllowNaN = false; if (value->IsString()) { - ByteString bstr = ByteString::FromUnicode(ToWideString(value)); + ByteString bstr = ToWideString(value).ToDefANSI(); if (bstr.GetLength() == 0) return value; if (bstr == "NaN") diff --git a/xfa/fgas/font/cfgas_fontmgr.cpp b/xfa/fgas/font/cfgas_fontmgr.cpp index 5ab91b3cc6..f173bf24ae 100644 --- a/xfa/fgas/font/cfgas_fontmgr.cpp +++ b/xfa/fgas/font/cfgas_fontmgr.cpp @@ -416,8 +416,7 @@ ByteString CFX_FontSourceEnum_File::GetNextFile() { } ByteString bsName; bool bFolder; - ByteString bsFolderSeparator = - ByteString::FromUnicode(WideString(kFolderSeparator)); + ByteString bsFolderSeparator = WideString(kFolderSeparator).ToDefANSI(); while (true) { if (!FX_GetNextFile(pCurHandle, &bsName, &bFolder)) { FX_CloseFolder(pCurHandle); diff --git a/xfa/fgas/font/cfgas_gefont.cpp b/xfa/fgas/font/cfgas_gefont.cpp index 740504f088..b5cb7962af 100644 --- a/xfa/fgas/font/cfgas_gefont.cpp +++ b/xfa/fgas/font/cfgas_gefont.cpp @@ -76,7 +76,7 @@ bool CFGAS_GEFont::LoadFontInternal(const wchar_t* pszFontFamily, return false; ByteString csFontFamily; if (pszFontFamily) - csFontFamily = ByteString::FromUnicode(pszFontFamily); + csFontFamily = WideString(pszFontFamily).ToDefANSI(); int32_t iWeight = FontStyleIsBold(dwFontStyles) ? FXFONT_FW_BOLD : FXFONT_FW_NORMAL; diff --git a/xfa/fgas/font/cfgas_pdffontmgr.cpp b/xfa/fgas/font/cfgas_pdffontmgr.cpp index 80d6196d4e..4220d4d6c5 100644 --- a/xfa/fgas/font/cfgas_pdffontmgr.cpp +++ b/xfa/fgas/font/cfgas_pdffontmgr.cpp @@ -80,7 +80,7 @@ RetainPtr CFGAS_PDFFontMgr::GetFont( if (it != m_FontMap.end()) return it->second; - ByteString bsPsName = ByteString::FromUnicode(WideString(wsFontFamily)); + ByteString bsPsName = WideString(wsFontFamily).ToDefANSI(); bool bBold = FontStyleIsBold(dwFontStyles); bool bItalic = FontStyleIsItalic(dwFontStyles); ByteString strFontName = PsNameToFontName(bsPsName, bBold, bItalic); diff --git a/xfa/fxfa/parser/cxfa_node.cpp b/xfa/fxfa/parser/cxfa_node.cpp index 7ff205c6f4..8babe45ef3 100644 --- a/xfa/fxfa/parser/cxfa_node.cpp +++ b/xfa/fxfa/parser/cxfa_node.cpp @@ -214,7 +214,7 @@ RetainPtr XFA_LoadImageData(CXFA_FFDoc* pDoc, pdfium::MakeRetain(pImageBuffer, iRead, false); } } else { - bsContent = ByteString::FromUnicode(wsImage); + bsContent = wsImage.ToDefANSI(); pImageFileRead = pdfium::MakeRetain( const_cast(bsContent.raw_str()), bsContent.GetLength(), false); -- cgit v1.2.3