diff options
Diffstat (limited to 'core')
-rw-r--r-- | core/fxcrt/cfx_seekablestreamproxy.cpp | 307 | ||||
-rw-r--r-- | core/fxcrt/cfx_seekablestreamproxy.h | 54 | ||||
-rw-r--r-- | core/fxcrt/fx_codepage.h | 137 |
3 files changed, 498 insertions, 0 deletions
diff --git a/core/fxcrt/cfx_seekablestreamproxy.cpp b/core/fxcrt/cfx_seekablestreamproxy.cpp
new file mode 100644
index 0000000000..fe6b8dd375
--- /dev/null
+++ b/core/fxcrt/cfx_seekablestreamproxy.cpp
@@ -0,0 +1,331 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/cfx_seekablestreamproxy.h"
+
+#if _FX_OS_ == _FX_WIN32_DESKTOP_ || _FX_OS_ == _FX_WIN32_MOBILE_ || \
+    _FX_OS_ == _FX_WIN64_
+#include <io.h>
+#endif
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "core/fxcrt/fx_codepage.h"
+#include "core/fxcrt/fx_ext.h"
+#include "third_party/base/ptr_util.h"
+#include "third_party/base/stl_util.h"
+
+namespace {
+
+// Decodes UTF-8 bytes from |pSrc| into wide characters stored at |pDst|.
+// Stops at the end of the input, once |dstLen| characters are stored, or at
+// a byte that cannot be decoded (a stray continuation byte, 0xFE or 0xFF).
+// A sequence cut short by the end of the input is not counted as consumed,
+// so the caller can re-read it together with the bytes that follow.
+// Returns {src bytes consumed, dst characters produced}.
+std::pair<FX_STRSIZE, FX_STRSIZE> UTF8Decode(const char* pSrc,
+                                             FX_STRSIZE srcLen,
+                                             wchar_t* pDst,
+                                             FX_STRSIZE dstLen) {
+  ASSERT(pDst && dstLen > 0);
+
+  if (srcLen < 1)
+    return {0, 0};
+
+  uint32_t dwCode = 0;
+  int32_t iPending = 0;  // Continuation bytes still expected.
+  FX_STRSIZE iSrcNum = 0;
+  FX_STRSIZE iDstNum = 0;
+  for (FX_STRSIZE iIndex = 0; iIndex < srcLen; ++iIndex) {
+    const uint8_t byte = static_cast<uint8_t>(pSrc[iIndex]);
+    if (byte < 0x80) {
+      // ASCII. Any unfinished multi-byte sequence before it is dropped, and
+      // its bytes count as consumed so that they are never decoded twice.
+      iPending = 0;
+      *pDst++ = byte;
+      iSrcNum = iIndex + 1;
+      if (++iDstNum >= dstLen)
+        break;
+    } else if (byte < 0xc0) {
+      if (iPending < 1)
+        break;  // Continuation byte with no sequence in progress.
+
+      iPending--;
+      dwCode |= (byte & 0x3f) << (iPending * 6);
+      if (iPending == 0) {
+        *pDst++ = dwCode;
+        iSrcNum = iIndex + 1;
+        if (++iDstNum >= dstLen)
+          break;
+      }
+    } else if (byte < 0xe0) {
+      iPending = 1;
+      dwCode = (byte & 0x1f) << 6;
+    } else if (byte < 0xf0) {
+      iPending = 2;
+      dwCode = (byte & 0x0f) << 12;
+    } else if (byte < 0xf8) {
+      iPending = 3;
+      dwCode = (byte & 0x07) << 18;
+    } else if (byte < 0xfc) {
+      iPending = 4;
+      dwCode = (byte & 0x03) << 24;
+    } else if (byte < 0xfe) {
+      iPending = 5;
+      dwCode = (byte & 0x01) << 30;
+    } else {
+      break;
+    }
+  }
+  return {iSrcNum, iDstNum};
+}
+
+// Widens the |iLength| UTF-16 code units at the start of |pBuffer| into
+// wchar_t elements in place. Does nothing when wchar_t is 16 bits wide.
+void UTF16ToWChar(void* pBuffer, FX_STRSIZE iLength) {
+  ASSERT(pBuffer && iLength > 0);
+
+  if (sizeof(wchar_t) == 2)
+    return;
+
+  uint16_t* pSrc = static_cast<uint16_t*>(pBuffer);
+  wchar_t* pDst = static_cast<wchar_t*>(pBuffer);
+  // Back to front, so no unread code unit is overwritten by a wider element.
+  while (--iLength >= 0)
+    pDst[iLength] = static_cast<wchar_t>(pSrc[iLength]);
+}
+
+// Swaps the two low-order bytes of the first |iLength| characters of |pStr|.
+// A negative |iLength| is replaced by FXSYS_wcslen(pStr).
+void SwapByteOrder(wchar_t* pStr, FX_STRSIZE iLength) {
+  ASSERT(pStr);
+
+  if (iLength < 0)
+    iLength = FXSYS_wcslen(pStr);
+
+  // Each element carries one 16-bit code unit, whatever the width of
+  // wchar_t. Both bytes of the swapped value must be kept; masking it down
+  // to 8 bits would corrupt every character above U+00FF.
+  for (; iLength > 0; --iLength, ++pStr) {
+    const uint16_t wch = static_cast<uint16_t>(*pStr);
+    *pStr = static_cast<uint16_t>((wch >> 8) | (wch << 8));
+  }
+}
+
+}  // namespace
+
+#if _FX_ENDIAN_ == _FX_LITTLE_ENDIAN_
+#define BOM_MASK 0x00FFFFFF
+#define BOM_UTF8 0x00BFBBEF
+#define BOM_UTF16_MASK 0x0000FFFF
+#define BOM_UTF16_BE 0x0000FFFE
+#define BOM_UTF16_LE 0x0000FEFF
+#else
+#define BOM_MASK 0xFFFFFF00
+#define BOM_UTF8 0xEFBBBF00
+#define BOM_UTF16_MASK 0xFFFF0000
+#define BOM_UTF16_BE 0xFEFF0000
+#define BOM_UTF16_LE 0xFFFE0000
+#endif  // _FX_ENDIAN_ == _FX_LITTLE_ENDIAN_
+
+// Wraps |stream|. A write stream starts positioned at the end of |stream|.
+// A read stream is checked for a UTF-8 or UTF-16 byte order mark, which then
+// fixes the code page; without one the code page is FXSYS_GetACP().
+CFX_SeekableStreamProxy::CFX_SeekableStreamProxy(
+    const CFX_RetainPtr<IFX_SeekableStream>& stream,
+    bool isWriteStream)
+    : m_IsWriteStream(isWriteStream),
+      m_wCodePage(FX_CODEPAGE_DefANSI),
+      m_wBOMLength(0),
+      m_iPosition(0),
+      m_pStream(stream) {
+  ASSERT(m_pStream);
+
+  if (isWriteStream) {
+    m_iPosition = m_pStream->GetSize();
+    return;
+  }
+
+  FX_FILESIZE iPosition = GetPosition();
+  Seek(CFX_SeekableStreamProxy::Pos::Begin, 0);
+
+  // Zero-initialized because only three bytes are requested, and ReadData()
+  // delivers fewer (or none) when the stream is shorter than that.
+  uint32_t bom = 0;
+  ReadData(reinterpret_cast<uint8_t*>(&bom), 3);
+
+  bom &= BOM_MASK;
+  if (bom == BOM_UTF8) {
+    m_wBOMLength = 3;
+    m_wCodePage = FX_CODEPAGE_UTF8;
+  } else {
+    bom &= BOM_UTF16_MASK;
+    if (bom == BOM_UTF16_BE) {
+      m_wBOMLength = 2;
+      m_wCodePage = FX_CODEPAGE_UTF16BE;
+    } else if (bom == BOM_UTF16_LE) {
+      m_wBOMLength = 2;
+      m_wCodePage = FX_CODEPAGE_UTF16LE;
+    } else {
+      m_wBOMLength = 0;
+      m_wCodePage = FXSYS_GetACP();
+    }
+  }
+
+  Seek(CFX_SeekableStreamProxy::Pos::Begin,
+       std::max(static_cast<FX_FILESIZE>(m_wBOMLength), iPosition));
+}
+
+CFX_SeekableStreamProxy::CFX_SeekableStreamProxy(uint8_t* data, FX_STRSIZE size)
+    : CFX_SeekableStreamProxy(IFX_MemoryStream::Create(data, size), false) {}
+
+CFX_SeekableStreamProxy::~CFX_SeekableStreamProxy() {}
+
+// Moves the read/write position, clamped to [0, GetLength()].
+void CFX_SeekableStreamProxy::Seek(CFX_SeekableStreamProxy::Pos eSeek,
+                                   FX_FILESIZE iOffset) {
+  switch (eSeek) {
+    case CFX_SeekableStreamProxy::Pos::Begin:
+      m_iPosition = iOffset;
+      break;
+    case CFX_SeekableStreamProxy::Pos::Current:
+      m_iPosition += iOffset;
+      break;
+  }
+  m_iPosition =
+      pdfium::clamp(m_iPosition, static_cast<FX_FILESIZE>(0), GetLength());
+}
+
+// Sets the code page unless a byte order mark has already fixed it.
+void CFX_SeekableStreamProxy::SetCodePage(uint16_t wCodePage) {
+  if (m_wBOMLength > 0)
+    return;
+  m_wCodePage = wCodePage;
+}
+
+// Reads up to |iBufferSize| bytes at the current position into |pBuffer| and
+// advances the position. Returns the number of bytes read, 0 at the end of
+// the stream or on failure, and -1 for a write stream.
+FX_STRSIZE CFX_SeekableStreamProxy::ReadData(uint8_t* pBuffer,
+                                             FX_STRSIZE iBufferSize) {
+  ASSERT(pBuffer && iBufferSize > 0);
+
+  if (m_IsWriteStream)
+    return -1;
+
+  // Compare as FX_FILESIZE: if that type is wider than FX_STRSIZE, narrowing
+  // the remainder before the comparison could change its value.
+  const FX_FILESIZE iRemaining = m_pStream->GetSize() - m_iPosition;
+  if (iRemaining <= 0)
+    return 0;
+  if (iRemaining < iBufferSize)
+    iBufferSize = static_cast<FX_STRSIZE>(iRemaining);
+  if (iBufferSize <= 0)
+    return 0;
+
+  if (!m_pStream->ReadBlock(pBuffer, m_iPosition, iBufferSize))
+    return 0;
+
+  pdfium::base::CheckedNumeric<FX_FILESIZE> new_pos = m_iPosition;
+  new_pos += iBufferSize;
+  if (!new_pos.IsValid())
+    return 0;
+
+  m_iPosition = new_pos.ValueOrDie();
+  return iBufferSize;
+}
+
+// Decodes up to |iMaxLength| characters into |pStr| using the current code
+// page and stores IsEOF() in |*bEOS|. Returns the number of characters
+// stored, or -1 (leaving |*bEOS| untouched) for a write stream or when bytes
+// remain and the code page is neither UTF-8 nor UTF-16.
+FX_STRSIZE CFX_SeekableStreamProxy::ReadString(wchar_t* pStr,
+                                               FX_STRSIZE iMaxLength,
+                                               bool* bEOS) {
+  ASSERT(pStr && iMaxLength > 0);
+
+  if (m_IsWriteStream)
+    return -1;
+
+  if (m_wCodePage == FX_CODEPAGE_UTF16LE ||
+      m_wCodePage == FX_CODEPAGE_UTF16BE) {
+    const FX_STRSIZE iBytes = iMaxLength * 2;
+    const FX_STRSIZE iLen = ReadData(reinterpret_cast<uint8_t*>(pStr), iBytes);
+    iMaxLength = iLen / 2;
+    // Nothing to widen at the end of the stream; UTF16ToWChar() requires a
+    // positive length.
+    if (sizeof(wchar_t) > 2 && iMaxLength > 0)
+      UTF16ToWChar(pStr, iMaxLength);
+
+#if _FX_ENDIAN_ == _FX_BIG_ENDIAN_
+    if (m_wCodePage == FX_CODEPAGE_UTF16LE)
+      SwapByteOrder(pStr, iMaxLength);
+#else
+    if (m_wCodePage == FX_CODEPAGE_UTF16BE)
+      SwapByteOrder(pStr, iMaxLength);
+#endif
+
+  } else {
+    // Same FX_FILESIZE comparison as in ReadData(), for the same reason.
+    const FX_FILESIZE iRemaining = GetLength() - GetPosition();
+    FX_STRSIZE iBytes = 0;
+    if (iRemaining > 0) {
+      iBytes = iRemaining < iMaxLength ? static_cast<FX_STRSIZE>(iRemaining)
+                                       : iMaxLength;
+    }
+
+    if (iBytes > 0) {
+      std::vector<uint8_t> buf(iBytes);
+
+      FX_STRSIZE iLen = ReadData(buf.data(), iBytes);
+      // Only UTF-8 is decoded here. The bytes read above stay consumed.
+      if (m_wCodePage != FX_CODEPAGE_UTF8)
+        return -1;
+
+      FX_STRSIZE iSrc = 0;
+      std::tie(iSrc, iMaxLength) = UTF8Decode(
+          reinterpret_cast<const char*>(buf.data()), iLen, pStr, iMaxLength);
+      // Step back over bytes that were read but not decoded.
+      Seek(CFX_SeekableStreamProxy::Pos::Current, iSrc - iLen);
+    } else {
+      iMaxLength = 0;
+    }
+  }
+
+  *bEOS = IsEOF();
+  return iMaxLength;
+}
+
+// Writes |str| at the current position of a write stream and advances the
+// position. Does nothing for a read stream, an empty string, or a code page
+// other than UTF-8. The characters are handed to WriteBlock() as raw wchar_t
+// storage; no encoding conversion is done in this function.
+void CFX_SeekableStreamProxy::WriteString(const CFX_WideStringC& str) {
+  if (!m_IsWriteStream || str.GetLength() == 0 ||
+      m_wCodePage != FX_CODEPAGE_UTF8) {
+    return;
+  }
+  if (!m_pStream->WriteBlock(str.c_str(), m_iPosition,
+                             str.GetLength() * sizeof(wchar_t))) {
+    return;
+  }
+
+  // m_iPosition is an FX_FILESIZE, so the overflow check uses that type, as
+  // ReadData() does.
+  pdfium::base::CheckedNumeric<FX_FILESIZE> new_pos = m_iPosition;
+  new_pos += str.GetLength() * sizeof(wchar_t);
+  if (!new_pos.IsValid()) {
+    m_iPosition = std::numeric_limits<FX_FILESIZE>::max();
+    return;
+  }
+
+  m_iPosition = new_pos.ValueOrDie();
+}
diff --git a/core/fxcrt/cfx_seekablestreamproxy.h b/core/fxcrt/cfx_seekablestreamproxy.h
new file mode 100644
index 0000000000..d059fb8956
--- /dev/null
+++ b/core/fxcrt/cfx_seekablestreamproxy.h
@@ -0,0 +1,54 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_CFX_SEEKABLESTREAMPROXY_H_
+#define CORE_FXCRT_CFX_SEEKABLESTREAMPROXY_H_
+
+#include <algorithm>
+
+#include "core/fxcrt/cfx_retain_ptr.h"
+#include "core/fxcrt/fx_stream.h"
+#include "core/fxcrt/fx_system.h"
+
+// Text-oriented view of an IFX_SeekableStream that keeps its own position and
+// code page. Either reads strings or writes them, as chosen at construction.
+class CFX_SeekableStreamProxy : public CFX_Retainable {
+ public:
+  // Origin that the offset passed to Seek() is relative to.
+  enum class Pos { Begin = 0, Current };
+
+  template <typename T, typename... Args>
+  friend CFX_RetainPtr<T> pdfium::MakeRetain(Args&&... args);
+
+  FX_FILESIZE GetLength() const { return m_pStream->GetSize(); }
+  FX_FILESIZE GetPosition() const { return m_iPosition; }
+  FX_STRSIZE GetBOMLength() const { return std::max(0, m_wBOMLength); }
+  bool IsEOF() const { return m_iPosition >= GetLength(); }
+
+  void Seek(CFX_SeekableStreamProxy::Pos eSeek, FX_FILESIZE iOffset);
+  FX_STRSIZE ReadString(wchar_t* pStr, FX_STRSIZE iMaxLength, bool* bEOS);
+  void WriteString(const CFX_WideStringC& str);
+
+  uint16_t GetCodePage() const { return m_wCodePage; }
+  void SetCodePage(uint16_t wCodePage);
+
+ private:
+  // Private; the pdfium::MakeRetain() friend above is what may construct.
+  CFX_SeekableStreamProxy(const CFX_RetainPtr<IFX_SeekableStream>& stream,
+                          bool isWriteStream);
+  CFX_SeekableStreamProxy(uint8_t* data, FX_STRSIZE size);
+  ~CFX_SeekableStreamProxy() override;
+
+  FX_STRSIZE ReadData(uint8_t* pBuffer, FX_STRSIZE iBufferSize);
+
+  bool m_IsWriteStream;
+  uint16_t m_wCodePage;
+  FX_STRSIZE m_wBOMLength;  // Byte order mark length in bytes; 0 if none.
+  FX_FILESIZE m_iPosition;  // Current byte offset into |m_pStream|.
+  CFX_RetainPtr<IFX_SeekableStream> m_pStream;
+};
+
+#endif  // CORE_FXCRT_CFX_SEEKABLESTREAMPROXY_H_
diff --git a/core/fxcrt/fx_codepage.h b/core/fxcrt/fx_codepage.h
new file mode 100644
index 0000000000..db8655dbf6
--- /dev/null
+++ b/core/fxcrt/fx_codepage.h
@@ -0,0 +1,137 @@
+// Copyright 2014 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc.
http://www.foxitsoftware.com + +#ifndef CORE_FXCRT_FX_CODEPAGE_H_ +#define CORE_FXCRT_FX_CODEPAGE_H_ + +#include "core/fxcrt/fx_basic.h" + +#define FX_CODEPAGE_DefANSI 0 +#define FX_CODEPAGE_DefOEM 1 +#define FX_CODEPAGE_DefMAC 2 +#define FX_CODEPAGE_Thread 3 +#define FX_CODEPAGE_Symbol 42 +#define FX_CODEPAGE_MSDOS_US 437 +#define FX_CODEPAGE_Arabic_ASMO708 708 +#define FX_CODEPAGE_Arabic_ASMO449Plus 709 +#define FX_CODEPAGE_Arabic_Transparent 710 +#define FX_CODEPAGE_Arabic_NafithaEnhanced 711 +#define FX_CODEPAGE_Arabic_TransparentASMO 720 +#define FX_CODEPAGE_MSDOS_Greek1 737 +#define FX_CODEPAGE_MSDOS_Baltic 775 +#define FX_CODEPAGE_MSWin31_WesternEuropean 819 +#define FX_CODEPAGE_MSDOS_WesternEuropean 850 +#define FX_CODEPAGE_MSDOS_EasternEuropean 852 +#define FX_CODEPAGE_MSDOS_Latin3 853 +#define FX_CODEPAGE_MSDOS_Cyrillic 855 +#define FX_CODEPAGE_MSDOS_Turkish 857 +#define FX_CODEPAGE_MSDOS_Latin1Euro 858 +#define FX_CODEPAGE_MSDOS_Portuguese 860 +#define FX_CODEPAGE_MSDOS_Icelandic 861 +#define FX_CODEPAGE_MSDOS_Hebrew 862 +#define FX_CODEPAGE_MSDOS_FrenchCanadian 863 +#define FX_CODEPAGE_MSDOS_Arabic 864 +#define FX_CODEPAGE_MSDOS_Norwegian 865 +#define FX_CODEPAGE_MSDOS_Russian 866 +#define FX_CODEPAGE_MSDOS_Greek2 869 +#define FX_CODEPAGE_MSDOS_Thai 874 +#define FX_CODEPAGE_MSDOS_KamenickyCS 895 +#define FX_CODEPAGE_ShiftJIS 932 +#define FX_CODEPAGE_ChineseSimplified 936 +#define FX_CODEPAGE_Korean 949 +#define FX_CODEPAGE_ChineseTraditional 950 +#define FX_CODEPAGE_UTF16LE 1200 +#define FX_CODEPAGE_UTF16BE 1201 +#define FX_CODEPAGE_MSWin_EasternEuropean 1250 +#define FX_CODEPAGE_MSWin_Cyrillic 1251 +#define FX_CODEPAGE_MSWin_WesternEuropean 1252 +#define FX_CODEPAGE_MSWin_Greek 1253 +#define FX_CODEPAGE_MSWin_Turkish 1254 +#define FX_CODEPAGE_MSWin_Hebrew 1255 +#define FX_CODEPAGE_MSWin_Arabic 1256 +#define FX_CODEPAGE_MSWin_Baltic 1257 +#define FX_CODEPAGE_MSWin_Vietnamese 1258 +#define FX_CODEPAGE_Johab 1361 +#define FX_CODEPAGE_MAC_Roman 10000 
+#define FX_CODEPAGE_MAC_ShiftJIS 10001 +#define FX_CODEPAGE_MAC_ChineseTraditional 10002 +#define FX_CODEPAGE_MAC_Korean 10003 +#define FX_CODEPAGE_MAC_Arabic 10004 +#define FX_CODEPAGE_MAC_Hebrew 10005 +#define FX_CODEPAGE_MAC_Greek 10006 +#define FX_CODEPAGE_MAC_Cyrillic 10007 +#define FX_CODEPAGE_MAC_ChineseSimplified 10008 +#define FX_CODEPAGE_MAC_Thai 10021 +#define FX_CODEPAGE_MAC_EasternEuropean 10029 +#define FX_CODEPAGE_MAC_Turkish 10081 +#define FX_CODEPAGE_UTF32LE 12000 +#define FX_CODEPAGE_UTF32BE 12001 +#define FX_CODEPAGE_ISO8859_1 28591 +#define FX_CODEPAGE_ISO8859_2 28592 +#define FX_CODEPAGE_ISO8859_3 28593 +#define FX_CODEPAGE_ISO8859_4 28594 +#define FX_CODEPAGE_ISO8859_5 28595 +#define FX_CODEPAGE_ISO8859_6 28596 +#define FX_CODEPAGE_ISO8859_7 28597 +#define FX_CODEPAGE_ISO8859_8 28598 +#define FX_CODEPAGE_ISO8859_9 28599 +#define FX_CODEPAGE_ISO8859_10 28600 +#define FX_CODEPAGE_ISO8859_11 28601 +#define FX_CODEPAGE_ISO8859_12 28602 +#define FX_CODEPAGE_ISO8859_13 28603 +#define FX_CODEPAGE_ISO8859_14 28604 +#define FX_CODEPAGE_ISO8859_15 28605 +#define FX_CODEPAGE_ISO8859_16 28606 +#define FX_CODEPAGE_ISCII_Devanagari 57002 +#define FX_CODEPAGE_ISCII_Bengali 57003 +#define FX_CODEPAGE_ISCII_Tamil 57004 +#define FX_CODEPAGE_ISCII_Telugu 57005 +#define FX_CODEPAGE_ISCII_Assamese 57006 +#define FX_CODEPAGE_ISCII_Oriya 57007 +#define FX_CODEPAGE_ISCII_Kannada 57008 +#define FX_CODEPAGE_ISCII_Malayalam 57009 +#define FX_CODEPAGE_ISCII_Gujarati 57010 +#define FX_CODEPAGE_ISCII_Punjabi 57011 +#define FX_CODEPAGE_UTF7 65000 +#define FX_CODEPAGE_UTF8 65001 + +#define FX_CHARSET_ANSI 0 +#define FX_CHARSET_Default 1 +#define FX_CHARSET_Symbol 2 +#define FX_CHARSET_MAC_Roman 77 +#define FX_CHARSET_MAC_ShiftJIS 78 +#define FX_CHARSET_MAC_Korean 79 +#define FX_CHARSET_MAC_ChineseSimplified 80 +#define FX_CHARSET_MAC_ChineseTriditional 81 +#define FX_CHARSET_MAC_Johab 82 +#define FX_CHARSET_MAC_Hebrew 83 +#define FX_CHARSET_MAC_Arabic 84 +#define 
FX_CHARSET_MAC_Greek 85 +#define FX_CHARSET_MAC_Turkish 86 +#define FX_CHARSET_MAC_Thai 87 +#define FX_CHARSET_MAC_EasternEuropean 88 +#define FX_CHARSET_MAC_Cyrillic 89 +#define FX_CHARSET_ShiftJIS 128 +#define FX_CHARSET_Korean 129 +#define FX_CHARSET_Johab 130 +#define FX_CHARSET_ChineseSimplified 134 +#define FX_CHARSET_ChineseTriditional 136 +#define FX_CHARSET_MSWin_Greek 161 +#define FX_CHARSET_MSWin_Turkish 162 +#define FX_CHARSET_MSWin_Vietnamese 163 +#define FX_CHARSET_MSWin_Hebrew 177 +#define FX_CHARSET_MSWin_Arabic 178 +#define FX_CHARSET_ArabicTraditional 179 +#define FX_CHARSET_ArabicUser 180 +#define FX_CHARSET_HebrewUser 181 +#define FX_CHARSET_MSWin_Baltic 186 +#define FX_CHARSET_MSWin_Cyrillic 204 +#define FX_CHARSET_Thai 222 +#define FX_CHARSET_MSWin_EasterEuropean 238 +#define FX_CHARSET_US 254 +#define FX_CHARSET_OEM 255 + +#endif // CORE_FXCRT_FX_CODEPAGE_H_ |