summary | refs | log | tree | commit | diff
path: root/core/fpdfapi/font/cpdf_cmap.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'core/fpdfapi/font/cpdf_cmap.cpp')
-rw-r--r-- core/fpdfapi/font/cpdf_cmap.cpp 488
1 files changed, 488 insertions, 0 deletions
diff --git a/core/fpdfapi/font/cpdf_cmap.cpp b/core/fpdfapi/font/cpdf_cmap.cpp
new file mode 100644
index 0000000000..55f5ccc5c5
--- /dev/null
+++ b/core/fpdfapi/font/cpdf_cmap.cpp
@@ -0,0 +1,488 @@
+// Copyright 2017 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fpdfapi/font/cpdf_cmap.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "core/fpdfapi/cmaps/cmap_int.h"
+#include "core/fpdfapi/font/cpdf_cmapmanager.h"
+#include "core/fpdfapi/font/cpdf_cmapparser.h"
+#include "core/fpdfapi/parser/cpdf_simple_parser.h"
+
+namespace {
+
+struct ByteRange {  // Closed interval of byte values; used for the lead-byte segments of PredefinedCMap.
+ uint8_t m_First;  // First byte value in the range.
+ uint8_t m_Last; // Last byte value in the range (inclusive).
+};
+
+struct PredefinedCMap {  // One row of the g_PredefinedCMaps table.
+ const char* m_pName;  // Name matched by LoadPredefined() after it strips the last two characters of the requested name.
+ CIDSet m_Charset;  // Copied into CPDF_CMap::m_Charset on a match.
+ CIDCoding m_Coding;  // Copied into CPDF_CMap::m_Coding on a match.
+ CPDF_CMap::CodingScheme m_CodingScheme;  // Copied into CPDF_CMap::m_CodingScheme on a match.
+ uint8_t m_LeadingSegCount;  // Number of entries of m_LeadingSegs that are in use.
+ ByteRange m_LeadingSegs[2];  // Lead-byte ranges; LoadPredefined() reads them only for MixedTwoBytes rows.
+};
+
+const PredefinedCMap g_PredefinedCMaps[] = {  // Searched linearly by name in CPDF_CMap::LoadPredefined().
+ {"GB-EUC",  // m_pName
+ CIDSET_GB1,  // m_Charset
+ CIDCODING_GB,  // m_Coding
+ CPDF_CMap::MixedTwoBytes,  // m_CodingScheme
+ 1,  // m_LeadingSegCount
+ {{0xa1, 0xfe}}},  // m_LeadingSegs: inclusive {first, last} lead-byte ranges
+ {"GBpc-EUC",
+ CIDSET_GB1,
+ CIDCODING_GB,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfc}}},
+ {"GBK-EUC",
+ CIDSET_GB1,
+ CIDCODING_GB,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"GBKp-EUC",
+ CIDSET_GB1,
+ CIDCODING_GB,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"GBK2K-EUC",
+ CIDSET_GB1,
+ CIDCODING_GB,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"GBK2K",
+ CIDSET_GB1,
+ CIDCODING_GB,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
+ {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
+ {"B5pc",
+ CIDSET_CNS1,
+ CIDCODING_BIG5,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfc}}},
+ {"HKscs-B5",
+ CIDSET_CNS1,
+ CIDCODING_BIG5,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x88, 0xfe}}},
+ {"ETen-B5",
+ CIDSET_CNS1,
+ CIDCODING_BIG5,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfe}}},
+ {"ETenms-B5",
+ CIDSET_CNS1,
+ CIDCODING_BIG5,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfe}}},
+ {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
+ {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
+ {"83pv-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"90ms-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"90msp-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"90pv-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"Add-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"EUC",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x8e, 0x8e}, {0xa1, 0xfe}}},
+ {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
+ {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
+ {"Ext-RKSJ",
+ CIDSET_JAPAN1,
+ CIDCODING_JIS,
+ CPDF_CMap::MixedTwoBytes,
+ 2,
+ {{0x81, 0x9f}, {0xe0, 0xfc}}},
+ {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
+ {"UniJIS-UCS2-HW",
+ CIDSET_JAPAN1,
+ CIDCODING_UCS2,
+ CPDF_CMap::TwoBytes,
+ 0,
+ {}},
+ {"UniJIS-UTF16",
+ CIDSET_JAPAN1,
+ CIDCODING_UTF16,
+ CPDF_CMap::TwoBytes,
+ 0,
+ {}},
+ {"KSC-EUC",
+ CIDSET_KOREA1,
+ CIDCODING_KOREA,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfe}}},
+ {"KSCms-UHC",
+ CIDSET_KOREA1,
+ CIDCODING_KOREA,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"KSCms-UHC-HW",
+ CIDSET_KOREA1,
+ CIDCODING_KOREA,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0x81, 0xfe}}},
+ {"KSCpc-EUC",
+ CIDSET_KOREA1,
+ CIDCODING_KOREA,
+ CPDF_CMap::MixedTwoBytes,
+ 1,
+ {{0xa1, 0xfd}}},
+ {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
+ {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
+};
+
+// Classifies the first |size| bytes of |codes| against |ranges|.
+//
+// Ranges are visited from the last entry to the first; a range whose
+// m_CharSize is smaller than |size| is skipped. The first visited range whose
+// bounds accept at least the leading byte decides the result:
+//   2 - |size| equals that range's m_CharSize,
+//   1 - that range describes a longer code, so more bytes are needed.
+// Returns 0 when no visited range accepts the leading byte.
+int CheckFourByteCodeRange(uint8_t* codes,
+                           int size,
+                           const std::vector<CPDF_CMap::CodeRange>& ranges) {
+  for (int seg = pdfium::CollectionSize<int>(ranges) - 1; seg >= 0; --seg) {
+    const auto& range = ranges[seg];
+    if (range.m_CharSize < size)
+      continue;
+
+    // Count how many leading bytes fall inside this range's bounds.
+    int matched = 0;
+    for (; matched < size; ++matched) {
+      if (codes[matched] < range.m_Lower[matched] ||
+          codes[matched] > range.m_Upper[matched]) {
+        break;
+      }
+    }
+
+    if (matched == range.m_CharSize)
+      return 2;
+    if (matched != 0)
+      return size == range.m_CharSize ? 2 : 1;
+  }
+  return 0;
+}
+
+// Determines how many bytes |charcode| occupies according to |ranges|.
+//
+// The low 16 bits of |charcode| are laid out big-endian in the last two slots
+// of a four-byte buffer whose first two slots are zero. Candidate sizes are
+// tried from 4 down to 1, each time comparing the trailing |size| bytes of
+// the buffer against the ranges (last range first). The first size for which
+// every byte lies inside a range of exactly that size is returned. Returns 1
+// when |ranges| is empty or nothing matches.
+int GetFourByteCharSizeImpl(uint32_t charcode,
+                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
+  if (ranges.empty())
+    return 1;
+
+  const uint8_t codes[4] = {0x00, 0x00,
+                            static_cast<uint8_t>((charcode >> 8) & 0xFF),
+                            static_cast<uint8_t>(charcode)};
+  for (int size = 4; size > 0; --size) {
+    const uint8_t* tail = codes + (4 - size);
+    for (int seg = pdfium::CollectionSize<int>(ranges) - 1; seg >= 0; --seg) {
+      const auto& range = ranges[seg];
+      if (range.m_CharSize < size)
+        continue;
+
+      int matched = 0;
+      while (matched < size && tail[matched] >= range.m_Lower[matched] &&
+             tail[matched] <= range.m_Upper[matched]) {
+        ++matched;
+      }
+      if (matched == range.m_CharSize)
+        return size;
+    }
+  }
+  return 1;
+}
+
+} // namespace
+
+CPDF_CMap::CPDF_CMap()  // Creates an empty CMap; LoadPredefined() or LoadEmbedded() fills it in.
+ : m_bLoaded(false),  // Set to true by LoadPredefined() on success.
+ m_bVertical(false),
+ m_Charset(CIDSET_UNKNOWN),
+ m_CodingScheme(TwoBytes),  // Default scheme until a load selects another.
+ m_Coding(CIDCODING_UNKNOWN),
+ m_pEmbedMap(nullptr) {}  // No embedded map until LoadPredefined() finds one.
+
+// Nothing to release explicitly; members clean up after themselves.
+CPDF_CMap::~CPDF_CMap() = default;
+
+// Configures this CMap from a predefined CMap name.
+//
+// "Identity-H" and "Identity-V" are handled directly: the coding becomes
+// CIDCODING_CID and the CMap is marked loaded. Any other name has its last
+// two characters removed (when longer than two characters) and is looked up
+// in g_PredefinedCMaps. On a match the charset, coding and coding scheme are
+// copied from the table, the lead-byte lookup is rebuilt for MixedTwoBytes
+// schemes, and the embedded CMap data is searched for |bsName|. The CMap is
+// marked loaded only if that search yields a map.
+//
+// |pMgr| and |bPromptCJK| are not used by this implementation.
+void CPDF_CMap::LoadPredefined(CPDF_CMapManager* pMgr,
+                               const CFX_ByteString& bsName,
+                               bool bPromptCJK) {
+  m_PredefinedCMap = bsName;
+  if (m_PredefinedCMap == "Identity-H" || m_PredefinedCMap == "Identity-V") {
+    m_Coding = CIDCODING_CID;
+    // Index 9 is the trailing 'H' or 'V' of "Identity-?".
+    m_bVertical = bsName[9] == 'V';
+    m_bLoaded = true;
+    return;
+  }
+
+  CFX_ByteString cmapid = m_PredefinedCMap;
+  m_bVertical = cmapid.Right(1) == "V";
+  if (cmapid.GetLength() > 2)
+    cmapid = cmapid.Left(cmapid.GetLength() - 2);
+
+  const PredefinedCMap* found = nullptr;
+  for (const PredefinedCMap& candidate : g_PredefinedCMaps) {
+    if (cmapid == CFX_ByteStringC(candidate.m_pName)) {
+      found = &candidate;
+      break;
+    }
+  }
+  if (!found)
+    return;
+
+  m_Charset = found->m_Charset;
+  m_Coding = found->m_Coding;
+  m_CodingScheme = found->m_CodingScheme;
+  if (m_CodingScheme == MixedTwoBytes) {
+    // One flag per possible byte value; true marks a two-byte lead byte.
+    m_MixedTwoByteLeadingBytes = std::vector<bool>(256);
+    for (uint32_t seg = 0; seg < found->m_LeadingSegCount; ++seg) {
+      const ByteRange& range = found->m_LeadingSegs[seg];
+      for (int lead = range.m_First; lead <= range.m_Last; ++lead)
+        m_MixedTwoByteLeadingBytes[lead] = true;
+    }
+  }
+
+  FPDFAPI_FindEmbeddedCMap(bsName, m_Charset, m_Coding, m_pEmbedMap);
+  if (m_pEmbedMap)
+    m_bLoaded = true;
+}
+
+// Parses CMap data from the |size| bytes at |pData|.
+//
+// The direct charcode-to-CID table is reset to 65536 zero entries, then every
+// word produced by the tokenizer is handed to a CPDF_CMapParser bound to this
+// object. If the resulting coding scheme is MixedFourBytes and the parser
+// collected additional mappings, they are taken over and sorted by ascending
+// m_EndCode.
+void CPDF_CMap::LoadEmbedded(const uint8_t* pData, uint32_t size) {
+  m_DirectCharcodeToCIDTable = std::vector<uint16_t>(65536);
+  CPDF_CMapParser parser(this);
+  CPDF_SimpleParser syntax(pData, size);
+  for (;;) {
+    CFX_ByteStringC word = syntax.GetWord();
+    if (word.IsEmpty())
+      break;
+    parser.ParseWord(word);
+  }
+
+  if (m_CodingScheme != MixedFourBytes || !parser.HasAdditionalMappings())
+    return;
+
+  m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
+  auto by_end_code = [](const CPDF_CMap::CIDRange& lhs,
+                        const CPDF_CMap::CIDRange& rhs) {
+    return lhs.m_EndCode < rhs.m_EndCode;
+  };
+  std::sort(m_AdditionalCharcodeToCIDMappings.begin(),
+            m_AdditionalCharcodeToCIDMappings.end(), by_end_code);
+}
+
+// Maps |charcode| to a CID.
+//
+// Resolution order:
+//   1. CIDCODING_CID: the charcode itself, truncated to 16 bits.
+//   2. An embedded map, if present, via FPDFAPI_CIDFromCharCode().
+//   3. No direct table: the charcode itself, truncated to 16 bits.
+//   4. Charcodes below 0x10000: the direct table entry.
+//   5. Otherwise the additional mappings are searched by m_EndCode; 0 is
+//      returned when no range contains |charcode|.
+uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
+  if (m_Coding == CIDCODING_CID)
+    return static_cast<uint16_t>(charcode);
+
+  if (m_pEmbedMap)
+    return FPDFAPI_CIDFromCharCode(m_pEmbedMap, charcode);
+
+  if (m_DirectCharcodeToCIDTable.empty())
+    return static_cast<uint16_t>(charcode);
+
+  if (charcode < 0x10000)
+    return m_DirectCharcodeToCIDTable[charcode];
+
+  // First range whose end code is not below |charcode|.
+  auto ends_before = [](const CPDF_CMap::CIDRange& range, uint32_t code) {
+    return range.m_EndCode < code;
+  };
+  const auto last = m_AdditionalCharcodeToCIDMappings.end();
+  const auto hit = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
+                                    last, charcode, ends_before);
+  if (hit != last && hit->m_StartCode <= charcode)
+    return hit->m_StartCID + charcode - hit->m_StartCode;
+  return 0;
+}
+
+// Decodes the character code that starts at |offset| in |pString| and
+// advances |offset| past the bytes that were consumed.
+//
+// |pString| holds |nStrLen| bytes. Reads never go beyond |nStrLen|: a byte
+// that would lie past the end is treated as 0 and |offset| is not advanced
+// for it, so a truncated multi-byte code (for example an odd-length two-byte
+// string or a trailing lead byte) cannot read outside the buffer.
+//
+// Returns the decoded code, or 0 when a MixedFourBytes sequence matches no
+// code-space range, runs out of input, or the coding scheme is not handled.
+uint32_t CPDF_CMap::GetNextChar(const char* pString,
+                                int nStrLen,
+                                int& offset) const {
+  auto* pBytes = reinterpret_cast<const uint8_t*>(pString);
+  // Consumes one byte if one is available; otherwise yields 0 and leaves
+  // |offset| untouched.
+  auto next_byte = [pBytes, nStrLen, &offset]() -> uint8_t {
+    if (offset < 0 || offset >= nStrLen)
+      return 0;
+    return pBytes[offset++];
+  };
+
+  switch (m_CodingScheme) {
+    case OneByte:
+      return next_byte();
+    case TwoBytes: {
+      const uint8_t byte1 = next_byte();
+      const uint8_t byte2 = next_byte();
+      return 256 * byte1 + byte2;
+    }
+    case MixedTwoBytes: {
+      const uint8_t byte1 = next_byte();
+      if (!m_MixedTwoByteLeadingBytes[byte1])
+        return byte1;
+      const uint8_t byte2 = next_byte();
+      return 256 * byte1 + byte2;
+    }
+    case MixedFourBytes: {
+      uint8_t codes[4];
+      int char_size = 1;
+      codes[0] = next_byte();
+      while (true) {
+        const int ret = CheckFourByteCodeRange(codes, char_size,
+                                               m_MixedFourByteLeadingRanges);
+        if (ret == 0)
+          return 0;
+        if (ret == 2) {
+          // Complete code: assemble the bytes big-endian.
+          uint32_t charcode = 0;
+          for (int i = 0; i < char_size; i++)
+            charcode = (charcode << 8) + codes[i];
+          return charcode;
+        }
+        // Partial match: stop if the code cannot grow or input is exhausted.
+        if (char_size == 4 || offset >= nStrLen)
+          return 0;
+        codes[char_size++] = next_byte();
+      }
+    }
+  }
+  return 0;
+}
+
+// Returns the number of bytes used to encode |charcode| under the current
+// coding scheme: always 1 for OneByte, always 2 for TwoBytes, and for the
+// mixed schemes the smallest byte count able to hold the value (capped at 2
+// for MixedTwoBytes and 4 for MixedFourBytes). Unknown schemes yield 1.
+int CPDF_CMap::GetCharSize(uint32_t charcode) const {
+  if (m_CodingScheme == TwoBytes)
+    return 2;
+
+  if (m_CodingScheme == MixedTwoBytes)
+    return charcode < 0x100 ? 1 : 2;
+
+  if (m_CodingScheme == MixedFourBytes) {
+    if (charcode < 0x100)
+      return 1;
+    if (charcode < 0x10000)
+      return 2;
+    return charcode < 0x1000000 ? 3 : 4;
+  }
+
+  // OneByte, and any scheme not listed above.
+  return 1;
+}
+
+// Counts the character codes contained in the |size| bytes at |pString|.
+//
+// OneByte: one code per byte. TwoBytes: one code per byte pair, rounding up.
+// MixedTwoBytes: a byte flagged as a lead byte takes the following byte with
+// it. MixedFourBytes: codes are decoded one at a time with GetNextChar().
+// Any other scheme returns |size|.
+int CPDF_CMap::CountChar(const char* pString, int size) const {
+  switch (m_CodingScheme) {
+    case OneByte:
+      return size;
+    case TwoBytes:
+      return (size + 1) / 2;
+    case MixedTwoBytes: {
+      auto* bytes = reinterpret_cast<const uint8_t*>(pString);
+      int total = 0;
+      int pos = 0;
+      while (pos < size) {
+        pos += m_MixedTwoByteLeadingBytes[bytes[pos]] ? 2 : 1;
+        ++total;
+      }
+      return total;
+    }
+    case MixedFourBytes: {
+      int total = 0;
+      for (int pos = 0; pos < size; ++total)
+        GetNextChar(pString, size, pos);
+      return total;
+    }
+  }
+  return size;
+}
+
+// Writes the byte encoding of |charcode| to |str| (most significant byte
+// first) and returns the number of bytes written. |str| must have room for
+// up to four bytes. Unknown coding schemes write nothing and return 0.
+//
+// For MixedFourBytes codes below 0x100, the width comes from
+// GetFourByteCharSizeImpl() and the value is left-padded with zero bytes.
+int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
+  switch (m_CodingScheme) {
+    case OneByte:
+      str[0] = static_cast<uint8_t>(charcode);
+      return 1;
+    case TwoBytes:
+      str[0] = static_cast<uint8_t>(charcode / 256);
+      str[1] = static_cast<uint8_t>(charcode % 256);
+      return 2;
+    case MixedTwoBytes: {
+      const bool single_byte =
+          charcode < 0x100 &&
+          !m_MixedTwoByteLeadingBytes[static_cast<uint8_t>(charcode)];
+      if (single_byte) {
+        str[0] = static_cast<uint8_t>(charcode);
+        return 1;
+      }
+      str[0] = static_cast<uint8_t>(charcode >> 8);
+      str[1] = static_cast<uint8_t>(charcode);
+      return 2;
+    }
+    case MixedFourBytes: {
+      if (charcode < 0x100) {
+        const int reported =
+            GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges);
+        const int width = reported == 0 ? 1 : reported;
+        // Zero padding first, then the value in the final slot.
+        for (int i = 0; i < width - 1; ++i)
+          str[i] = 0;
+        str[width - 1] = static_cast<uint8_t>(charcode);
+        return width;
+      }
+
+      int width = 4;
+      if (charcode < 0x10000)
+        width = 2;
+      else if (charcode < 0x1000000)
+        width = 3;
+      for (int i = 0; i < width; ++i) {
+        const int shift = 8 * (width - 1 - i);
+        str[i] = static_cast<uint8_t>(charcode >> shift);
+      }
+      return width;
+    }
+  }
+  return 0;
+}