summaryrefslogtreecommitdiff
path: root/source/pdf/pdf-unicode.c
blob: fc4409c4ccda6ddf18997dc8926806a8d6b1ff4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <string.h>

/* Load or synthesize ToUnicode map for fonts */

static void
pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid,
	unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt)
{
	unsigned int k;
	int ucsbuf[8];
	int ucslen;

	for (k = 0; k <= n; ++k)
	{
		ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf);
		if (ucslen == 1)
			pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]);
		else if (ucslen > 1)
			pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen);
	}
}

static pdf_cmap *
pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt)
{
	pdf_cmap *ucs_from_gid;
	unsigned int a, b, x;
	int i;

	ucs_from_gid = pdf_new_cmap(ctx);

	fz_try(ctx)
	{
		if (gid_from_cpt->usecmap)
			ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt);

		for (i = 0; i < gid_from_cpt->codespace_len; i++)
		{
			pdf_add_codespace(ctx, ucs_from_gid,
					gid_from_cpt->codespace[i].low,
					gid_from_cpt->codespace[i].high,
					gid_from_cpt->codespace[i].n);
		}

		for (i = 0; i < gid_from_cpt->rlen; ++i)
		{
			a = gid_from_cpt->ranges[i].low;
			b = gid_from_cpt->ranges[i].high;
			x = gid_from_cpt->ranges[i].out;
			pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
		}

		for (i = 0; i < gid_from_cpt->xlen; ++i)
		{
			a = gid_from_cpt->xranges[i].low;
			b = gid_from_cpt->xranges[i].high;
			x = gid_from_cpt->xranges[i].out;
			pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
		}

		/* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */

		pdf_sort_cmap(ctx, ucs_from_gid);
	}
	fz_catch(ctx)
	{
		pdf_drop_cmap(ctx, ucs_from_gid);
		fz_rethrow(ctx);
	}

	return ucs_from_gid;
}

void
pdf_load_to_unicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *font,
	const char **strings, char *collection, pdf_obj *cmapstm)
{
	unsigned int cpt;

	if (pdf_is_stream(ctx, cmapstm))
	{
		pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(ctx, doc, cmapstm);
		fz_try(ctx)
			font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
		fz_always(ctx)
			pdf_drop_cmap(ctx, ucs_from_cpt);
		fz_catch(ctx)
			fz_rethrow(ctx);
		font->size += pdf_cmap_size(ctx, font->to_unicode);
	}

	else if (collection)
	{
		if (!strcmp(collection, "Adobe-CNS1"))
			font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-CNS1-UCS2");
		else if (!strcmp(collection, "Adobe-GB1"))
			font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-GB1-UCS2");
		else if (!strcmp(collection, "Adobe-Japan1"))
			font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Japan1-UCS2");
		else if (!strcmp(collection, "Adobe-Korea1"))
			font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Korea1-UCS2");

		return;
	}

	if (strings)
	{
		/* TODO one-to-many mappings */

		font->cid_to_ucs_len = 256;
		font->cid_to_ucs = fz_malloc_array(ctx, 256, sizeof *font->cid_to_ucs);
		font->size += 256 * sizeof *font->cid_to_ucs;

		for (cpt = 0; cpt < 256; cpt++)
		{
			if (strings[cpt])
				font->cid_to_ucs[cpt] = pdf_lookup_agl(strings[cpt]);
			else
				font->cid_to_ucs[cpt] = FZ_REPLACEMENT_CHARACTER;
		}
	}

	if (!font->to_unicode && !font->cid_to_ucs)
	{
		/* TODO: synthesize a ToUnicode if it's a freetype font with
		 * cmap and/or post tables or if it has glyph names. */
	}
}