summaryrefslogtreecommitdiff
path: root/pdf/pdf_parse.c
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2011-04-04 18:18:16 +0200
committerTor Andersson <tor.andersson@artifex.com>2011-04-04 18:18:16 +0200
commitf81e5ab22ba18963e56aad43c1c7fa9826935f3d (patch)
treecf3b261e90df51014755a8d1395116f839f73c95 /pdf/pdf_parse.c
parentc8d226b5bfb5dab2db10ea5175966de7bac9640e (diff)
downloadmupdf-f81e5ab22ba18963e56aad43c1c7fa9826935f3d.tar.xz
pdf: Rename mupdf directory.
Diffstat (limited to 'pdf/pdf_parse.c')
-rw-r--r--pdf/pdf_parse.c538
1 files changed, 538 insertions, 0 deletions
diff --git a/pdf/pdf_parse.c b/pdf/pdf_parse.c
new file mode 100644
index 00000000..b32f4d01
--- /dev/null
+++ b/pdf/pdf_parse.c
@@ -0,0 +1,538 @@
+#include "fitz.h"
+#include "mupdf.h"
+
+fz_rect pdf_torect(fz_obj *array)
+{
+ fz_rect r;
+ float a = fz_toreal(fz_arrayget(array, 0));
+ float b = fz_toreal(fz_arrayget(array, 1));
+ float c = fz_toreal(fz_arrayget(array, 2));
+ float d = fz_toreal(fz_arrayget(array, 3));
+ r.x0 = MIN(a, c);
+ r.y0 = MIN(b, d);
+ r.x1 = MAX(a, c);
+ r.y1 = MAX(b, d);
+ return r;
+}
+
+fz_matrix pdf_tomatrix(fz_obj *array)
+{
+ fz_matrix m;
+ m.a = fz_toreal(fz_arrayget(array, 0));
+ m.b = fz_toreal(fz_arrayget(array, 1));
+ m.c = fz_toreal(fz_arrayget(array, 2));
+ m.d = fz_toreal(fz_arrayget(array, 3));
+ m.e = fz_toreal(fz_arrayget(array, 4));
+ m.f = fz_toreal(fz_arrayget(array, 5));
+ return m;
+}
+
+char *
+pdf_toutf8(fz_obj *src)
+{
+ unsigned char *srcptr = (unsigned char *) fz_tostrbuf(src);
+ char *dstptr, *dst;
+ int srclen = fz_tostrlen(src);
+ int dstlen = 0;
+ int ucs;
+ int i;
+
+ if (srclen > 2 && srcptr[0] == 254 && srcptr[1] == 255)
+ {
+ for (i = 2; i < srclen; i += 2)
+ {
+ ucs = (srcptr[i] << 8) | srcptr[i+1];
+ dstlen += runelen(ucs);
+ }
+
+ dstptr = dst = fz_malloc(dstlen + 1);
+
+ for (i = 2; i < srclen; i += 2)
+ {
+ ucs = (srcptr[i] << 8) | srcptr[i+1];
+ dstptr += runetochar(dstptr, &ucs);
+ }
+ }
+
+ else
+ {
+ for (i = 0; i < srclen; i++)
+ dstlen += runelen(pdf_docencoding[srcptr[i]]);
+
+ dstptr = dst = fz_malloc(dstlen + 1);
+
+ for (i = 0; i < srclen; i++)
+ {
+ ucs = pdf_docencoding[srcptr[i]];
+ dstptr += runetochar(dstptr, &ucs);
+ }
+ }
+
+ *dstptr = '\0';
+ return dst;
+}
+
+unsigned short *
+pdf_toucs2(fz_obj *src)
+{
+ unsigned char *srcptr = (unsigned char *) fz_tostrbuf(src);
+ unsigned short *dstptr, *dst;
+ int srclen = fz_tostrlen(src);
+ int i;
+
+ if (srclen > 2 && srcptr[0] == 254 && srcptr[1] == 255)
+ {
+ dstptr = dst = fz_calloc((srclen - 2) / 2 + 1, sizeof(short));
+ for (i = 2; i < srclen; i += 2)
+ *dstptr++ = (srcptr[i] << 8) | srcptr[i+1];
+ }
+
+ else
+ {
+ dstptr = dst = fz_calloc(srclen + 1, sizeof(short));
+ for (i = 0; i < srclen; i++)
+ *dstptr++ = pdf_docencoding[srcptr[i]];
+ }
+
+ *dstptr = '\0';
+ return dst;
+}
+
+fz_obj *
+pdf_toutf8name(fz_obj *src)
+{
+ char *buf = pdf_toutf8(src);
+ fz_obj *dst = fz_newname(buf);
+ fz_free(buf);
+ return dst;
+}
+
+fz_error
+pdf_parsearray(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
+{
+ fz_error error = fz_okay;
+ fz_obj *ary = nil;
+ fz_obj *obj = nil;
+ int a = 0, b = 0, n = 0;
+ int tok;
+ int len;
+
+ ary = fz_newarray(4);
+
+ while (1)
+ {
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(ary);
+ return fz_rethrow(error, "cannot parse array");
+ }
+
+ if (tok != PDF_TINT && tok != PDF_TR)
+ {
+ if (n > 0)
+ {
+ obj = fz_newint(a);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ }
+ if (n > 1)
+ {
+ obj = fz_newint(b);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ }
+ n = 0;
+ }
+
+ if (tok == PDF_TINT && n == 2)
+ {
+ obj = fz_newint(a);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ a = b;
+ n --;
+ }
+
+ switch (tok)
+ {
+ case PDF_TCARRAY:
+ *op = ary;
+ return fz_okay;
+
+ case PDF_TINT:
+ if (n == 0)
+ a = atoi(buf);
+ if (n == 1)
+ b = atoi(buf);
+ n ++;
+ break;
+
+ case PDF_TR:
+ if (n != 2)
+ {
+ fz_dropobj(ary);
+ return fz_throw("cannot parse indirect reference in array");
+ }
+ obj = fz_newindirect(a, b, xref);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ n = 0;
+ break;
+
+ case PDF_TOARRAY:
+ error = pdf_parsearray(&obj, xref, file, buf, cap);
+ if (error)
+ {
+ fz_dropobj(ary);
+ return fz_rethrow(error, "cannot parse array");
+ }
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+
+ case PDF_TODICT:
+ error = pdf_parsedict(&obj, xref, file, buf, cap);
+ if (error)
+ {
+ fz_dropobj(ary);
+ return fz_rethrow(error, "cannot parse array");
+ }
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+
+ case PDF_TNAME:
+ obj = fz_newname(buf);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+ case PDF_TREAL:
+ obj = fz_newreal(atof(buf));
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+ case PDF_TSTRING:
+ obj = fz_newstring(buf, len);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+ case PDF_TTRUE:
+ obj = fz_newbool(1);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+ case PDF_TFALSE:
+ obj = fz_newbool(0);
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+ case PDF_TNULL:
+ obj = fz_newnull();
+ fz_arraypush(ary, obj);
+ fz_dropobj(obj);
+ break;
+
+ default:
+ fz_dropobj(ary);
+ return fz_throw("cannot parse token in array");
+ }
+ }
+}
+
+fz_error
+pdf_parsedict(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
+{
+ fz_error error = fz_okay;
+ fz_obj *dict = nil;
+ fz_obj *key = nil;
+ fz_obj *val = nil;
+ int tok;
+ int len;
+ int a, b;
+
+ dict = fz_newdict(8);
+
+ while (1)
+ {
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+
+skip:
+ if (tok == PDF_TCDICT)
+ {
+ *op = dict;
+ return fz_okay;
+ }
+
+ /* for BI .. ID .. EI in content streams */
+ if (tok == PDF_TKEYWORD && !strcmp(buf, "ID"))
+ {
+ *op = dict;
+ return fz_okay;
+ }
+
+ if (tok != PDF_TNAME)
+ {
+ fz_dropobj(dict);
+ return fz_throw("invalid key in dict");
+ }
+
+ key = fz_newname(buf);
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+
+ switch (tok)
+ {
+ case PDF_TOARRAY:
+ error = pdf_parsearray(&val, xref, file, buf, cap);
+ if (error)
+ {
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+ break;
+
+ case PDF_TODICT:
+ error = pdf_parsedict(&val, xref, file, buf, cap);
+ if (error)
+ {
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+ break;
+
+ case PDF_TNAME: val = fz_newname(buf); break;
+ case PDF_TREAL: val = fz_newreal(atof(buf)); break;
+ case PDF_TSTRING: val = fz_newstring(buf, len); break;
+ case PDF_TTRUE: val = fz_newbool(1); break;
+ case PDF_TFALSE: val = fz_newbool(0); break;
+ case PDF_TNULL: val = fz_newnull(); break;
+
+ case PDF_TINT:
+ /* 64-bit to allow for numbers > INT_MAX and overflow */
+ a = (int) strtoll(buf, 0, 10);
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+ if (tok == PDF_TCDICT || tok == PDF_TNAME ||
+ (tok == PDF_TKEYWORD && !strcmp(buf, "ID")))
+ {
+ val = fz_newint(a);
+ fz_dictput(dict, key, val);
+ fz_dropobj(val);
+ fz_dropobj(key);
+ goto skip;
+ }
+ if (tok == PDF_TINT)
+ {
+ b = atoi(buf);
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_rethrow(error, "cannot parse dict");
+ }
+ if (tok == PDF_TR)
+ {
+ val = fz_newindirect(a, b, xref);
+ break;
+ }
+ }
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_throw("invalid indirect reference in dict");
+
+ default:
+ fz_dropobj(key);
+ fz_dropobj(dict);
+ return fz_throw("unknown token in dict");
+ }
+
+ fz_dictput(dict, key, val);
+ fz_dropobj(val);
+ fz_dropobj(key);
+ }
+}
+
+fz_error
+pdf_parsestmobj(fz_obj **op, pdf_xref *xref, fz_stream *file, char *buf, int cap)
+{
+ fz_error error;
+ int tok;
+ int len;
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse token in object stream");
+
+ switch (tok)
+ {
+ case PDF_TOARRAY:
+ error = pdf_parsearray(op, xref, file, buf, cap);
+ if (error)
+ return fz_rethrow(error, "cannot parse object stream");
+ break;
+ case PDF_TODICT:
+ error = pdf_parsedict(op, xref, file, buf, cap);
+ if (error)
+ return fz_rethrow(error, "cannot parse object stream");
+ break;
+ case PDF_TNAME: *op = fz_newname(buf); break;
+ case PDF_TREAL: *op = fz_newreal(atof(buf)); break;
+ case PDF_TSTRING: *op = fz_newstring(buf, len); break;
+ case PDF_TTRUE: *op = fz_newbool(1); break;
+ case PDF_TFALSE: *op = fz_newbool(0); break;
+ case PDF_TNULL: *op = fz_newnull(); break;
+ case PDF_TINT: *op = fz_newint(atoi(buf)); break;
+ default: return fz_throw("unknown token in object stream");
+ }
+
+ return fz_okay;
+}
+
+fz_error
+pdf_parseindobj(fz_obj **op, pdf_xref *xref,
+ fz_stream *file, char *buf, int cap,
+ int *onum, int *ogen, int *ostmofs)
+{
+ fz_error error = fz_okay;
+ fz_obj *obj = nil;
+ int num = 0, gen = 0, stmofs;
+ int tok;
+ int len;
+ int a, b;
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ if (tok != PDF_TINT)
+ return fz_throw("expected object number (%d %d R)", num, gen);
+ num = atoi(buf);
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ if (tok != PDF_TINT)
+ return fz_throw("expected generation number (%d %d R)", num, gen);
+ gen = atoi(buf);
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ if (tok != PDF_TOBJ)
+ return fz_throw("expected 'obj' keyword (%d %d R)", num, gen);
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+
+ switch (tok)
+ {
+ case PDF_TOARRAY:
+ error = pdf_parsearray(&obj, xref, file, buf, cap);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ break;
+
+ case PDF_TODICT:
+ error = pdf_parsedict(&obj, xref, file, buf, cap);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ break;
+
+ case PDF_TNAME: obj = fz_newname(buf); break;
+ case PDF_TREAL: obj = fz_newreal(atof(buf)); break;
+ case PDF_TSTRING: obj = fz_newstring(buf, len); break;
+ case PDF_TTRUE: obj = fz_newbool(1); break;
+ case PDF_TFALSE: obj = fz_newbool(0); break;
+ case PDF_TNULL: obj = fz_newnull(); break;
+
+ case PDF_TINT:
+ a = atoi(buf);
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ if (tok == PDF_TSTREAM || tok == PDF_TENDOBJ)
+ {
+ obj = fz_newint(a);
+ goto skip;
+ }
+ if (tok == PDF_TINT)
+ {
+ b = atoi(buf);
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ if (tok == PDF_TR)
+ {
+ obj = fz_newindirect(a, b, xref);
+ break;
+ }
+ }
+ return fz_throw("expected 'R' keyword (%d %d R)", num, gen);
+
+ case PDF_TENDOBJ:
+ obj = fz_newnull();
+ goto skip;
+
+ default:
+ return fz_throw("syntax error in object (%d %d R)", num, gen);
+ }
+
+ error = pdf_lex(&tok, file, buf, cap, &len);
+ if (error)
+ {
+ fz_dropobj(obj);
+ return fz_rethrow(error, "cannot parse indirect object (%d %d R)", num, gen);
+ }
+
+skip:
+ if (tok == PDF_TSTREAM)
+ {
+ int c = fz_readbyte(file);
+ while (c == ' ')
+ c = fz_readbyte(file);
+ if (c == '\r')
+ {
+ c = fz_peekbyte(file);
+ if (c != '\n')
+ fz_warn("line feed missing after stream begin marker (%d %d R)", num, gen);
+ else
+ fz_readbyte(file);
+ }
+ stmofs = fz_tell(file);
+ }
+ else if (tok == PDF_TENDOBJ)
+ {
+ stmofs = 0;
+ }
+ else
+ {
+ fz_warn("expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
+ stmofs = 0;
+ }
+
+ if (onum) *onum = num;
+ if (ogen) *ogen = gen;
+ if (ostmofs) *ostmofs = stmofs;
+ *op = obj;
+ return fz_okay;
+}