summaryrefslogtreecommitdiff
path: root/pdf
diff options
context:
space:
mode:
author: Robin Watts <robin.watts@artifex.com> 2012-02-23 15:07:04 +0000
committer: Robin Watts <robin.watts@artifex.com> 2012-02-25 08:14:40 -0800
commit: 9ef2a68e77842456ab30594a9a8d2c0535314715 (patch)
tree: 630c1b23b64ed15984b6c065635929f08bf613e8 /pdf
parent: d28129c2ff6a78c50877426f90167d63334ab18a (diff)
download: mupdf-9ef2a68e77842456ab30594a9a8d2c0535314715.tar.xz
Revamp pdf lexing code
A huge amount (20%+ on some files) of our runtime is spent in fz_atof. A survey of results on the net suggests we will get much better speed by writing our own atof.

Part of the job of doing this involves parsing the string to identify the component parts of the number. Ludicrously, we are already doing this as part of the lexing process, so it makes sense to do the atoi/atof conversion as part of that same process.

In order to do this, we need somewhere to store the lexed results. Rather than add a float * and an int * to every single pdf_lex call, we generalise the calls to pass a pdf_lexbuf * pointer instead of separate buffer, maximum size, and string length pointers. This should help us overall.
Diffstat (limited to 'pdf')
-rw-r--r-- pdf/mupdf.h 42
-rw-r--r-- pdf/pdf_cmap_parse.c 130
-rw-r--r-- pdf/pdf_function.c 33
-rw-r--r-- pdf/pdf_interpret.c 63
-rw-r--r-- pdf/pdf_lex.c 117
-rw-r--r-- pdf/pdf_parse.c 102
-rw-r--r-- pdf/pdf_repair.c 47
-rw-r--r-- pdf/pdf_xref.c 95
8 files changed, 337 insertions, 292 deletions
diff --git a/pdf/mupdf.h b/pdf/mupdf.h
index b233288f..15a96541 100644
--- a/pdf/mupdf.h
+++ b/pdf/mupdf.h
@@ -102,12 +102,40 @@ enum
PDF_NUM_TOKENS
};
-int pdf_lex(fz_stream *f, char *buf, int n, int *len);
+enum
+{
+ PDF_LEXBUF_SMALL = 256,
+ PDF_LEXBUF_LARGE = 65536
+};
+
+
+
+typedef struct pdf_lexbuf_s pdf_lexbuf;
+typedef struct pdf_lexbuf_large_s pdf_lexbuf_large;
+
+struct pdf_lexbuf_s
+{
+ int size;
+ int len;
+ int i;
+ float f;
+ char scratch[PDF_LEXBUF_SMALL];
+};
+
+struct pdf_lexbuf_large_s
+{
+ pdf_lexbuf base;
+ char scratch[PDF_LEXBUF_LARGE - PDF_LEXBUF_SMALL];
+};
+
+
+
+int pdf_lex(fz_stream *f, pdf_lexbuf *lexbuf);
-fz_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, char *buf, int cap);
-fz_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, char *buf, int cap);
-fz_obj *pdf_parse_stm_obj(pdf_document *doc, fz_stream *f, char *buf, int cap);
-fz_obj *pdf_parse_ind_obj(pdf_document *doc, fz_stream *f, char *buf, int cap, int *num, int *gen, int *stm_ofs);
+fz_obj *pdf_parse_array(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
+fz_obj *pdf_parse_dict(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
+fz_obj *pdf_parse_stm_obj(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf);
+fz_obj *pdf_parse_ind_obj(pdf_document *doc, fz_stream *f, pdf_lexbuf *buf, int *num, int *gen, int *stm_ofs);
fz_rect pdf_to_rect(fz_context *ctx, fz_obj *array);
fz_matrix pdf_to_matrix(fz_context *ctx, fz_obj *array);
@@ -170,7 +198,7 @@ struct pdf_document_s
fz_obj **page_objs;
fz_obj **page_refs;
- char scratch[65536];
+ pdf_lexbuf_large lexbuf;
};
fz_obj *pdf_resolve_indirect(fz_obj *ref);
@@ -194,7 +222,7 @@ pdf_document *pdf_open_document(fz_context *ctx, const char *filename);
void pdf_close_document(pdf_document *doc);
/* private */
-void pdf_repair_xref(pdf_document *doc, char *buf, int bufsize);
+void pdf_repair_xref(pdf_document *doc, pdf_lexbuf *buf);
void pdf_repair_obj_stms(pdf_document *doc);
void pdf_debug_xref(pdf_document *);
void pdf_resize_xref(pdf_document *doc, int newcap);
diff --git a/pdf/pdf_cmap_parse.c b/pdf/pdf_cmap_parse.c
index fb37c4a9..5c21393e 100644
--- a/pdf/pdf_cmap_parse.c
+++ b/pdf/pdf_cmap_parse.c
@@ -49,14 +49,14 @@ pdf_code_from_string(char *buf, int len)
}
static int
-pdf_lex_cmap(fz_stream *file, char *buf, int n, int *sl)
+pdf_lex_cmap(fz_stream *file, pdf_lexbuf *buf)
{
- int tok = pdf_lex(file, buf, n, sl);
+ int tok = pdf_lex(file, buf);
/* RJW: Lost debugging here: "cannot parse cmap token" */
if (tok == PDF_TOK_KEYWORD)
- tok = pdf_cmap_token_from_keyword(buf);
+ tok = pdf_cmap_token_from_keyword(buf->scratch);
return tok;
}
@@ -64,15 +64,15 @@ pdf_lex_cmap(fz_stream *file, char *buf, int n, int *sl)
static void
pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ buf.size = PDF_LEXBUF_SMALL;
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok == PDF_TOK_NAME)
- fz_strlcpy(cmap->cmap_name, buf, sizeof(cmap->cmap_name));
+ fz_strlcpy(cmap->cmap_name, buf.scratch, sizeof(cmap->cmap_name));
else
fz_warn(ctx, "expected name after CMapName in cmap");
}
@@ -80,15 +80,15 @@ pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ buf.size = PDF_LEXBUF_SMALL;
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok == PDF_TOK_INT)
- pdf_set_wmode(ctx, cmap, atoi(buf));
+ pdf_set_wmode(ctx, cmap, buf.i);
else
fz_warn(ctx, "expected integer after WMode in cmap");
}
@@ -96,14 +96,14 @@ pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int lo, hi;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok == TOK_END_CODESPACE_RANGE)
@@ -111,13 +111,13 @@ pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
else if (tok == PDF_TOK_STRING)
{
- lo = pdf_code_from_string(buf, len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ lo = pdf_code_from_string(buf.scratch, buf.len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok == PDF_TOK_STRING)
{
- hi = pdf_code_from_string(buf, len);
- pdf_add_codespace(ctx, cmap, lo, hi, len);
+ hi = pdf_code_from_string(buf.scratch, buf.len);
+ pdf_add_codespace(ctx, cmap, lo, hi, buf.len);
}
else break;
}
@@ -131,14 +131,14 @@ pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int lo, hi, dst;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok == TOK_END_CID_RANGE)
@@ -147,21 +147,21 @@ pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
else if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string or endcidrange");
- lo = pdf_code_from_string(buf, len);
+ lo = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string");
- hi = pdf_code_from_string(buf, len);
+ hi = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: Lost debugging: "syntaxerror in cmap" */
if (tok != PDF_TOK_INT)
fz_throw(ctx, "expected integer");
- dst = atoi(buf);
+ dst = buf.i;
pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
}
@@ -170,14 +170,14 @@ pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int src, dst;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok == TOK_END_CID_CHAR)
@@ -186,15 +186,15 @@ pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
else if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string or endcidchar");
- src = pdf_code_from_string(buf, len);
+ src = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok != PDF_TOK_INT)
fz_throw(ctx, "expected integer");
- dst = atoi(buf);
+ dst = buf.i;
pdf_map_range_to_range(ctx, cmap, src, src, dst);
}
@@ -203,15 +203,15 @@ pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, int lo, int hi)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int dst[256];
int i;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok == PDF_TOK_CLOSE_ARRAY)
@@ -221,12 +221,12 @@ pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, int l
else if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string or ]");
- if (len / 2)
+ if (buf.len / 2)
{
- for (i = 0; i < len / 2; i++)
- dst[i] = pdf_code_from_string(buf + i * 2, 2);
+ for (i = 0; i < buf.len / 2; i++)
+ dst[i] = pdf_code_from_string(&buf.scratch[i * 2], 2);
- pdf_map_one_to_many(ctx, cmap, lo, dst, len / 2);
+ pdf_map_one_to_many(ctx, cmap, lo, dst, buf.len / 2);
}
lo ++;
@@ -236,14 +236,14 @@ pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, int l
static void
pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int lo, hi, dst;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok == TOK_END_BF_RANGE)
@@ -252,23 +252,23 @@ pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
else if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string or endbfrange");
- lo = pdf_code_from_string(buf, len);
+ lo = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string");
- hi = pdf_code_from_string(buf, len);
+ hi = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok == PDF_TOK_STRING)
{
- if (len == 2)
+ if (buf.len == 2)
{
- dst = pdf_code_from_string(buf, len);
+ dst = pdf_code_from_string(buf.scratch, buf.len);
pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
}
else
@@ -276,10 +276,10 @@ pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
int dststr[256];
int i;
- if (len / 2)
+ if (buf.len / 2)
{
- for (i = 0; i < len / 2; i++)
- dststr[i] = pdf_code_from_string(buf + i * 2, 2);
+ for (i = 0; i < buf.len / 2; i++)
+ dststr[i] = pdf_code_from_string(&buf.scratch[i * 2], 2);
while (lo <= hi)
{
@@ -307,16 +307,16 @@ pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
static void
pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
{
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
int dst[256];
int src;
int i;
+ buf.size = PDF_LEXBUF_SMALL;
while (1)
{
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
if (tok == TOK_END_BF_CHAR)
@@ -325,18 +325,18 @@ pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file)
else if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string or endbfchar");
- src = pdf_code_from_string(buf, len);
+ src = pdf_code_from_string(buf.scratch, buf.len);
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
/* RJW: "syntaxerror in cmap" */
/* Note: does not handle /dstName */
if (tok != PDF_TOK_STRING)
fz_throw(ctx, "expected string");
- if (len / 2)
+ if (buf.len / 2)
{
- for (i = 0; i < len / 2; i++)
- dst[i] = pdf_code_from_string(buf + i * 2, 2);
+ for (i = 0; i < buf.len / 2; i++)
+ dst[i] = pdf_code_from_string(&buf.scratch[i * 2], 2);
pdf_map_one_to_many(ctx, cmap, src, dst, i);
}
}
@@ -347,11 +347,11 @@ pdf_load_cmap(fz_context *ctx, fz_stream *file)
{
pdf_cmap *cmap;
char key[64];
- char buf[256];
+ pdf_lexbuf buf;
int tok;
- int len;
const char *where;
+ buf.size = PDF_LEXBUF_SMALL;
cmap = pdf_new_cmap(ctx);
strcpy(key, ".notdef");
@@ -363,25 +363,25 @@ pdf_load_cmap(fz_context *ctx, fz_stream *file)
while (1)
{
where = "";
- tok = pdf_lex_cmap(file, buf, sizeof buf, &len);
+ tok = pdf_lex_cmap(file, &buf);
if (tok == PDF_TOK_EOF || tok == TOK_END_CMAP)
break;
else if (tok == PDF_TOK_NAME)
{
- if (!strcmp(buf, "CMapName"))
+ if (!strcmp(buf.scratch, "CMapName"))
{
where = " after CMapName";
pdf_parse_cmap_name(ctx, cmap, file);
}
- else if (!strcmp(buf, "WMode"))
+ else if (!strcmp(buf.scratch, "WMode"))
{
where = " after WMode";
pdf_parse_wmode(ctx, cmap, file);
}
else
- fz_strlcpy(key, buf, sizeof key);
+ fz_strlcpy(key, buf.scratch, sizeof key);
}
else if (tok == TOK_USECMAP)
diff --git a/pdf/pdf_function.c b/pdf/pdf_function.c
index 4478827c..17373f42 100644
--- a/pdf/pdf_function.c
+++ b/pdf/pdf_function.c
@@ -683,18 +683,18 @@ resize_code(fz_context *ctx, pdf_function *func, int newsize)
static void
parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
{
- char buf[64];
- int len;
+ pdf_lexbuf buf;
int tok;
int opptr, elseptr, ifptr;
int a, b, mid, cmp;
fz_context *ctx = stream->ctx;
- memset(buf, 0, sizeof(buf));
+ buf.size = PDF_LEXBUF_SMALL;
+ memset(buf.scratch, 0, sizeof(buf.scratch));
while (1)
{
- tok = pdf_lex(stream, buf, sizeof buf, &len);
+ tok = pdf_lex(stream, &buf);
/* RJW: "calculator function lexical error" */
switch(tok)
@@ -705,7 +705,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
case PDF_TOK_INT:
resize_code(ctx, func, *codeptr);
func->u.p.code[*codeptr].type = PS_INT;
- func->u.p.code[*codeptr].u.i = atoi(buf);
+ func->u.p.code[*codeptr].u.i = buf.i;
++*codeptr;
break;
@@ -726,7 +726,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
case PDF_TOK_REAL:
resize_code(ctx, func, *codeptr);
func->u.p.code[*codeptr].type = PS_REAL;
- func->u.p.code[*codeptr].u.f = fz_atof(buf);
+ func->u.p.code[*codeptr].u.f = buf.f;
++*codeptr;
break;
@@ -740,7 +740,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
parse_code(func, stream, codeptr);
/* RJW: "error in 'if' branch" */
- tok = pdf_lex(stream, buf, sizeof buf, &len);
+ tok = pdf_lex(stream, &buf);
/* RJW: "calculator function syntax error" */
if (tok == PDF_TOK_OPEN_BRACE)
@@ -749,7 +749,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
parse_code(func, stream, codeptr);
/* RJW: "error in 'else' branch" */
- tok = pdf_lex(stream, buf, sizeof buf, &len);
+ tok = pdf_lex(stream, &buf);
/* RJW: "calculator function syntax error" */
}
else
@@ -760,7 +760,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
if (tok != PDF_TOK_KEYWORD)
fz_throw(ctx, "missing keyword in 'if-else' context");
- if (!strcmp(buf, "if"))
+ if (!strcmp(buf.scratch, "if"))
{
if (elseptr >= 0)
fz_throw(ctx, "too many branches for 'if'");
@@ -771,7 +771,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
func->u.p.code[opptr+3].type = PS_BLOCK;
func->u.p.code[opptr+3].u.block = *codeptr;
}
- else if (!strcmp(buf, "ifelse"))
+ else if (!strcmp(buf.scratch, "ifelse"))
{
if (elseptr < 0)
fz_throw(ctx, "not enough branches for 'ifelse'");
@@ -786,7 +786,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
}
else
{
- fz_throw(ctx, "unknown keyword in 'if-else' context: '%s'", buf);
+ fz_throw(ctx, "unknown keyword in 'if-else' context: '%s'", buf.scratch);
}
break;
@@ -804,7 +804,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
while (b - a > 1)
{
mid = (a + b) / 2;
- cmp = strcmp(buf, ps_op_names[mid]);
+ cmp = strcmp(buf.scratch, ps_op_names[mid]);
if (cmp > 0)
a = mid;
else if (cmp < 0)
@@ -813,7 +813,7 @@ parse_code(pdf_function *func, fz_stream *stream, int *codeptr)
a = b = mid;
}
if (cmp != 0)
- fz_throw(ctx, "unknown operator: '%s'", buf);
+ fz_throw(ctx, "unknown operator: '%s'", buf.scratch);
resize_code(ctx, func, *codeptr);
func->u.p.code[*codeptr].type = PS_OPERATOR;
@@ -832,12 +832,13 @@ load_postscript_func(pdf_function *func, pdf_document *xref, fz_obj *dict, int n
{
fz_stream *stream = NULL;
int codeptr;
- char buf[64];
+ pdf_lexbuf buf;
int tok;
- int len;
fz_context *ctx = xref->ctx;
int locked = 0;
+ buf.size = PDF_LEXBUF_SMALL;
+
fz_var(stream);
fz_var(locked);
@@ -846,7 +847,7 @@ load_postscript_func(pdf_function *func, pdf_document *xref, fz_obj *dict, int n
stream = pdf_open_stream(xref, num, gen);
/* RJW: "cannot open calculator function stream" */
- tok = pdf_lex(stream, buf, sizeof buf, &len);
+ tok = pdf_lex(stream, &buf);
if (tok != PDF_TOK_OPEN_BRACE)
{
fz_throw(ctx, "stream is not a calculator function");
diff --git a/pdf/pdf_interpret.c b/pdf/pdf_interpret.c
index effea657..1cfe6a96 100644
--- a/pdf/pdf_interpret.c
+++ b/pdf/pdf_interpret.c
@@ -819,6 +819,7 @@ pdf_show_string(pdf_csi *csi, unsigned char *buf, int len)
{
int w = pdf_decode_cmap(fontdesc->encoding, buf, &cpt);
buf += w;
+
cid = pdf_lookup_cmap(fontdesc->encoding, cpt);
if (cid >= 0)
pdf_show_char(csi, cid);
@@ -1625,12 +1626,10 @@ static void pdf_run_BI(pdf_csi *csi, fz_obj *rdb, fz_stream *file)
{
fz_context *ctx = csi->dev->ctx;
int ch;
- char *buf = csi->xref->scratch;
- int buflen = sizeof(csi->xref->scratch);
fz_image *img;
fz_obj *obj;
- obj = pdf_parse_dict(csi->xref, file, buf, buflen);
+ obj = pdf_parse_dict(csi->xref, file, &csi->xref->lexbuf.base);
/* RJW: "cannot parse inline image dictionary" */
/* read whitespace after ID keyword */
@@ -2523,10 +2522,10 @@ pdf_run_keyword(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf)
}
static void
-pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf, int buflen)
+pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, pdf_lexbuf *buf)
{
fz_context *ctx = csi->dev->ctx;
- int tok, len, in_array;
+ int tok, in_array;
/* make sure we have a clean slate if we come here from flush_text */
pdf_clear_stack(csi);
@@ -2551,7 +2550,7 @@ pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf, int buflen
csi->cookie->progress++;
}
- tok = pdf_lex(file, buf, buflen, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "lexical error in content stream" */
if (in_array)
@@ -2560,19 +2559,24 @@ pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf, int buflen
{
in_array = 0;
}
- else if (tok == PDF_TOK_INT || tok == PDF_TOK_REAL)
+ else if (tok == PDF_TOK_REAL)
{
pdf_gstate *gstate = csi->gstate + csi->gtop;
- pdf_show_space(csi, -fz_atof(buf) * gstate->size * 0.001f);
+ pdf_show_space(csi, -buf->f * gstate->size * 0.001f);
+ }
+ else if (tok == PDF_TOK_INT)
+ {
+ pdf_gstate *gstate = csi->gstate + csi->gtop;
+ pdf_show_space(csi, -buf->i * gstate->size * 0.001f);
}
else if (tok == PDF_TOK_STRING)
{
- pdf_show_string(csi, (unsigned char *)buf, len);
+ pdf_show_string(csi, (unsigned char *)buf->scratch, buf->len);
}
else if (tok == PDF_TOK_KEYWORD)
{
- if (!strcmp(buf, "Tw") || !strcmp(buf, "Tc"))
- fz_warn(ctx, "ignoring keyword '%s' inside array", buf);
+ if (!strcmp(buf->scratch, "Tw") || !strcmp(buf->scratch, "Tc"))
+ fz_warn(ctx, "ignoring keyword '%s' inside array", buf->scratch);
else
fz_throw(ctx, "syntax error in array");
}
@@ -2591,7 +2595,7 @@ pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf, int buflen
case PDF_TOK_OPEN_ARRAY:
if (!csi->in_text)
{
- csi->obj = pdf_parse_array(csi->xref, file, buf, buflen);
+ csi->obj = pdf_parse_array(csi->xref, file, buf);
/* RJW: "cannot parse array" */
}
else
@@ -2601,38 +2605,38 @@ pdf_run_stream(pdf_csi *csi, fz_obj *rdb, fz_stream *file, char *buf, int buflen
break;
case PDF_TOK_OPEN_DICT:
- csi->obj = pdf_parse_dict(csi->xref, file, buf, buflen);
+ csi->obj = pdf_parse_dict(csi->xref, file, buf);
/* RJW: "cannot parse dictionary" */
break;
case PDF_TOK_NAME:
- fz_strlcpy(csi->name, buf, sizeof(csi->name));
+ fz_strlcpy(csi->name, buf->scratch, sizeof(csi->name));
break;
case PDF_TOK_INT:
- csi->stack[csi->top] = atoi(buf);
+ csi->stack[csi->top] = buf->i;
csi->top ++;
break;
case PDF_TOK_REAL:
- csi->stack[csi->top] = fz_atof(buf);
+ csi->stack[csi->top] = buf->f;
csi->top ++;
break;
case PDF_TOK_STRING:
- if (len <= sizeof(csi->string))
+ if (buf->len <= sizeof(csi->string))
{
- memcpy(csi->string, buf, len);
- csi->string_len = len;
+ memcpy(csi->string, buf->scratch, buf->len);
+ csi->string_len = buf->len;
}
else
{
- csi->obj = fz_new_string(ctx, buf, len);
+ csi->obj = fz_new_string(ctx, buf->scratch, buf->len);
}
break;
case PDF_TOK_KEYWORD:
- pdf_run_keyword(csi, rdb, file, buf);
+ pdf_run_keyword(csi, rdb, file, buf->scratch);
/* RJW: "cannot run keyword" */
pdf_clear_stack(csi);
break;
@@ -2651,8 +2655,7 @@ static void
pdf_run_buffer(pdf_csi *csi, fz_obj *rdb, fz_buffer *contents)
{
fz_context *ctx = csi->dev->ctx;
- int len = sizeof csi->xref->scratch;
- char *buf = NULL;
+ pdf_lexbuf_large *buf;
fz_stream * file = NULL;
int save_in_text;
@@ -2664,13 +2667,14 @@ pdf_run_buffer(pdf_csi *csi, fz_obj *rdb, fz_buffer *contents)
fz_try(ctx)
{
- buf = fz_malloc(ctx, len); /* we must be re-entrant for type3 fonts */
+ buf = fz_malloc(ctx, sizeof(*buf)); /* we must be re-entrant for type3 fonts */
+ buf->base.size = PDF_LEXBUF_LARGE;
file = fz_open_buffer(ctx, contents);
save_in_text = csi->in_text;
csi->in_text = 0;
fz_try(ctx)
{
- pdf_run_stream(csi, rdb, file, buf, len);
+ pdf_run_stream(csi, rdb, file, &buf->base);
}
fz_catch(ctx)
{
@@ -2678,14 +2682,15 @@ pdf_run_buffer(pdf_csi *csi, fz_obj *rdb, fz_buffer *contents)
}
csi->in_text = save_in_text;
}
- fz_catch(ctx)
- {
+ fz_always(ctx)
+ {
fz_close(file);
fz_free(ctx, buf);
+ }
+ fz_catch(ctx)
+ {
fz_throw(ctx, "cannot parse context stream");
}
- fz_close(file);
- fz_free(ctx, buf);
}
void
diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c
index 24828412..322d945c 100644
--- a/pdf/pdf_lex.c
+++ b/pdf/pdf_lex.c
@@ -63,87 +63,106 @@ lex_comment(fz_stream *f)
}
static int
-lex_number(fz_stream *f, char *s, int n, int *tok)
+lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
{
- char *buf = s;
- *tok = PDF_TOK_INT;
+ int neg = 0;
+ int i = 0;
+ int n;
+ int d;
+ float v;
/* Initially we might have +, -, . or a digit */
- if (n > 1)
+ switch (c)
+ {
+ case '.':
+ goto loop_after_dot;
+ case '-':
+ neg = 1;
+ break;
+ case '+':
+ break;
+ default: /* Must be a digit */
+ i = c - '0';
+ break;
+ }
+
+ while (1)
{
int c = fz_read_byte(f);
switch (c)
{
case '.':
- *tok = PDF_TOK_REAL;
- *s++ = c;
- n--;
goto loop_after_dot;
- case '+':
- case '-':
case RANGE_0_9:
- *s++ = c;
- n--;
- goto loop_after_sign;
+ i = 10*i + c - '0';
+ /* FIXME: Need overflow check here; do we care? */
+ break;
default:
fz_unread_byte(f);
- goto end;
+ /* Fallthrough */
case EOF:
- goto end;
+ if (neg)
+ i = -i;
+ buf->i = i;
+ return PDF_TOK_INT;
}
}
- /* We can't accept a sign from here on in, just . or a digit */
-loop_after_sign:
- while (n > 1)
+ /* In here, we've seen a dot, so can accept just digits */
+loop_after_dot:
+ n = 0;
+ d = 1;
+ while (1)
{
int c = fz_read_byte(f);
switch (c)
{
- case '.':
- *tok = PDF_TOK_REAL;
- *s++ = c;
- n--;
- goto loop_after_dot;
case RANGE_0_9:
- *s++ = c;
+ if (d >= INT_MAX/10)
+ goto underflow;
+ n = n*10 + (c - '0');
+ d *= 10;
break;
default:
fz_unread_byte(f);
- goto end;
+ /* Fallthrough */
case EOF:
- goto end;
+ v = (float)i + ((float)n / (float)d);
+ if (neg)
+ v = -v;
+ buf->f = v;
+ return PDF_TOK_REAL;
}
- n--;
}
- /* In here, we've seen a dot, so can accept just digits */
-loop_after_dot:
- while (n > 1)
+underflow:
+ /* Ignore any digits after here, because they are too small */
+ while (1)
{
int c = fz_read_byte(f);
switch (c)
{
case RANGE_0_9:
- *s++ = c;
break;
default:
fz_unread_byte(f);
- goto end;
+ /* Fallthrough */
case EOF:
- goto end;
+ v = (float)i + ((float)n / (float)d);
+ if (neg)
+ v = -v;
+ buf->f = v;
+ return PDF_TOK_REAL;
}
- n--;
}
-
-end:
- *s = '\0';
- return s-buf;
}
static void
-lex_name(fz_stream *f, char *s, int n)
+lex_name(fz_stream *f, pdf_lexbuf *buf)
{
+ char *s = buf->scratch;
+ int n = buf->size;
+
while (n > 1)
{
int c = fz_read_byte(f);
@@ -208,6 +227,7 @@ lex_name(fz_stream *f, char *s, int n)
}
end:
*s = '\0';
+ buf->len = s - buf->scratch;
}
static int
@@ -380,7 +400,7 @@ pdf_token_from_keyword(char *key)
}
int
-pdf_lex(fz_stream *f, char *buf, int n, int *sl)
+pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
while (1)
{
@@ -396,11 +416,10 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
lex_comment(f);
break;
case '/':
- lex_name(f, buf, n);
- *sl = strlen(buf);
+ lex_name(f, buf);
return PDF_TOK_NAME;
case '(':
- *sl = lex_string(f, buf, n);
+ buf->len = lex_string(f, buf->scratch, buf->size);
return PDF_TOK_STRING;
case ')':
fz_warn(f->ctx, "lexical error (unexpected ')')");
@@ -414,7 +433,7 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
else
{
fz_unread_byte(f);
- *sl = lex_hex_string(f, buf, n);
+ buf->len = lex_hex_string(f, buf->scratch, buf->size);
return PDF_TOK_STRING;
}
case '>':
@@ -434,17 +453,11 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
case '}':
return PDF_TOK_CLOSE_BRACE;
case IS_NUMBER:
- {
- int tok;
- fz_unread_byte(f);
- *sl = lex_number(f, buf, n, &tok);
- return tok;
- }
+ return lex_number(f, buf, c);
default: /* isregular: !isdelim && !iswhite && c != EOF */
fz_unread_byte(f);
- lex_name(f, buf, n);
- *sl = strlen(buf);
- return pdf_token_from_keyword(buf);
+ lex_name(f, buf);
+ return pdf_token_from_keyword(buf->scratch);
}
}
}
diff --git a/pdf/pdf_parse.c b/pdf/pdf_parse.c
index 220eb30c..fb6cb7ef 100644
--- a/pdf/pdf_parse.c
+++ b/pdf/pdf_parse.c
@@ -171,13 +171,12 @@ pdf_to_utf8_name(fz_context *ctx, fz_obj *src)
}
fz_obj *
-pdf_parse_array(pdf_document *xref, fz_stream *file, char *buf, int cap)
+pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
fz_obj *ary = NULL;
fz_obj *obj = NULL;
int a = 0, b = 0, n = 0;
int tok;
- int len;
fz_context *ctx = file->ctx;
fz_obj *op;
@@ -189,7 +188,7 @@ pdf_parse_array(pdf_document *xref, fz_stream *file, char *buf, int cap)
{
while (1)
{
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
{
@@ -228,9 +227,9 @@ pdf_parse_array(pdf_document *xref, fz_stream *file, char *buf, int cap)
case PDF_TOK_INT:
if (n == 0)
- a = atoi(buf);
+ a = buf->i;
if (n == 1)
- b = atoi(buf);
+ b = buf->i;
n ++;
break;
@@ -245,33 +244,33 @@ pdf_parse_array(pdf_document *xref, fz_stream *file, char *buf, int cap)
break;
case PDF_TOK_OPEN_ARRAY:
- obj = pdf_parse_array(xref, file, buf, cap);
+ obj = pdf_parse_array(xref, file, buf);
fz_array_push(ary, obj);
fz_drop_obj(obj);
obj = NULL;
break;
case PDF_TOK_OPEN_DICT:
- obj = pdf_parse_dict(xref, file, buf, cap);
+ obj = pdf_parse_dict(xref, file, buf);
fz_array_push(ary, obj);
fz_drop_obj(obj);
obj = NULL;
break;
case PDF_TOK_NAME:
- obj = fz_new_name(ctx, buf);
+ obj = fz_new_name(ctx, buf->scratch);
fz_array_push(ary, obj);
fz_drop_obj(obj);
obj = NULL;
break;
case PDF_TOK_REAL:
- obj = fz_new_real(ctx, fz_atof(buf));
+ obj = fz_new_real(ctx, buf->f);
fz_array_push(ary, obj);
fz_drop_obj(obj);
obj = NULL;
break;
case PDF_TOK_STRING:
- obj = fz_new_string(ctx, buf, len);
+ obj = fz_new_string(ctx, buf->scratch, buf->len);
fz_array_push(ary, obj);
fz_drop_obj(obj);
obj = NULL;
@@ -312,13 +311,12 @@ end:
}
fz_obj *
-pdf_parse_dict(pdf_document *xref, fz_stream *file, char *buf, int cap)
+pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
fz_obj *dict = NULL;
fz_obj *key = NULL;
fz_obj *val = NULL;
int tok;
- int len;
int a, b;
fz_context *ctx = file->ctx;
@@ -332,45 +330,45 @@ pdf_parse_dict(pdf_document *xref, fz_stream *file, char *buf, int cap)
{
while (1)
{
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
skip:
if (tok == PDF_TOK_CLOSE_DICT)
break;
/* for BI .. ID .. EI in content streams */
- if (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID"))
+ if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
break;
if (tok != PDF_TOK_NAME)
fz_throw(ctx, "invalid key in dict");
- key = fz_new_name(ctx, buf);
+ key = fz_new_name(ctx, buf->scratch);
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
switch (tok)
{
case PDF_TOK_OPEN_ARRAY:
- val = pdf_parse_array(xref, file, buf, cap);
+ val = pdf_parse_array(xref, file, buf);
break;
case PDF_TOK_OPEN_DICT:
- val = pdf_parse_dict(xref, file, buf, cap);
+ val = pdf_parse_dict(xref, file, buf);
break;
- case PDF_TOK_NAME: val = fz_new_name(ctx, buf); break;
- case PDF_TOK_REAL: val = fz_new_real(ctx, fz_atof(buf)); break;
- case PDF_TOK_STRING: val = fz_new_string(ctx, buf, len); break;
+ case PDF_TOK_NAME: val = fz_new_name(ctx, buf->scratch); break;
+ case PDF_TOK_REAL: val = fz_new_real(ctx, buf->f); break;
+ case PDF_TOK_STRING: val = fz_new_string(ctx, buf->scratch, buf->len); break;
case PDF_TOK_TRUE: val = fz_new_bool(ctx, 1); break;
case PDF_TOK_FALSE: val = fz_new_bool(ctx, 0); break;
case PDF_TOK_NULL: val = fz_new_null(ctx); break;
case PDF_TOK_INT:
/* 64-bit to allow for numbers > INT_MAX and overflow */
- a = (int) strtoll(buf, 0, 10);
- tok = pdf_lex(file, buf, cap, &len);
+ a = buf->i;
+ tok = pdf_lex(file, buf);
if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
- (tok == PDF_TOK_KEYWORD && !strcmp(buf, "ID")))
+ (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
{
val = fz_new_int(ctx, a);
fz_dict_put(dict, key, val);
@@ -382,8 +380,8 @@ pdf_parse_dict(pdf_document *xref, fz_stream *file, char *buf, int cap)
}
if (tok == PDF_TOK_INT)
{
- b = atoi(buf);
- tok = pdf_lex(file, buf, cap, &len);
+ b = buf->i;
+ tok = pdf_lex(file, buf);
if (tok == PDF_TOK_R)
{
val = fz_new_indirect(ctx, a, b, xref);
@@ -414,30 +412,29 @@ pdf_parse_dict(pdf_document *xref, fz_stream *file, char *buf, int cap)
}
fz_obj *
-pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, char *buf, int cap)
+pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
{
int tok;
- int len;
fz_context *ctx = file->ctx;
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse token in object stream") */
switch (tok)
{
case PDF_TOK_OPEN_ARRAY:
- return pdf_parse_array(xref, file, buf, cap);
+ return pdf_parse_array(xref, file, buf);
/* RJW: "cannot parse object stream" */
case PDF_TOK_OPEN_DICT:
- return pdf_parse_dict(xref, file, buf, cap);
+ return pdf_parse_dict(xref, file, buf);
/* RJW: "cannot parse object stream" */
- case PDF_TOK_NAME: return fz_new_name(ctx, buf); break;
- case PDF_TOK_REAL: return fz_new_real(ctx, fz_atof(buf)); break;
- case PDF_TOK_STRING: return fz_new_string(ctx, buf, len); break;
+ case PDF_TOK_NAME: return fz_new_name(ctx, buf->scratch); break;
+ case PDF_TOK_REAL: return fz_new_real(ctx, buf->f); break;
+ case PDF_TOK_STRING: return fz_new_string(ctx, buf->scratch, buf->len); break;
case PDF_TOK_TRUE: return fz_new_bool(ctx, 1); break;
case PDF_TOK_FALSE: return fz_new_bool(ctx, 0); break;
case PDF_TOK_NULL: return fz_new_null(ctx); break;
- case PDF_TOK_INT: return fz_new_int(ctx, atoi(buf)); break;
+ case PDF_TOK_INT: return fz_new_int(ctx, buf->i); break;
default: fz_throw(ctx, "unknown token in object stream");
}
return NULL; /* Stupid MSVC */
@@ -445,60 +442,59 @@ pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, char *buf, int cap)
fz_obj *
pdf_parse_ind_obj(pdf_document *xref,
- fz_stream *file, char *buf, int cap,
+ fz_stream *file, pdf_lexbuf *buf,
int *onum, int *ogen, int *ostmofs)
{
fz_obj *obj = NULL;
int num = 0, gen = 0, stm_ofs;
int tok;
- int len;
int a, b;
fz_context *ctx = file->ctx;
fz_var(obj);
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: cannot parse indirect object (%d %d R)", num, gen */
if (tok != PDF_TOK_INT)
fz_throw(ctx, "expected object number (%d %d R)", num, gen);
- num = atoi(buf);
+ num = buf->i;
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen */
if (tok != PDF_TOK_INT)
fz_throw(ctx, "expected generation number (%d %d R)", num, gen);
- gen = atoi(buf);
+ gen = buf->i;
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen */
if (tok != PDF_TOK_OBJ)
fz_throw(ctx, "expected 'obj' keyword (%d %d R)", num, gen);
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen */
switch (tok)
{
case PDF_TOK_OPEN_ARRAY:
- obj = pdf_parse_array(xref, file, buf, cap);
+ obj = pdf_parse_array(xref, file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen */
break;
case PDF_TOK_OPEN_DICT:
- obj = pdf_parse_dict(xref, file, buf, cap);
+ obj = pdf_parse_dict(xref, file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen */
break;
- case PDF_TOK_NAME: obj = fz_new_name(ctx, buf); break;
- case PDF_TOK_REAL: obj = fz_new_real(ctx, fz_atof(buf)); break;
- case PDF_TOK_STRING: obj = fz_new_string(ctx, buf, len); break;
+ case PDF_TOK_NAME: obj = fz_new_name(ctx, buf->scratch); break;
+ case PDF_TOK_REAL: obj = fz_new_real(ctx, buf->f); break;
+ case PDF_TOK_STRING: obj = fz_new_string(ctx, buf->scratch, buf->len); break;
case PDF_TOK_TRUE: obj = fz_new_bool(ctx, 1); break;
case PDF_TOK_FALSE: obj = fz_new_bool(ctx, 0); break;
case PDF_TOK_NULL: obj = fz_new_null(ctx); break;
case PDF_TOK_INT:
- a = atoi(buf);
- tok = pdf_lex(file, buf, cap, &len);
+ a = buf->i;
+ tok = pdf_lex(file, buf);
/* "cannot parse indirect object (%d %d R)", num, gen */
if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
{
@@ -507,8 +503,8 @@ pdf_parse_ind_obj(pdf_document *xref,
}
if (tok == PDF_TOK_INT)
{
- b = atoi(buf);
- tok = pdf_lex(file, buf, cap, &len);
+ b = buf->i;
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse indirect object (%d %d R)", num, gen); */
if (tok == PDF_TOK_R)
{
@@ -528,7 +524,7 @@ pdf_parse_ind_obj(pdf_document *xref,
fz_try(ctx)
{
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
}
fz_catch(ctx)
{
diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c
index 0dc0e132..c70df3e2 100644
--- a/pdf/pdf_repair.c
+++ b/pdf/pdf_repair.c
@@ -13,11 +13,10 @@ struct entry
};
static void
-pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id)
+pdf_repair_obj(fz_stream *file, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id)
{
int tok;
int stm_len;
- int len;
int n;
fz_context *ctx = file->ctx;
@@ -26,7 +25,7 @@ pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp,
stm_len = 0;
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot parse object" */
if (tok == PDF_TOK_OPEN_DICT)
{
@@ -35,7 +34,7 @@ pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp,
/* Send NULL xref so we don't try to resolve references */
fz_try(ctx)
{
- dict = pdf_parse_dict(NULL, file, buf, cap);
+ dict = pdf_parse_dict(NULL, file, buf);
}
fz_catch(ctx)
{
@@ -79,13 +78,13 @@ pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp,
tok != PDF_TOK_EOF &&
tok != PDF_TOK_INT )
{
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot scan for endobj or stream token" */
}
if (tok == PDF_TOK_INT)
{
- while (len-- > 0)
+ while (buf->len-- > 0)
fz_unread_byte(file);
}
else if (tok == PDF_TOK_STREAM)
@@ -106,7 +105,7 @@ pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp,
fz_seek(file, *stmofsp + stm_len, 0);
fz_try(ctx)
{
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
}
fz_catch(ctx)
{
@@ -117,23 +116,23 @@ pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp,
fz_seek(file, *stmofsp, 0);
}
- n = fz_read(file, (unsigned char *) buf, 9);
+ n = fz_read(file, (unsigned char *) buf->scratch, 9);
if (n < 0)
fz_throw(ctx, "cannot read from file");
- while (memcmp(buf, "endstream", 9) != 0)
+ while (memcmp(buf->scratch, "endstream", 9) != 0)
{
c = fz_read_byte(file);
if (c == EOF)
break;
- memmove(buf, buf + 1, 8);
- buf[8] = c;
+ memmove(&buf->scratch[0], &buf->scratch[1], 8);
+ buf->scratch[8] = c;
}
*stmlenp = fz_tell(file) - *stmofsp - 9;
atobjend:
- tok = pdf_lex(file, buf, cap, &len);
+ tok = pdf_lex(file, buf);
/* RJW: "cannot scan for endobj token" */
if (tok != PDF_TOK_ENDOBJ)
fz_warn(ctx, "object missing 'endobj' token");
@@ -147,11 +146,13 @@ pdf_repair_obj_stm(pdf_document *xref, int num, int gen)
fz_stream *stm = NULL;
int tok;
int i, n, count;
- char buf[256];
fz_context *ctx = xref->ctx;
+ pdf_lexbuf buf;
fz_var(stm);
+ buf.size = PDF_LEXBUF_SMALL;
+
fz_try(ctx)
{
obj = pdf_load_object(xref, num, gen);
@@ -164,11 +165,11 @@ pdf_repair_obj_stm(pdf_document *xref, int num, int gen)
for (i = 0; i < count; i++)
{
- tok = pdf_lex(stm, buf, sizeof buf, &n);
+ tok = pdf_lex(stm, &buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, "corrupt object stream (%d %d R)", num, gen);
- n = atoi(buf);
+ n = buf.i;
if (n >= xref->len)
pdf_resize_xref(xref, n + 1);
@@ -179,7 +180,7 @@ pdf_repair_obj_stm(pdf_document *xref, int num, int gen)
xref->table[n].obj = NULL;
xref->table[n].type = 'o';
- tok = pdf_lex(stm, buf, sizeof buf, &n);
+ tok = pdf_lex(stm, &buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, "corrupt object stream (%d %d R)", num, gen);
}
@@ -195,7 +196,7 @@ pdf_repair_obj_stm(pdf_document *xref, int num, int gen)
}
void
-pdf_repair_xref(pdf_document *xref, char *buf, int bufsize)
+pdf_repair_xref(pdf_document *xref, pdf_lexbuf *buf)
{
fz_obj *dict, *obj;
fz_obj *length;
@@ -234,14 +235,14 @@ pdf_repair_xref(pdf_document *xref, char *buf, int bufsize)
list = fz_malloc_array(ctx, listcap, sizeof(struct entry));
/* look for '%PDF' version marker within first kilobyte of file */
- n = fz_read(xref->file, (unsigned char *)buf, MIN(bufsize, 1024));
+ n = fz_read(xref->file, (unsigned char *)buf->scratch, MIN(buf->size, 1024));
if (n < 0)
fz_throw(ctx, "cannot read from file");
fz_seek(xref->file, 0, 0);
for (i = 0; i < n - 4; i++)
{
- if (memcmp(buf + i, "%PDF", 4) == 0)
+ if (memcmp(&buf->scratch[i], "%PDF", 4) == 0)
{
fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */
break;
@@ -263,7 +264,7 @@ pdf_repair_xref(pdf_document *xref, char *buf, int bufsize)
fz_try(ctx)
{
- tok = pdf_lex(xref->file, buf, bufsize, &n);
+ tok = pdf_lex(xref->file, buf);
}
fz_catch(ctx)
{
@@ -276,14 +277,14 @@ pdf_repair_xref(pdf_document *xref, char *buf, int bufsize)
numofs = genofs;
num = gen;
genofs = tmpofs;
- gen = atoi(buf);
+ gen = buf->i;
}
else if (tok == PDF_TOK_OBJ)
{
fz_try(ctx)
{
- pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id);
+ pdf_repair_obj(xref->file, buf, &stm_ofs, &stm_len, &encrypt, &id);
}
fz_catch(ctx)
{
@@ -318,7 +319,7 @@ pdf_repair_xref(pdf_document *xref, char *buf, int bufsize)
{
fz_try(ctx)
{
- dict = pdf_parse_dict(xref, xref->file, buf, bufsize);
+ dict = pdf_parse_dict(xref, xref->file, buf);
}
fz_catch(ctx)
{
diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c
index 7500ded3..383747a7 100644
--- a/pdf/pdf_xref.c
+++ b/pdf/pdf_xref.c
@@ -51,6 +51,7 @@ pdf_read_start_xref(pdf_document *xref)
while (iswhite(buf[i]) && i < n)
i ++;
xref->startxref = atoi((char*)(buf + i));
+
return;
}
}
@@ -63,17 +64,16 @@ pdf_read_start_xref(pdf_document *xref)
*/
static void
-pdf_read_old_trailer(pdf_document *xref, char *buf, int cap)
+pdf_read_old_trailer(pdf_document *xref, pdf_lexbuf *buf)
{
int len;
char *s;
- int n;
int t;
int tok;
int c;
- fz_read_line(xref->file, buf, cap);
- if (strncmp(buf, "xref", 4) != 0)
+ fz_read_line(xref->file, buf->scratch, buf->size);
+ if (strncmp(buf->scratch, "xref", 4) != 0)
fz_throw(xref->ctx, "cannot find xref marker");
while (1)
@@ -82,8 +82,8 @@ pdf_read_old_trailer(pdf_document *xref, char *buf, int cap)
if (!(c >= '0' && c <= '9'))
break;
- fz_read_line(xref->file, buf, cap);
- s = buf;
+ fz_read_line(xref->file, buf->scratch, buf->size);
+ s = buf->scratch;
fz_strsep(&s, " "); /* ignore ofs */
if (!s)
fz_throw(xref->ctx, "invalid range marker in xref");
@@ -102,15 +102,15 @@ pdf_read_old_trailer(pdf_document *xref, char *buf, int cap)
fz_try(xref->ctx)
{
- tok = pdf_lex(xref->file, buf, cap, &n);
+ tok = pdf_lex(xref->file, buf);
if (tok != PDF_TOK_TRAILER)
fz_throw(xref->ctx, "expected trailer marker");
- tok = pdf_lex(xref->file, buf, cap, &n);
+ tok = pdf_lex(xref->file, buf);
if (tok != PDF_TOK_OPEN_DICT)
fz_throw(xref->ctx, "expected trailer dictionary");
- xref->trailer = pdf_parse_dict(xref, xref->file, buf, cap);
+ xref->trailer = pdf_parse_dict(xref, xref->file, buf);
}
fz_catch(xref->ctx)
{
@@ -119,11 +119,11 @@ pdf_read_old_trailer(pdf_document *xref, char *buf, int cap)
}
static void
-pdf_read_new_trailer(pdf_document *xref, char *buf, int cap)
+pdf_read_new_trailer(pdf_document *xref, pdf_lexbuf *buf)
{
fz_try(xref->ctx)
{
- xref->trailer = pdf_parse_ind_obj(xref, xref->file, buf, cap, NULL, NULL, NULL);
+ xref->trailer = pdf_parse_ind_obj(xref, xref->file, buf, NULL, NULL, NULL);
}
fz_catch(xref->ctx)
{
@@ -132,7 +132,7 @@ pdf_read_new_trailer(pdf_document *xref, char *buf, int cap)
}
static void
-pdf_read_trailer(pdf_document *xref, char *buf, int cap)
+pdf_read_trailer(pdf_document *xref, pdf_lexbuf *buf)
{
int c;
@@ -145,9 +145,9 @@ pdf_read_trailer(pdf_document *xref, char *buf, int cap)
{
c = fz_peek_byte(xref->file);
if (c == 'x')
- pdf_read_old_trailer(xref, buf, cap);
+ pdf_read_old_trailer(xref, buf);
else if (c >= '0' && c <= '9')
- pdf_read_new_trailer(xref, buf, cap);
+ pdf_read_new_trailer(xref, buf);
else
fz_throw(xref->ctx, "cannot recognize xref format: '%c'", c);
}
@@ -179,7 +179,7 @@ pdf_resize_xref(pdf_document *xref, int newlen)
}
static fz_obj *
-pdf_read_old_xref(pdf_document *xref, char *buf, int cap)
+pdf_read_old_xref(pdf_document *xref, pdf_lexbuf *buf)
{
int ofs, len;
char *s;
@@ -189,8 +189,8 @@ pdf_read_old_xref(pdf_document *xref, char *buf, int cap)
int c;
fz_obj *trailer;
- fz_read_line(xref->file, buf, cap);
- if (strncmp(buf, "xref", 4) != 0)
+ fz_read_line(xref->file, buf->scratch, buf->size);
+ if (strncmp(buf->scratch, "xref", 4) != 0)
fz_throw(xref->ctx, "cannot find xref marker");
while (1)
@@ -199,8 +199,8 @@ pdf_read_old_xref(pdf_document *xref, char *buf, int cap)
if (!(c >= '0' && c <= '9'))
break;
- fz_read_line(xref->file, buf, cap);
- s = buf;
+ fz_read_line(xref->file, buf->scratch, buf->size);
+ s = buf->scratch;
ofs = atoi(fz_strsep(&s, " "));
len = atoi(fz_strsep(&s, " "));
@@ -220,12 +220,12 @@ pdf_read_old_xref(pdf_document *xref, char *buf, int cap)
for (i = ofs; i < ofs + len; i++)
{
- n = fz_read(xref->file, (unsigned char *) buf, 20);
+ n = fz_read(xref->file, (unsigned char *) buf->scratch, 20);
if (n < 0)
fz_throw(xref->ctx, "cannot read xref table");
if (!xref->table[i].type)
{
- s = buf;
+ s = buf->scratch;
/* broken pdfs where line start with white space */
while (*s != '\0' && iswhite(*s))
@@ -242,15 +242,15 @@ pdf_read_old_xref(pdf_document *xref, char *buf, int cap)
fz_try(xref->ctx)
{
- tok = pdf_lex(xref->file, buf, cap, &n);
+ tok = pdf_lex(xref->file, buf);
if (tok != PDF_TOK_TRAILER)
fz_throw(xref->ctx, "expected trailer marker");
- tok = pdf_lex(xref->file, buf, cap, &n);
+ tok = pdf_lex(xref->file, buf);
if (tok != PDF_TOK_OPEN_DICT)
fz_throw(xref->ctx, "expected trailer dictionary");
- trailer = pdf_parse_dict(xref, xref->file, buf, cap);
+ trailer = pdf_parse_dict(xref, xref->file, buf);
}
fz_catch(xref->ctx)
{
@@ -296,7 +296,7 @@ pdf_read_new_xref_section(pdf_document *xref, fz_stream *stm, int i0, int i1, in
/* Entered with file locked. Drops the lock in the middle, but then picks
* it up again before exiting. */
static fz_obj *
-pdf_read_new_xref(pdf_document *xref, char *buf, int cap)
+pdf_read_new_xref(pdf_document *xref, pdf_lexbuf *buf)
{
fz_stream *stm = NULL;
fz_obj *trailer = NULL;
@@ -312,7 +312,7 @@ pdf_read_new_xref(pdf_document *xref, char *buf, int cap)
fz_try(ctx)
{
- trailer = pdf_parse_ind_obj(xref, xref->file, buf, cap, &num, &gen, &stm_ofs);
+ trailer = pdf_parse_ind_obj(xref, xref->file, buf, &num, &gen, &stm_ofs);
}
fz_catch(ctx)
{
@@ -378,7 +378,7 @@ pdf_read_new_xref(pdf_document *xref, char *buf, int cap)
/* File is locked on entry, and exit (but may be dropped in the middle) */
static fz_obj *
-pdf_read_xref(pdf_document *xref, int ofs, char *buf, int cap)
+pdf_read_xref(pdf_document *xref, int ofs, pdf_lexbuf *buf)
{
int c;
fz_context *ctx = xref->ctx;
@@ -393,9 +393,9 @@ pdf_read_xref(pdf_document *xref, int ofs, char *buf, int cap)
{
c = fz_peek_byte(xref->file);
if (c == 'x')
- trailer = pdf_read_old_xref(xref, buf, cap);
+ trailer = pdf_read_old_xref(xref, buf);
else if (c >= '0' && c <= '9')
- trailer = pdf_read_new_xref(xref, buf, cap);
+ trailer = pdf_read_new_xref(xref, buf);
else
fz_throw(ctx, "cannot recognize xref format");
}
@@ -407,7 +407,7 @@ pdf_read_xref(pdf_document *xref, int ofs, char *buf, int cap)
}
static void
-pdf_read_xref_sections(pdf_document *xref, int ofs, char *buf, int cap)
+pdf_read_xref_sections(pdf_document *xref, int ofs, pdf_lexbuf *buf)
{
fz_obj *trailer = NULL;
fz_obj *xrefstm = NULL;
@@ -416,16 +416,16 @@ pdf_read_xref_sections(pdf_document *xref, int ofs, char *buf, int cap)
fz_try(ctx)
{
- trailer = pdf_read_xref(xref, ofs, buf, cap);
+ trailer = pdf_read_xref(xref, ofs, buf);
/* FIXME: do we overwrite free entries properly? */
xrefstm = fz_dict_gets(trailer, "XRefStm");
if (xrefstm)
- pdf_read_xref_sections(xref, fz_to_int(xrefstm), buf, cap);
+ pdf_read_xref_sections(xref, fz_to_int(xrefstm), buf);
prev = fz_dict_gets(trailer, "Prev");
if (prev)
- pdf_read_xref_sections(xref, fz_to_int(prev), buf, cap);
+ pdf_read_xref_sections(xref, fz_to_int(prev), buf);
}
fz_catch(ctx)
{
@@ -441,7 +441,7 @@ pdf_read_xref_sections(pdf_document *xref, int ofs, char *buf, int cap)
*/
static void
-pdf_load_xref(pdf_document *xref, char *buf, int bufsize)
+pdf_load_xref(pdf_document *xref, pdf_lexbuf *buf)
{
fz_obj *size;
int i;
@@ -451,7 +451,7 @@ pdf_load_xref(pdf_document *xref, char *buf, int bufsize)
pdf_read_start_xref(xref);
- pdf_read_trailer(xref, buf, bufsize);
+ pdf_read_trailer(xref, buf);
size = fz_dict_gets(xref->trailer, "Size");
if (!size)
@@ -459,7 +459,7 @@ pdf_load_xref(pdf_document *xref, char *buf, int bufsize)
pdf_resize_xref(xref, fz_to_int(size));
- pdf_read_xref_sections(xref, xref->startxref, buf, bufsize);
+ pdf_read_xref_sections(xref, xref->startxref, buf);
/* broken pdfs where first object is not free */
if (xref->table[0].type != 'f')
@@ -672,6 +672,7 @@ pdf_open_document_with_stream(fz_stream *file)
xref = fz_malloc_struct(ctx, pdf_document);
pdf_init_document(xref);
+ xref->lexbuf.base.size = PDF_LEXBUF_LARGE;
xref->file = fz_keep_stream(file);
xref->ctx = ctx;
@@ -681,7 +682,7 @@ pdf_open_document_with_stream(fz_stream *file)
fz_try(ctx)
{
- pdf_load_xref(xref, xref->scratch, sizeof xref->scratch);
+ pdf_load_xref(xref, &xref->lexbuf.base);
}
fz_catch(ctx)
{
@@ -705,7 +706,7 @@ pdf_open_document_with_stream(fz_stream *file)
int hasroot, hasinfo;
if (repaired)
- pdf_repair_xref(xref, xref->scratch, sizeof xref->scratch);
+ pdf_repair_xref(xref, &xref->lexbuf.base);
fz_unlock(ctx, FZ_LOCK_FILE);
locked = 0;
@@ -864,7 +865,7 @@ pdf_debug_xref(pdf_document *xref)
*/
static void
-pdf_load_obj_stm(pdf_document *xref, int num, int gen, char *buf, int cap)
+pdf_load_obj_stm(pdf_document *xref, int num, int gen, pdf_lexbuf *buf)
{
fz_stream *stm = NULL;
fz_obj *objstm = NULL;
@@ -874,7 +875,7 @@ pdf_load_obj_stm(pdf_document *xref, int num, int gen, char *buf, int cap)
fz_obj *obj;
int first;
int count;
- int i, n;
+ int i;
int tok;
fz_context *ctx = xref->ctx;
@@ -896,15 +897,15 @@ pdf_load_obj_stm(pdf_document *xref, int num, int gen, char *buf, int cap)
stm = pdf_open_stream(xref, num, gen);
for (i = 0; i < count; i++)
{
- tok = pdf_lex(stm, buf, cap, &n);
+ tok = pdf_lex(stm, buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, "corrupt object stream (%d %d R)", num, gen);
- numbuf[i] = atoi(buf);
+ numbuf[i] = buf->i;
- tok = pdf_lex(stm, buf, cap, &n);
+ tok = pdf_lex(stm, buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, "corrupt object stream (%d %d R)", num, gen);
- ofsbuf[i] = atoi(buf);
+ ofsbuf[i] = buf->i;
}
fz_seek(stm, first, 0);
@@ -913,7 +914,7 @@ pdf_load_obj_stm(pdf_document *xref, int num, int gen, char *buf, int cap)
{
fz_seek(stm, first + ofsbuf[i], 0);
- obj = pdf_parse_stm_obj(xref, stm, buf, cap);
+ obj = pdf_parse_stm_obj(xref, stm, buf);
/* RJW: Ensure above does fz_throw(ctx, "cannot parse object %d in stream (%d %d R)", i, num, gen); */
if (numbuf[i] < 1 || numbuf[i] >= xref->len)
@@ -978,7 +979,7 @@ pdf_cache_object(pdf_document *xref, int num, int gen)
fz_try(ctx)
{
- x->obj = pdf_parse_ind_obj(xref, xref->file, xref->scratch, sizeof xref->scratch,
+ x->obj = pdf_parse_ind_obj(xref, xref->file, &xref->lexbuf.base,
&rnum, &rgen, &x->stm_ofs);
}
fz_catch(ctx)
@@ -1005,7 +1006,7 @@ pdf_cache_object(pdf_document *xref, int num, int gen)
{
fz_try(ctx)
{
- pdf_load_obj_stm(xref, x->ofs, 0, xref->scratch, sizeof xref->scratch);
+ pdf_load_obj_stm(xref, x->ofs, 0, &xref->lexbuf.base);
}
fz_catch(ctx)
{