author    Robin Watts <robin.watts@artifex.com>  2012-06-22 13:26:35 +0100
committer Robin Watts <robin.watts@artifex.com>  2012-06-22 16:42:09 +0100
commit    b4157f3585c0f7a07305ac324e7ab9d47e7d04ae (patch)
tree      d993f31a9fb04e9b833ca042e2aa8088e393999c /pdf/pdf_lex.c
parent    34fc2fb23d9f0a9b2a68896e608ea149d8bc38e2 (diff)
Rework pdf_lexbuf to allow for dynamic parsing buffers.
Currently pdf_lexbufs use a static scratch buffer for parsing. In the main case this is 64K in size, but in other cases it can be just 256 bytes; this causes problems when parsing long strings. Even the 64K limit is an implementation limit of Acrobat, not an architectural limit of PDF.

Change here to allow dynamic buffers. This means a slightly more complex setup and destruction for each buffer, but more importantly requires correct cleanup on errors. To avoid having to insert lots more try/catch clauses, this commit includes various changes to the code so we reuse pdf_lexbufs where possible. This keeps the speed up.
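Since a pdf_lexbuf can now own a heap-allocated scratch buffer, callers are expected to pair pdf_lexbuf_init() with pdf_lexbuf_fin(), and to run the latter even on error paths. Below is a minimal caller-side sketch of that pattern; PDF_LEXBUF_SMALL and PDF_TOK_EOF are assumed names that do not appear in this diff, and scan_tokens() is purely illustrative.

/* Sketch: init the lexbuf once, reuse it across pdf_lex() calls,
 * and release any grown scratch buffer in fz_always so an error
 * thrown mid-parse cannot leak it.
 */
static void
scan_tokens(fz_context *ctx, fz_stream *f)
{
	pdf_lexbuf lb;
	int tok;

	pdf_lexbuf_init(ctx, &lb, PDF_LEXBUF_SMALL); /* assumed base size */
	fz_try(ctx)
	{
		do
		{
			tok = pdf_lex(f, &lb);
			/* ... act on tok, lb.scratch, lb.len ... */
		}
		while (tok != PDF_TOK_EOF); /* assumed sentinel token */
	}
	fz_always(ctx)
	{
		pdf_lexbuf_fin(&lb); /* frees scratch only if it was grown */
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}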
Diffstat (limited to 'pdf/pdf_lex.c')
-rw-r--r--  pdf/pdf_lex.c | 69
1 file changed, 55 insertions(+), 14 deletions(-)
diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c
index c6ab6604..967b6dcc 100644
--- a/pdf/pdf_lex.c
+++ b/pdf/pdf_lex.c
@@ -231,16 +231,21 @@ end:
}
static int
-lex_string(fz_stream *f, char *buf, int n)
+lex_string(fz_stream *f, pdf_lexbuf *lb)
{
- char *s = buf;
- char *e = buf + n;
+ char *s = lb->scratch;
+ char *e = s + lb->size;
int bal = 1;
int oct;
int c;
- while (s < e)
+ while (1)
{
+ if (s == e)
+ {
+ s += pdf_lexbuf_grow(lb);
+ e = lb->scratch + lb->size;
+ }
c = fz_read_byte(f);
switch (c)
{
@@ -319,19 +324,25 @@ lex_string(fz_stream *f, char *buf, int n)
}
}
end:
- return s - buf;
+ lb->len = s - lb->scratch;
+ return PDF_TOK_STRING;
}
static int
-lex_hex_string(fz_stream *f, char *buf, int n)
+lex_hex_string(fz_stream *f, pdf_lexbuf *lb)
{
- char *s = buf;
- char *e = buf + n;
+ char *s = lb->scratch;
+ char *e = s + lb->size;
int a = 0, x = 0;
int c;
- while (s < e)
+ while (1)
{
+ if (s == e)
+ {
+ s += pdf_lexbuf_grow(lb);
+ e = lb->scratch + lb->size;
+ }
c = fz_read_byte(f);
switch (c)
{
@@ -357,7 +368,8 @@ lex_hex_string(fz_stream *f, char *buf, int n)
}
}
end:
- return s - buf;
+ lb->len = s - lb->scratch;
+ return PDF_TOK_STRING;
}
static int
@@ -399,6 +411,37 @@ pdf_token_from_keyword(char *key)
return PDF_TOK_KEYWORD;
}
+void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
+{
+ lb->size = lb->base_size = size;
+ lb->len = 0;
+ lb->ctx = ctx;
+ lb->scratch = &lb->buffer[0];
+}
+
+void pdf_lexbuf_fin(pdf_lexbuf *lb)
+{
+ if (lb && lb->size != lb->base_size)
+ fz_free(lb->ctx, lb->scratch);
+}
+
+ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb)
+{
+ char *old = lb->scratch;
+ int newsize = lb->size * 2;
+ if (lb->size == lb->base_size)
+ {
+ lb->scratch = fz_malloc(lb->ctx, newsize);
+ memcpy(lb->scratch, lb->buffer, lb->size);
+ }
+ else
+ {
+ lb->scratch = fz_resize_array(lb->ctx, lb->scratch, newsize, 1);
+ }
+ lb->size = newsize;
+ return lb->scratch - old;
+}
+
int
pdf_lex(fz_stream *f, pdf_lexbuf *buf)
{
@@ -419,8 +462,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
lex_name(f, buf);
return PDF_TOK_NAME;
case '(':
- buf->len = lex_string(f, buf->scratch, buf->size);
- return PDF_TOK_STRING;
+ return lex_string(f, buf);
case ')':
fz_warn(f->ctx, "lexical error (unexpected ')')");
continue;
@@ -433,8 +475,7 @@ pdf_lex(fz_stream *f, pdf_lexbuf *buf)
else
{
fz_unread_byte(f);
- buf->len = lex_hex_string(f, buf->scratch, buf->size);
- return PDF_TOK_STRING;
+ return lex_hex_string(f, buf);
}
case '>':
c = fz_read_byte(f);
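A note on the new pdf_lexbuf_grow(): it returns, as a ptrdiff_t, the distance the scratch buffer moved during reallocation, which is why lex_string() and lex_hex_string() can simply rebase their write cursor with "s += pdf_lexbuf_grow(lb);" and recompute "e" from lb->scratch + lb->size. A standalone sketch of the same grow-and-rebase idiom, using plain malloc/realloc in place of fz_malloc/fz_resize_array (generic C, not MuPDF code):

/* grow() reports how far the buffer moved so a raw write cursor
 * stays valid across the reallocation (mirrors "lb->scratch - old"
 * in pdf_lexbuf_grow above).
 */
#include <stddef.h>
#include <stdlib.h>

typedef struct
{
	char *data;
	size_t size;
} growbuf;

static ptrdiff_t
growbuf_grow(growbuf *b)
{
	char *old = b->data;
	char *p = realloc(b->data, b->size * 2);
	if (!p)
		abort(); /* MuPDF would throw via its fz exception mechanism */
	b->data = p;
	b->size *= 2;
	return p - old; /* delta for the caller's cursor */
}

static size_t
fill(growbuf *b, int c, size_t count)
{
	char *s = b->data;
	char *e = b->data + b->size;
	size_t i;

	for (i = 0; i < count; i++)
	{
		if (s == e) /* out of room: grow, then rebase s and e */
		{
			s += growbuf_grow(b);
			e = b->data + b->size;
		}
		*s++ = (char)c;
	}
	return (size_t)(s - b->data); /* bytes written, like lb->len */
}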