Revamp pdf lexing code

A huge amount (20%+ on some files) of our runtime is spent in fz_atof. A survey of results on the net suggests we will get much better speed by writing our own atof. Part of the job of doing this involves parsing the string to identify the component parts of the number - ludicrously, we are already doing this as part of the lexing process, so it would make sense to do the atoi/atof as part of this process. In order to do this, we need somewhere to store the lexed results; rather than add a float * and an int * to every single pdf_lex call, we generalise the calls to pass a pdf_lexbuf * pointer instead of separate buffer/max/string length pointers. This should help us overall.
author: Robin Watts <robin.watts@artifex.com> 2012-02-23 15:07:04 +0000
committer: Robin Watts <robin.watts@artifex.com> 2012-02-25 08:14:40 -0800
commit: 9ef2a68e77842456ab30594a9a8d2c0535314715 (patch)
tree: 630c1b23b64ed15984b6c065635929f08bf613e8 /pdf/pdf_lex.c
parent: d28129c2ff6a78c50877426f90167d63334ab18a (diff)
download: mupdf-9ef2a68e77842456ab30594a9a8d2c0535314715.tar.xz
1 files changed, 65 insertions, 52 deletions
diff --git a/pdf/pdf_lex.c b/pdf/pdf_lex.c
index 24828412..322d945c 100644
--- a/pdf/pdf_lex.c
+++ b/pdf/pdf_lex.c
@@ -63,87 +63,106 @@ lex_comment(fz_stream *f)
 }
 
 static int
-lex_number(fz_stream *f, char *s, int n, int *tok)
+lex_number(fz_stream *f, pdf_lexbuf *buf, int c)
 {
-	char *buf = s;
-	*tok = PDF_TOK_INT;
+	int neg = 0;
+	int i = 0;
+	int n;
+	int d;
+	float v;
 
 	/* Initially we might have +, -, . or a digit */
-	if (n > 1)
+	switch (c)
+	{
+	case '.':
+		goto loop_after_dot;
+	case '-':
+		neg = 1;
+		break;
+	case '+':
+		break;
+	default: /* Must be a digit */
+		i = c - '0';
+		break;
+	}
+
+	while (1)
 	{
 		int c = fz_read_byte(f);
 		switch (c)
 		{
 		case '.':
-			*tok = PDF_TOK_REAL;
-			*s++ = c;
-			n--;
 			goto loop_after_dot;
-		case '+':
-		case '-':
 		case RANGE_0_9:
-			*s++ = c;
-			n--;
-			goto loop_after_sign;
+			i = 10*i + c - '0';
+			/* FIXME: Need overflow check here; do we care? */
+			break;
 		default:
 			fz_unread_byte(f);
-			goto end;
+			/* Fallthrough */
 		case EOF:
-			goto end;
+			if (neg)
+				i = -i;
+			buf->i = i;
+			return PDF_TOK_INT;
 		}
 	}
 
-	/* We can't accept a sign from here on in, just . or a digit */
-loop_after_sign:
-	while (n > 1)
+	/* In here, we've seen a dot, so can accept just digits */
+loop_after_dot:
+	n = 0;
+	d = 1;
+	while (1)
 	{
 		int c = fz_read_byte(f);
 		switch (c)
 		{
-		case '.':
-			*tok = PDF_TOK_REAL;
-			*s++ = c;
-			n--;
-			goto loop_after_dot;
 		case RANGE_0_9:
-			*s++ = c;
+			if (d >= INT_MAX/10)
+				goto underflow;
+			n = n*10 + (c - '0');
+			d *= 10;
 			break;
 		default:
 			fz_unread_byte(f);
-			goto end;
+			/* Fallthrough */
 		case EOF:
-			goto end;
+			v = (float)i + ((float)n / (float)d);
+			if (neg)
+				v = -v;
+			buf->f = v;
+			return PDF_TOK_REAL;
 		}
-		n--;
 	}
 
-	/* In here, we've seen a dot, so can accept just digits */
-loop_after_dot:
-	while (n > 1)
+underflow:
+	/* Ignore any digits after here, because they are too small */
+	while (1)
 	{
 		int c = fz_read_byte(f);
 		switch (c)
 		{
 		case RANGE_0_9:
-			*s++ = c;
 			break;
 		default:
 			fz_unread_byte(f);
-			goto end;
+			/* Fallthrough */
 		case EOF:
-			goto end;
+			v = (float)i + ((float)n / (float)d);
+			if (neg)
+				v = -v;
+			buf->f = v;
+			return PDF_TOK_REAL;
 		}
-		n--;
 	}
-
-end:
-	*s = '\0';
-	return s-buf;
 }
 
 static void
-lex_name(fz_stream *f, char *s, int n)
+lex_name(fz_stream *f, pdf_lexbuf *buf)
 {
+	char *s = buf->scratch;
+	int n = buf->size;
+
 	while (n > 1)
 	{
 		int c = fz_read_byte(f);
@@ -208,6 +227,7 @@ lex_name(fz_stream *f, char *s, int n)
 	}
 end:
 	*s = '\0';
+	buf->len = s - buf->scratch;
 }
 
 static int
@@ -380,7 +400,7 @@ pdf_token_from_keyword(char *key)
 }
 
 int
-pdf_lex(fz_stream *f, char *buf, int n, int *sl)
+pdf_lex(fz_stream *f, pdf_lexbuf *buf)
 {
 	while (1)
 	{
@@ -396,11 +416,10 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
 			lex_comment(f);
 			break;
 		case '/':
-			lex_name(f, buf, n);
-			*sl = strlen(buf);
+			lex_name(f, buf);
 			return PDF_TOK_NAME;
 		case '(':
-			*sl = lex_string(f, buf, n);
+			buf->len = lex_string(f, buf->scratch, buf->size);
 			return PDF_TOK_STRING;
 		case ')':
 			fz_warn(f->ctx, "lexical error (unexpected ')')");
@@ -414,7 +433,7 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
 			else
 			{
 				fz_unread_byte(f);
-				*sl = lex_hex_string(f, buf, n);
+				buf->len = lex_hex_string(f, buf->scratch, buf->size);
 				return PDF_TOK_STRING;
 			}
 		case '>':
@@ -434,17 +453,11 @@ pdf_lex(fz_stream *f, char *buf, int n, int *sl)
 		case '}':
 			return PDF_TOK_CLOSE_BRACE;
 		case IS_NUMBER:
-			{
-				int tok;
-				fz_unread_byte(f);
-				*sl = lex_number(f, buf, n, &tok);
-				return tok;
-			}
+			return lex_number(f, buf, c);
 		default: /* isregular: !isdelim && !iswhite && c != EOF */
 			fz_unread_byte(f);
-			lex_name(f, buf, n);
-			*sl = strlen(buf);
-			return pdf_token_from_keyword(buf);
+			lex_name(f, buf);
+			return pdf_token_from_keyword(buf->scratch);
 		}
 	}
 }
author	Robin Watts <robin.watts@artifex.com>	2012-02-23 15:07:04 +0000
committer	Robin Watts <robin.watts@artifex.com>	2012-02-25 08:14:40 -0800
commit	9ef2a68e77842456ab30594a9a8d2c0535314715 (patch)
tree	630c1b23b64ed15984b6c065635929f08bf613e8 /pdf/pdf_lex.c
parent	d28129c2ff6a78c50877426f90167d63334ab18a (diff)
download	mupdf-9ef2a68e77842456ab30594a9a8d2c0535314715.tar.xz