summary | refs | log | tree | commit | diff
path: root/source/pdf/pdf-lex.c
diff options
context:
space:
mode:
author Tor Andersson <tor.andersson@artifex.com> 2016-01-15 12:28:00 +0100
committer Tor Andersson <tor.andersson@artifex.com> 2016-01-15 12:28:00 +0100
commit 9bc886e621a20aa3dd667be31bf2481e4cfad50d (patch)
tree 94b70e43d61d7f97dc14b0aa26c60991f38b11fa /source/pdf/pdf-lex.c
parent ad8936bf2bcf54c7042bdec20c49c96657649b34 (diff)
download mupdf-9bc886e621a20aa3dd667be31bf2481e4cfad50d.tar.xz
pdf: Consume entire token before lexing numbers.
"0.00-70" should be parsed as one token, not as two tokens as we did previously.
Diffstat (limited to 'source/pdf/pdf-lex.c')
-rw-r--r-- source/pdf/pdf-lex.c 173
1 files changed, 93 insertions, 80 deletions
diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c
index c2381e8d..99bf3ffd 100644
--- a/source/pdf/pdf-lex.c
+++ b/source/pdf/pdf-lex.c
@@ -63,107 +63,120 @@ lex_comment(fz_context *ctx, fz_stream *f)
} while ((c != '\012') && (c != '\015') && (c != EOF));
}
-static int
-lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
+/* Fast but inaccurate strtof, with Adobe overflow handling. */
+static float fast_atof(char *s)
{
int neg = 0;
- fz_off_t i = 0;
- int n;
- int d;
- float v;
- float fd;
-
- /* Initially we might have +, -, . or a digit */
- switch (c)
+ int i = 0;
+
+ while (*s == '-')
{
- case '.':
- goto loop_after_dot;
- case '-':
neg = 1;
- break;
- case '+':
- break;
- default: /* Must be a digit */
- i = c - '0';
- break;
+ ++s;
+ }
+ while (*s == '+')
+ {
+ ++s;
}
- while (1)
+ while (*s >= '0' && *s <= '9')
{
- c = fz_read_byte(ctx, f);
- switch (c)
- {
- case '.':
- goto loop_after_dot;
- case RANGE_0_9:
- /* We deliberately ignore overflow here. We tried
- * code that returned INT_MIN/MAX as appropriate,
- * but this causes loss of data (see Bug695950.pdf
- * for an example). Tests show that Acrobat handles
- * overflows in exactly the same way we do (i.e.
- * 123450000000000000000678 is read as 678). */
- i = 10*i + c - '0';
- break;
- default:
- fz_unread_byte(ctx, f);
- /* Fallthrough */
- case EOF:
- if (neg)
- i = -i;
- buf->i = i;
- return PDF_TOK_INT;
- }
+ /* We deliberately ignore overflow here.
+ * Tests show that Acrobat handles overflows in exactly the same way we do:
+ * 123450000000000000000678 is read as 678.
+ */
+ i = i * 10 + (*s - '0');
+ ++s;
}
- /* In here, we've seen a dot, so can accept just digits */
-loop_after_dot:
- n = 0;
- d = 1;
- while (1)
+ if (*s == '.')
{
- c = fz_read_byte(ctx, f);
- switch (c)
+ float v = i;
+ float n = 0;
+ float d = 1;
+ ++s;
+ while (*s >= '0' && *s <= '9')
{
- case RANGE_0_9:
- if (d >= INT_MAX/10)
- goto underflow;
- n = n*10 + (c - '0');
- d *= 10;
- break;
- default:
- fz_unread_byte(ctx, f);
- /* Fallthrough */
- case EOF:
- v = (float)i + ((float)n / (float)d);
- if (neg)
- v = -v;
- buf->f = v;
- return PDF_TOK_REAL;
+ n = 10 * n + (*s - '0');
+ d = 10 * d;
+ ++s;
}
+ v += n / d;
+ return neg ? -v : v;
}
+ else
+ {
+ return neg ? -i : i;
+ }
+}
-underflow:
- fd = 1 / (float)d;
- v = (float)i + ((float)n * fd);
- while (1)
+/* Fast but inaccurate atoi. */
+static int fast_atoi(char *s)
+{
+ int neg = 0;
+ int i = 0;
+
+ while (*s == '-')
{
- c = fz_read_byte(ctx, f);
+ neg = 1;
+ ++s;
+ }
+ while (*s == '+')
+ {
+ ++s;
+ }
+
+ while (*s >= '0' && *s <= '9')
+ {
+ /* We deliberately ignore overflow here. */
+ i = i * 10 + (*s - '0');
+ ++s;
+ }
+
+ return neg ? -i : i;
+}
+
+static int
+lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
+{
+ char *s = buf->scratch;
+ char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
+ int isreal = (c == '.');
+
+ *s++ = c;
+
+ while (s < e)
+ {
+ int c = fz_read_byte(ctx, f);
switch (c)
{
- case RANGE_0_9:
- fd /= 10;
- v += (c - '0') * fd;
- break;
- default:
+ case IS_WHITE:
+ case IS_DELIM:
fz_unread_byte(ctx, f);
- /* Fallthrough */
+ goto end;
case EOF:
- if (neg)
- v = -v;
- buf->f = v;
- return PDF_TOK_REAL;
+ goto end;
+ case '.':
+ isreal = 1;
+ /* Fall through */
+ default:
+ *s++ = c;
+ break;
}
}
+
+end:
+ *s = '\0';
+ if (isreal)
+ {
+ buf->f = fast_atof(buf->scratch);
+ return PDF_TOK_REAL;
+ }
+ else
+ {
+ buf->i = fast_atoi(buf->scratch);
+ return PDF_TOK_INT;
+ }
}
static void