summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobin Watts <robin.watts@artifex.com>2010-05-26 14:05:59 +0200
committerRobin Watts <robin.watts@artifex.com>2010-05-26 14:05:59 +0200
commit67a0351468df8c09f334f0cd1bff49712578e7c6 (patch)
treec6a54d20c13ee93700ad80ca350ab2127bba3d05
parentcd39bd3a550165c5464f325956c30458b1ac9000 (diff)
downloadmupdf-67a0351468df8c09f334f0cd1bff49712578e7c6.tar.xz
Optimize lexer by using unreadbyte and switches rather than chained ifs.
-rw-r--r--fitz/fitz_stream.h8
-rw-r--r--fitz/stm_read.c5
-rw-r--r--mupdf/pdf_lex.c476
3 files changed, 295 insertions, 194 deletions
diff --git a/fitz/fitz_stream.h b/fitz/fitz_stream.h
index 8689677e..8cbff89c 100644
--- a/fitz/fitz_stream.h
+++ b/fitz/fitz_stream.h
@@ -372,11 +372,13 @@ fz_buffer * fz_readall(fz_stream *stm, int sizehint);
fz_error fz_readerror(fz_stream *stm);
int fz_readbytex(fz_stream *stm);
int fz_peekbytex(fz_stream *stm);
+void fz_unreadbytex(fz_stream *stm);
#ifdef DEBUG
#define fz_readbyte fz_readbytex
#define fz_peekbyte fz_peekbytex
+#define fz_unreadbyte fz_unreadbytex
#else
@@ -396,6 +398,12 @@ static inline int fz_peekbyte(fz_stream *stm)
return fz_peekbytex(stm);
}
+static inline void fz_unreadbyte(fz_stream *stm)
+{
+ fz_buffer *buf = stm->buffer;
+ buf->rp--;
+}
+
#endif
#endif
diff --git a/fitz/stm_read.c b/fitz/stm_read.c
index 9938b6e6..b01e300d 100644
--- a/fitz/stm_read.c
+++ b/fitz/stm_read.c
@@ -275,3 +275,8 @@ fz_peekbytex(fz_stream *stm)
return buf->rp < buf->wp ? *buf->rp : EOF ;
}
+void fz_unreadbytex(fz_stream *stm)
+{
+ fz_buffer *buf = stm->buffer;
+ buf->rp--;
+}
diff --git a/mupdf/pdf_lex.c b/mupdf/pdf_lex.c
index 350bb1c7..e35a25aa 100644
--- a/mupdf/pdf_lex.c
+++ b/mupdf/pdf_lex.c
@@ -1,12 +1,35 @@
#include "fitz.h"
#include "mupdf.h"
+#define ISNUMBER \
+ '+':case'-':case'.':case'0':case'1':case'2':case'3':\
+ case'4':case'5':case'6':case'7':case'8':case'9'
+#define ISWHITE \
+ '\000':case'\011':case'\012':case'\014':case'\015':case'\040'
+#define ISHEX \
+ '0':case'1':case'2':case'3':case'4':case'5':case'6':\
+ case'7':case'8':case'9':case'A':case'B':case'C':\
+ case'D':case'E':case'F':case'a':case'b':case'c':\
+ case'd':case'e':case'f'
+#define ISDELIM \
+ '(':case')':case'<':case'>':case'[':case']':case'{':\
+ case'}':case'/':case'%'
+
+#define RANGE_0_9 \
+ '0':case'1':case'2':case'3':case'4':case'5':\
+ case'6':case'7':case'8':case'9'
+#define RANGE_a_f \
+ 'a':case'b':case'c':case'd':case'e':case'f'
+#define RANGE_A_F \
+ 'A':case'B':case'C':case'D':case'E':case'F'
+
/*
* pdf_lex will use fz_peekbyte and fz_readbyte.
* have to check for file errors with fz_readerror() after lexing.
*/
-static inline int iswhite(int ch)
+static inline int
+iswhite(int ch)
{
return
ch == '\000' ||
@@ -17,36 +40,8 @@ static inline int iswhite(int ch)
ch == '\040';
}
-static inline int isdelim(int ch)
-{
- return
- ch == '(' || ch == ')' ||
- ch == '<' || ch == '>' ||
- ch == '[' || ch == ']' ||
- ch == '{' || ch == '}' ||
- ch == '/' ||
- ch == '%';
-}
-
-static inline int isregular(int ch)
-{
- return !isdelim(ch) && !iswhite(ch) && ch != EOF;
-}
-
-static inline int isnumber(int ch)
-{
- return ch == '+' || ch == '-' || ch == '.' || (ch >= '0' && ch <= '9');
-}
-
-static inline int ishex(int ch)
-{
- return
- (ch >= '0' && ch <= '9') ||
- (ch >= 'A' && ch <= 'F') ||
- (ch >= 'a' && ch <= 'f');
-}
-
-static inline int fromhex(int ch)
+static inline int
+fromhex(int ch)
{
if (ch >= '0' && ch <= '9')
return ch - '0';
@@ -61,67 +56,172 @@ static inline void
lexwhite(fz_stream *f)
{
int c;
- while (1)
+ do
{
- c = fz_peekbyte(f);
- if (!iswhite(c))
- break;
- fz_readbyte(f);
+ c = fz_readbyte(f);
}
+ while ((c <= 32) && (iswhite(c)));
+ if (c != EOF)
+ fz_unreadbyte(f);
}
static inline void
lexcomment(fz_stream *f)
{
int c;
- while (1)
+ do
{
c = fz_readbyte(f);
- if (c == '\012') break;
- if (c == '\015') break;
- if (c == EOF) break;
}
+ while ((c != '\012') && (c != '\015') && (c != EOF));
}
-static void
-lexnumber(fz_stream *f, char *s, int n)
+static int
+lexnumber(fz_stream *f, char *s, int n, pdf_token_e *tok)
{
+ char *buf = s;
+ *tok = PDF_TINT;
+
+ /* Initially we might have +, -, . or a digit */
+ if (n > 1)
+ {
+ int c = fz_readbyte(f);
+ switch (c)
+ {
+ case '.':
+ *tok = PDF_TREAL;
+ *s++ = c;
+ n--;
+ goto loop_after_dot;
+ case '+':
+ case '-':
+ case IF_0_9:
+ *s++ = c;
+ n--;
+ goto loop_after_sign;
+ default:
+ fz_unreadbyte(f);
+ goto end;
+ case EOF:
+ goto end;
+ }
+ }
+
+ /* We can't accept a sign from here on in, just . or a digit */
+loop_after_sign:
while (n > 1)
{
- if (!isnumber(fz_peekbyte(f)))
+ int c = fz_readbyte(f);
+ switch (c)
+ {
+ case '.':
+ *tok = PDF_TREAL;
+ *s++ = c;
+ n--;
+ goto loop_after_dot;
+ case IF_0_9:
+ *s++ = c;
break;
- *s++ = fz_readbyte(f);
+ default:
+ fz_unreadbyte(f);
+ goto end;
+ case EOF:
+ goto end;
+ }
n--;
}
- *s = '\0';
-}
-
-static void
-lexname(fz_stream *f, char *s, int n)
-{
- char *p = s;
- char *q = s;
+ /* In here, we've seen a dot, so can accept just digits */
+loop_after_dot:
while (n > 1)
{
- if (!isregular(fz_peekbyte(f)))
+ int c = fz_readbyte(f);
+ switch (c)
+ {
+ case IF_0_9:
+ *s++ = c;
break;
- *s++ = fz_readbyte(f);
+ default:
+ fz_unreadbyte(f);
+ goto end;
+ case EOF:
+ goto end;
+ }
n--;
}
+
+end:
*s = '\0';
+ return s-buf;
+}
- while (*p)
+static void
+lexname(fz_stream *f, char *s, int n)
+{
+ while (n > 1)
{
- if (p[0] == '#' && p[1] != 0 && p[2] != 0)
+ int c = fz_readbyte(f);
+ switch (c)
{
- *q++ = fromhex(p[1]) * 16 + fromhex(p[2]);
- p += 3;
+ case ISWHITE:
+ case ISDELIM:
+ fz_unreadbyte(f);
+ goto end;
+ case EOF:
+ goto end;
+ case '#':
+ {
+ int d;
+ c = fz_readbyte(f);
+ switch (c)
+ {
+ case RANGE_0_9:
+ d = (c - '0') << 4;
+ break;
+ case RANGE_a_f:
+ d = (c - 'a' + 10) << 4;
+ break;
+ case RANGE_A_F:
+ d = (c - 'A' + 10) << 4;
+ break;
+ default:
+ fz_unreadbyte(f);
+ /* fallthrough */
+ case EOF:
+ goto end;
+ }
+ c = fz_readbyte(f);
+ switch (c)
+ {
+ case RANGE_0_9:
+ c -= '0';
+ break;
+ case RANGE_a_f:
+ c -= 'a' - 10;
+ break;
+ case RANGE_A_F:
+ c -= 'A' - 10;
+ break;
+ default:
+ fz_unreadbyte(f);
+ /* fallthrough */
+ case EOF:
+ *s++ = d;
+ n--;
+ goto end;
+ }
+ *s++ = d + c;
+ n--;
+ break;
+ }
+ default:
+ *s++ = c;
+ n--;
+ break;
}
- else
- *q++ = *p++;
}
- *q = '\0';
+end:
+ *s = '\0';
}
static int
@@ -136,64 +236,83 @@ lexstring(fz_stream *f, char *buf, int n)
while (s < e)
{
c = fz_readbyte(f);
- if (c == '(')
+ switch (c)
{
+ case EOF:
+ goto end;
+ case '(':
bal++;
*s++ = c;
- }
- else if (c == ')')
- {
+ break;
+ case ')':
bal --;
if (bal == 0)
- break;
+ goto end;
*s++ = c;
- }
- else if (c == '\\')
- {
+ break;
+ case '\\':
c = fz_readbyte(f);
- if (c == 'n') *s++ = '\n';
- else if (c == 'r') *s++ = '\r';
- else if (c == 't') *s++ = '\t';
- else if (c == 'b') *s++ = '\b';
- else if (c == 'f') *s++ = '\f';
- else if (c == '(') *s++ = '(';
- else if (c == ')') *s++ = ')';
- else if (c == '\\') *s++ = '\\';
-
- else if (c >= '0' && c <= '9')
+ switch (c)
{
+ case EOF:
+ goto end;
+ case 'n':
+ *s++ = '\n';
+ break;
+ case 'r':
+ *s++ = '\r';
+ break;
+ case 't':
+ *s++ = '\t';
+ break;
+ case 'b':
+ *s++ = '\b';
+ break;
+ case 'f':
+ *s++ = '\f';
+ break;
+ case '(':
+ *s++ = '(';
+ break;
+ case ')':
+ *s++ = ')';
+ break;
+ case '\\':
+ *s++ = '\\';
+ break;
+ case RANGE_0_9:
oct = c - '0';
- c = fz_peekbyte(f);
+ c = fz_readbyte(f);
if (c >= '0' && c <= '9')
{
- fz_readbyte(f);
oct = oct * 8 + (c - '0');
- c = fz_peekbyte(f);
+ c = fz_readbyte(f);
if (c >= '0' && c <= '9')
- {
- fz_readbyte(f);
oct = oct * 8 + (c - '0');
- }
+ else if (c != EOF)
+ fz_unreadbyte(f);
}
+ else if (c != EOF)
+ fz_unreadbyte(f);
*s++ = oct;
+ break;
+ case '\n':
+ break;
+ case '\r':
+ c = fz_readbyte(f);
+ if ((c != '\n') && (c != EOF))
+ fz_unreadbyte(f);
+ break;
+ default:
+ *s++ = c;
}
-
- else if (c == '\n')
- ;
- else if (c == '\r')
- {
- c = fz_peekbyte(f);
- if (c == '\n')
- fz_readbyte(f);
- }
- else *s++ = c;
- }
- else
- {
+ break;
+ default:
*s++ = c;
+ break;
}
}
-
+end:
return s - buf;
}
@@ -208,12 +327,11 @@ lexhexstring(fz_stream *f, char *buf, int n)
while (s < e)
{
c = fz_readbyte(f);
- if (c == '>')
- break;
- else if (iswhite(c))
- continue;
- else if (ishex(c))
+ switch (c)
{
+ case ISWHITE:
+ break;
+ case ISHEX:
if (x)
{
*s++ = a * 16 + fromhex(c);
@@ -224,30 +342,51 @@ lexhexstring(fz_stream *f, char *buf, int n)
a = fromhex(c);
x = !x;
}
- }
- else
break;
+ case '>':
+ default:
+ goto end;
+ }
}
-
+end:
return s - buf;
}
static pdf_token_e
pdf_tokenfromkeyword(char *key)
{
- if (!strcmp(key, "R")) return PDF_TR;
- if (!strcmp(key, "true")) return PDF_TTRUE;
- if (!strcmp(key, "false")) return PDF_TFALSE;
- if (!strcmp(key, "null")) return PDF_TNULL;
-
- if (!strcmp(key, "obj")) return PDF_TOBJ;
- if (!strcmp(key, "endobj")) return PDF_TENDOBJ;
- if (!strcmp(key, "stream")) return PDF_TSTREAM;
- if (!strcmp(key, "endstream")) return PDF_TENDSTREAM;
-
- if (!strcmp(key, "xref")) return PDF_TXREF;
- if (!strcmp(key, "trailer")) return PDF_TTRAILER;
- if (!strcmp(key, "startxref")) return PDF_TSTARTXREF;
+ switch (*key)
+ {
+ case 'R':
+ if (!strcmp(key, "R")) return PDF_TR;
+ break;
+ case 't':
+ if (!strcmp(key, "true")) return PDF_TTRUE;
+ if (!strcmp(key, "trailer")) return PDF_TTRAILER;
+ break;
+ case 'f':
+ if (!strcmp(key, "false")) return PDF_TFALSE;
+ break;
+ case 'n':
+ if (!strcmp(key, "null")) return PDF_TNULL;
+ break;
+ case 'o':
+ if (!strcmp(key, "obj")) return PDF_TOBJ;
+ break;
+ case 'e':
+ if (!strcmp(key, "endobj")) return PDF_TENDOBJ;
+ if (!strcmp(key, "endstream")) return PDF_TENDSTREAM;
+ break;
+ case 's':
+ if (!strcmp(key, "stream")) return PDF_TSTREAM;
+ if (!strcmp(key, "startxref")) return PDF_TSTARTXREF;
+ break;
+ case 'x':
+ if (!strcmp(key, "xref")) return PDF_TXREF;
+ break;
+ default:
+ break;
+ }
return PDF_TKEYWORD;
}
@@ -260,58 +399,41 @@ pdf_lex(pdf_token_e *tok, fz_stream *f, char *buf, int n, int *sl)
while (1)
{
- c = fz_peekbyte(f);
-
- if (c == EOF)
+ c = fz_readbyte(f);
+ switch (c)
{
+ case EOF:
*tok = PDF_TEOF;
goto cleanupokay;
- }
-
- else if (iswhite(c))
+ case ISWHITE:
lexwhite(f);
-
- else if (c == '%')
+ break;
+ case '%':
lexcomment(f);
-
- else if (c == '/')
- {
- fz_readbyte(f);
+ break;
+ case '/':
lexname(f, buf, n);
*sl = strlen(buf);
*tok = PDF_TNAME;
goto cleanupokay;
- }
-
- else if (c == '(')
- {
- fz_readbyte(f);
+ case '(':
*sl = lexstring(f, buf, n);
*tok = PDF_TSTRING;
goto cleanupokay;
- }
-
- else if (c == '<')
- {
- fz_readbyte(f);
- c = fz_peekbyte(f);
+ case '<':
+ c = fz_readbyte(f);
if (c == '<')
{
- fz_readbyte(f);
*tok = PDF_TODICT;
- goto cleanupokay;
}
else
{
+ fz_unreadbyte(f);
*sl = lexhexstring(f, buf, n);
*tok = PDF_TSTRING;
- goto cleanupokay;
}
- }
-
- else if (c == '>')
- {
- fz_readbyte(f);
+ goto cleanupokay;
+ case '>':
c = fz_readbyte(f);
if (c == '>')
{
@@ -320,62 +442,29 @@ pdf_lex(pdf_token_e *tok, fz_stream *f, char *buf, int n, int *sl)
}
*tok = PDF_TERROR;
goto cleanuperror;
- }
-
- else if (c == '[')
- {
- fz_readbyte(f);
+ case '[':
*tok = PDF_TOARRAY;
goto cleanupokay;
- }
-
- else if (c == ']')
- {
- fz_readbyte(f);
+ case ']':
*tok = PDF_TCARRAY;
goto cleanupokay;
- }
-
- else if (c == '{')
- {
- fz_readbyte(f);
+ case '{':
*tok = PDF_TOBRACE;
goto cleanupokay;
- }
-
- else if (c == '}')
- {
- fz_readbyte(f);
+ case '}':
*tok = PDF_TCBRACE;
goto cleanupokay;
- }
-
- else if (isnumber(c))
- {
- lexnumber(f, buf, n);
- *sl = strlen(buf);
- if (strchr(buf, '.'))
- {
- *tok = PDF_TREAL;
- goto cleanupokay;
- }
- *tok = PDF_TINT;
+ case ISNUMBER:
+ fz_unreadbyte(f);
+ *sl = lexnumber(f, buf, n, tok);
goto cleanupokay;
- }
-
- else if (isregular(c))
- {
+ default: /* isregular(c) */
+ fz_unreadbyte(f);
lexname(f, buf, n);
*sl = strlen(buf);
*tok = pdf_tokenfromkeyword(buf);
goto cleanupokay;
}
-
- else
- {
- *tok = PDF_TERROR;
- goto cleanuperror;
- }
}
cleanupokay:
@@ -397,4 +486,3 @@ cleanuperror:
*tok = PDF_TERROR;
return fz_throw("lexical error");
}
-