#include "mupdf/pdf.h"

/*
 * PDF tokenizer: turns a byte stream into PDF tokens (names, numbers,
 * strings, keywords and structural delimiters).
 *
 * The macros below are case-label lists for use inside switch statements.
 * Each one expands to "first : case second : case third ..." with no leading
 * `case` keyword and no trailing colon, so they are written `case IS_WHITE:`.
 */
#define IS_NUMBER \
	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
	case'4':case'5':case'6':case'7':case'8':case'9'
#define IS_WHITE \
	'\000':case'\011':case'\012':case'\014':case'\015':case'\040'
#define IS_HEX \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
	case'7':case'8':case'9':case'A':case'B':case'C':\
	case'D':case'E':case'F':case'a':case'b':case'c':\
	case'd':case'e':case'f'
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'

#define RANGE_0_9 \
	'0':case'1':case'2':case'3':case'4':case'5':\
	case'6':case'7':case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'

/* Return non-zero if ch is one of the six bytes listed in IS_WHITE
 * (NUL, TAB, LF, FF, CR, SPACE). */
static inline int iswhite(int ch)
{
	return
		ch == '\000' ||
		ch == '\011' ||
		ch == '\012' ||
		ch == '\014' ||
		ch == '\015' ||
		ch == '\040';
}

/* Convert one hexadecimal digit to its value (0..15).
 * Any non-hex input yields 0. */
static inline int unhex(int ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
	return 0;
}

/* Consume a run of whitespace bytes. The first non-whitespace byte is pushed
 * back onto the stream, unless the run was ended by EOF. */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
	int c;
	do {
		c = fz_read_byte(ctx, f);
		/* The (c <= 32) test is a cheap pre-filter: every whitespace
		 * byte is <= 32, so most bytes skip the iswhite() call. */
	} while ((c <= 32) && (iswhite(c)));
	if (c != EOF)
		fz_unread_byte(ctx, f);
}

/* Consume bytes up to and including the first LF or CR, or until EOF.
 * The caller has already consumed the leading '%'. */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
	int c;
	do {
		c = fz_read_byte(ctx, f);
	} while ((c != '\012') && (c != '\015') && (c != EOF));
}

/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepts any number of leading '-' (any one makes the result negative),
 * then any number of '+', then decimal digits, then an optional '.' followed
 * by more decimal digits. Parsing stops at the first byte that does not fit.
 *
 * s: NUL-terminated text to parse.
 * Returns the parsed value.
 */
static float acrobat_compatible_atof(const char *s)
{
	int neg = 0;
	unsigned int acc = 0;
	int i;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. The original note
		 * on this loop reports that tests show Acrobat handles
		 * overflows in exactly the same way we do:
		 * 123450000000000000000678 is read as 678.
		 *
		 * The accumulation is done in unsigned arithmetic so the
		 * wrap-around is well defined; overflowing a signed int
		 * would be undefined behaviour. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Reinterpret the wrapped value as a signed int, which is what the
	 * integer part has always been treated as. */
	i = (int)acc;

	if (*s == '.')
	{
		float v = (float)i;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}
	else
	{
		/* Negate in unsigned space: negating INT_MIN as a signed int
		 * would itself overflow. */
		return (float)(neg ? (int)(0u - acc) : i);
	}
}

/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of leading '-' (any one makes the result negative),
 * then any number of '+', then decimal digits. Parsing stops at the first
 * non-digit. Overflow wraps around rather than being reported.
 */
static int fast_atoi(const char *s)
{
	int neg = 0;
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. Unsigned arithmetic
		 * keeps the wrap-around well defined. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Negate before converting back to int so that a wrapped value equal
	 * to INT_MIN is not negated as a signed quantity. */
	return (int)(neg ? 0u - acc : acc);
}

/*
 * Lex a numeric token whose first byte, c, has already been consumed.
 *
 * Bytes are copied into buf->scratch until whitespace, a delimiter, EOF, or
 * a full buffer. A terminating whitespace/delimiter byte is pushed back.
 *
 * Returns PDF_TOK_REAL with buf->f set if a '.' was seen, otherwise
 * PDF_TOK_INT with buf->i set.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
	char *s = buf->scratch;
	char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
	char *isreal = (c == '.' ? s : NULL); /* position of the last '.' seen, if any */
	int neg = (c == '-'); /* count of '-' characters seen */

	*s++ = c;

	while (s < e)
	{
		int ch = fz_read_byte(ctx, f);
		switch (ch)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto end;
		case EOF:
			goto end;
		case '-':
			neg++;
			*s++ = ch;
			break;
		case '.':
			isreal = s;
			/* Fall through */
		default:
			*s++ = ch;
			break;
		}
	}
end:
	*s = '\0';
	if (isreal)
	{
		/* We'd like to use the fastest possible atof
		 * routine, but we'd rather match acrobats
		 * handling of broken numbers. As such, we
		 * spot common broken cases and call an
		 * acrobat compatible routine where required.
		 * The broken cases are: more than one '-', or a decimal
		 * point at offset 10 or later in the token. */
		if (neg > 1 || isreal - buf->scratch >= 10)
			buf->f = acrobat_compatible_atof(buf->scratch);
		else
			buf->f = fz_atof(buf->scratch);
		return PDF_TOK_REAL;
	}
	else
	{
		buf->i = fast_atoi(buf->scratch);
		return PDF_TOK_INT;
	}
}

/*
 * Lex a name or keyword into buf->scratch, NUL-terminate it, and store its
 * length in buf->len.
 *
 * Reading stops at whitespace or a delimiter (which is pushed back), at EOF,
 * or when the buffer is full. In the last case the rest of the token is left
 * unread in the stream.
 *
 * "#xx" is decoded as the byte with hex value xx. A '#' that is not followed
 * by a hex digit ends the name and the '#' itself is dropped. A '#' followed
 * by exactly one hex digit stores that digit as the high nibble and ends the
 * name.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	char *s = buf->scratch;
	int n = buf->size;

	/* n > 1 keeps one byte free for the terminator. */
	while (n > 1)
	{
		int c = fz_read_byte(ctx, f);
		switch (c)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto end;
		case EOF:
			goto end;
		case '#':
		{
			int d;
			c = fz_read_byte(ctx, f);
			switch (c)
			{
			case RANGE_0_9:
				d = (c - '0') << 4;
				break;
			case RANGE_a_f:
				d = (c - 'a' + 10) << 4;
				break;
			case RANGE_A_F:
				d = (c - 'A' + 10) << 4;
				break;
			default:
				fz_unread_byte(ctx, f);
				/* fallthrough */
			case EOF:
				goto end;
			}
			c = fz_read_byte(ctx, f);
			switch (c)
			{
			case RANGE_0_9:
				c -= '0';
				break;
			case RANGE_a_f:
				c -= 'a' - 10;
				break;
			case RANGE_A_F:
				c -= 'A' - 10;
				break;
			default:
				fz_unread_byte(ctx, f);
				/* fallthrough */
			case EOF:
				/* Only one hex digit: keep the high nibble. */
				*s++ = d;
				n--;
				goto end;
			}
			*s++ = d + c;
			n--;
			break;
		}
		default:
			*s++ = c;
			n--;
			break;
		}
	}
end:
	*s = '\0';
	buf->len = s - buf->scratch;
}

/*
 * Lex a literal string; the opening '(' has already been consumed.
 *
 * Balanced nested parentheses are kept as part of the string. Backslash
 * escapes (\n \r \t \b \f \( \) \\ and one to three octal digits) are
 * decoded; a backslash followed by a line ending (LF, CR or CR LF) is a line
 * continuation and produces nothing; a backslash before any other byte
 * yields that byte. EOF ends the string.
 *
 * The scratch buffer is grown as needed. The result is NOT NUL-terminated;
 * its length is stored in lb->len. Always returns PDF_TOK_STRING.
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + lb->size;
	int bal = 1; /* parenthesis nesting depth */
	int oct;
	int c;

	while (1)
	{
		/* At most one byte is stored per iteration, so checking for
		 * space once here is sufficient. */
		if (s == e)
		{
			s += pdf_lexbuf_grow(ctx, lb);
			e = lb->scratch + lb->size;
		}
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			goto end;
		case '(':
			bal++;
			*s++ = c;
			break;
		case ')':
			bal --;
			if (bal == 0)
				goto end;
			*s++ = c;
			break;
		case '\\':
			c = fz_read_byte(ctx, f);
			switch (c)
			{
			case EOF:
				goto end;
			case 'n':
				*s++ = '\n';
				break;
			case 'r':
				*s++ = '\r';
				break;
			case 't':
				*s++ = '\t';
				break;
			case 'b':
				*s++ = '\b';
				break;
			case 'f':
				*s++ = '\f';
				break;
			case '(':
				*s++ = '(';
				break;
			case ')':
				*s++ = ')';
				break;
			case '\\':
				*s++ = '\\';
				break;
			case RANGE_0_7:
				oct = c - '0';
				c = fz_read_byte(ctx, f);
				if (c >= '0' && c <= '7')
				{
					oct = oct * 8 + (c - '0');
					c = fz_read_byte(ctx, f);
					if (c >= '0' && c <= '7')
						oct = oct * 8 + (c - '0');
					else if (c != EOF)
						fz_unread_byte(ctx, f);
				}
				else if (c != EOF)
					fz_unread_byte(ctx, f);
				*s++ = oct;
				break;
			case '\n':
				break;
			case '\r':
				/* Swallow the LF of a CR LF pair as well. */
				c = fz_read_byte(ctx, f);
				if ((c != '\n') && (c != EOF))
					fz_unread_byte(ctx, f);
				break;
			default:
				*s++ = c;
			}
			break;
		default:
			*s++ = c;
			break;
		}
	}
end:
	lb->len = s - lb->scratch;
	return PDF_TOK_STRING;
}

/*
 * Lex a hexadecimal string; the opening '<' has already been consumed.
 *
 * Whitespace between digits is skipped. Any other non-hex byte is ignored
 * with a warning. The string ends at '>' or EOF.
 *
 * The scratch buffer is grown as needed. The result is NOT NUL-terminated;
 * its length is stored in lb->len. Always returns PDF_TOK_STRING.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + lb->size;
	int a = 0, x = 0; /* a: pending high nibble; x: non-zero while one is pending */
	int c;

	while (1)
	{
		if (s == e)
		{
			s += pdf_lexbuf_grow(ctx, lb);
			e = lb->scratch + lb->size;
		}
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case IS_WHITE:
			break;
		case IS_HEX:
			if (x)
			{
				*s++ = a * 16 + unhex(c);
				x = !x;
			}
			else
			{
				a = unhex(c);
				x = !x;
			}
			break;
		case '>':
			/* The PDF specification says that when the number of
			 * digits is odd, the missing final digit is assumed
			 * to be 0. There is room for this byte: nothing has
			 * been stored since the space check above. */
			if (x)
				*s++ = a * 16;
			goto end;
		case EOF:
			goto end;
		default:
			fz_warn(ctx, "ignoring invalid character in hex string");
		}
	}
end:
	lb->len = s - lb->scratch;
	return PDF_TOK_STRING;
}

/* Map a NUL-terminated bare word to its specific token, or to
 * PDF_TOK_KEYWORD if it is not one of the recognised words. The switch on
 * the first character avoids most of the strcmp calls. */
static pdf_token
pdf_token_from_keyword(const char *key)
{
	switch (*key)
	{
	case 'R':
		if (!strcmp(key, "R")) return PDF_TOK_R;
		break;
	case 't':
		if (!strcmp(key, "true")) return PDF_TOK_TRUE;
		if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
		break;
	case 'f':
		if (!strcmp(key, "false")) return PDF_TOK_FALSE;
		break;
	case 'n':
		if (!strcmp(key, "null")) return PDF_TOK_NULL;
		break;
	case 'o':
		if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
		break;
	case 'e':
		if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
		if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
		break;
	case 's':
		if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
		if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
		break;
	case 'x':
		if (!strcmp(key, "xref")) return PDF_TOK_XREF;
		break;
	default:
		break;
	}

	return PDF_TOK_KEYWORD;
}

/* Initialise a lexer buffer to use its embedded storage.
 * size is recorded as both the current and the base size. */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
	lb->size = lb->base_size = size;
	lb->len = 0;
	lb->scratch = &lb->buffer[0];
}

/* Release the scratch buffer if it has been grown beyond its base size.
 * A NULL lb is accepted. */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
	if (lb && lb->size != lb->base_size)
		fz_free(ctx, lb->scratch);
}

/*
 * Double the size of the scratch buffer, preserving its contents.
 *
 * The first growth moves the data out of the embedded buffer into a newly
 * allocated block; later growths resize that block.
 *
 * Returns the offset between the new and old scratch pointers, which callers
 * in this file add to their write cursor to rebase it.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
	char *old = lb->scratch;
	int newsize = lb->size * 2;
	if (lb->size == lb->base_size)
	{
		lb->scratch = fz_malloc(ctx, newsize);
		memcpy(lb->scratch, lb->buffer, lb->size);
	}
	else
	{
		lb->scratch = fz_resize_array(ctx, lb->scratch, newsize, 1);
	}
	lb->size = newsize;
	return lb->scratch - old;
}

/*
 * Read the next token from f.
 *
 * Whitespace and comments are skipped. A stray ')' or a lone '>' produces a
 * warning and is skipped. Token payloads are left in buf: scratch/len for
 * names, strings and keywords, i for integers, f for reals.
 *
 * Returns the token type, or PDF_TOK_EOF at end of input.
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(ctx, f);
			break;
		case '%':
			lex_comment(ctx, f);
			break;
		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;
		case '(':
			return lex_string(ctx, f, buf);
		case ')':
			fz_warn(ctx, "lexical error (unexpected ')')");
			continue;
		case '<':
			c = fz_read_byte(ctx, f);
			if (c == '<')
			{
				return PDF_TOK_OPEN_DICT;
			}
			/* Only push back a real byte. Every other look-ahead
			 * in this file avoids unreading after EOF. */
			if (c != EOF)
				fz_unread_byte(ctx, f);
			return lex_hex_string(ctx, f, buf);
		case '>':
			c = fz_read_byte(ctx, f);
			if (c == '>')
			{
				return PDF_TOK_CLOSE_DICT;
			}
			fz_warn(ctx, "lexical error (unexpected '>')");
			if (c == EOF)
			{
				return PDF_TOK_EOF;
			}
			fz_unread_byte(ctx, f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			return lex_number(ctx, f, buf, c);
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}

/*
 * Variant of pdf_lex that never produces string tokens.
 *
 * '(' and ')' are skipped silently, and a lone '<' or '>' is skipped without
 * starting a hex string or issuing a warning. Everything else is lexed as in
 * pdf_lex.
 *
 * Returns the token type, or PDF_TOK_EOF at end of input.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(ctx, f);
			break;
		case '%':
			lex_comment(ctx, f);
			break;
		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;
		case '(':
			continue;
		case ')':
			continue;
		case '<':
			c = fz_read_byte(ctx, f);
			if (c == '<')
			{
				return PDF_TOK_OPEN_DICT;
			}
			/* Only the '<' is skipped. The look-ahead byte is
			 * pushed back so it is lexed normally, as the '>'
			 * case below already does. */
			if (c != EOF)
				fz_unread_byte(ctx, f);
			continue;
		case '>':
			c = fz_read_byte(ctx, f);
			if (c == '>')
			{
				return PDF_TOK_CLOSE_DICT;
			}
			if (c == EOF)
			{
				return PDF_TOK_EOF;
			}
			fz_unread_byte(ctx, f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			return lex_number(ctx, f, buf, c);
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}

/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the form; buf supplies the payload (scratch/len, i or f) that
 * the lexer stored for that token. Token types without a dedicated case are
 * printed from buf->scratch as a plain string.
 */
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
	switch (tok)
	{
	case PDF_TOK_NAME:
		fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch);
		break;
	case PDF_TOK_STRING:
		/* The string lexers do not NUL-terminate their output, so
		 * make room for a terminator and add it before printing. */
		if (buf->len >= buf->size)
			pdf_lexbuf_grow(ctx, buf);
		buf->scratch[buf->len] = 0;
		fz_buffer_print_pdf_string(ctx, fzbuf, buf->scratch);
		break;
	case PDF_TOK_OPEN_DICT:
		fz_buffer_printf(ctx, fzbuf, "<<");
		break;
	case PDF_TOK_CLOSE_DICT:
		fz_buffer_printf(ctx, fzbuf, ">>");
		break;
	case PDF_TOK_OPEN_ARRAY:
		fz_buffer_printf(ctx, fzbuf, "[");
		break;
	case PDF_TOK_CLOSE_ARRAY:
		fz_buffer_printf(ctx, fzbuf, "]");
		break;
	case PDF_TOK_OPEN_BRACE:
		fz_buffer_printf(ctx, fzbuf, "{");
		break;
	case PDF_TOK_CLOSE_BRACE:
		fz_buffer_printf(ctx, fzbuf, "}");
		break;
	case PDF_TOK_INT:
		fz_buffer_printf(ctx, fzbuf, "%d", buf->i);
		break;
	case PDF_TOK_REAL:
		{
			fz_buffer_printf(ctx, fzbuf, "%g", buf->f);
		}
		break;
	default:
		fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch);
		break;
	}
}