From 0a927854a10e1e6b9770a81e2e1d9f3093631757 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Wed, 19 Jun 2013 15:29:44 +0200 Subject: Rearrange source files. --- source/pdf/pdf-lex.c | 553 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 source/pdf/pdf-lex.c (limited to 'source/pdf/pdf-lex.c') diff --git a/source/pdf/pdf-lex.c b/source/pdf/pdf-lex.c new file mode 100644 index 00000000..a8bf9f4b --- /dev/null +++ b/source/pdf/pdf-lex.c @@ -0,0 +1,553 @@ +#include "mupdf/pdf.h" + +#define IS_NUMBER \ + '+':case'-':case'.':case'0':case'1':case'2':case'3':\ + case'4':case'5':case'6':case'7':case'8':case'9' +#define IS_WHITE \ + '\000':case'\011':case'\012':case'\014':case'\015':case'\040' +#define IS_HEX \ + '0':case'1':case'2':case'3':case'4':case'5':case'6':\ + case'7':case'8':case'9':case'A':case'B':case'C':\ + case'D':case'E':case'F':case'a':case'b':case'c':\ + case'd':case'e':case'f' +#define IS_DELIM \ + '(':case')':case'<':case'>':case'[':case']':case'{':\ + case'}':case'/':case'%' + +#define RANGE_0_9 \ + '0':case'1':case'2':case'3':case'4':case'5':\ + case'6':case'7':case'8':case'9' +#define RANGE_a_f \ + 'a':case'b':case'c':case'd':case'e':case'f' +#define RANGE_A_F \ + 'A':case'B':case'C':case'D':case'E':case'F' + +static inline int iswhite(int ch) +{ + return + ch == '\000' || + ch == '\011' || + ch == '\012' || + ch == '\014' || + ch == '\015' || + ch == '\040'; +} + +static inline int unhex(int ch) +{ + if (ch >= '0' && ch <= '9') return ch - '0'; + if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA; + if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA; + return 0; +} + +static void +lex_white(fz_stream *f) +{ + int c; + do { + c = fz_read_byte(f); + } while ((c <= 32) && (iswhite(c))); + if (c != EOF) + fz_unread_byte(f); +} + +static void +lex_comment(fz_stream *f) +{ + int c; + do { + c = fz_read_byte(f); + } while ((c != '\012') && (c != '\015') && (c != EOF)); +} + +static int +lex_number(fz_stream *f, pdf_lexbuf *buf, int c) +{ + int neg = 0; + int i = 0; + int n; + int d; + float v; + + /* Initially we might have +, -, . or a digit */ + switch (c) + { + case '.': + goto loop_after_dot; + case '-': + neg = 1; + break; + case '+': + break; + default: /* Must be a digit */ + i = c - '0'; + break; + } + + while (1) + { + c = fz_read_byte(f); + switch (c) + { + case '.': + goto loop_after_dot; + case RANGE_0_9: + i = 10*i + c - '0'; + /* FIXME: Need overflow check here; do we care? */ + break; + default: + fz_unread_byte(f); + /* Fallthrough */ + case EOF: + if (neg) + i = -i; + buf->i = i; + return PDF_TOK_INT; + } + } + + /* In here, we've seen a dot, so can accept just digits */ +loop_after_dot: + n = 0; + d = 1; + while (1) + { + c = fz_read_byte(f); + switch (c) + { + case RANGE_0_9: + if (d >= INT_MAX/10) + goto underflow; + n = n*10 + (c - '0'); + d *= 10; + break; + default: + fz_unread_byte(f); + /* Fallthrough */ + case EOF: + v = (float)i + ((float)n / (float)d); + if (neg) + v = -v; + buf->f = v; + return PDF_TOK_REAL; + } + } + +underflow: + /* Ignore any digits after here, because they are too small */ + while (1) + { + c = fz_read_byte(f); + switch (c) + { + case RANGE_0_9: + break; + default: + fz_unread_byte(f); + /* Fallthrough */ + case EOF: + v = (float)i + ((float)n / (float)d); + if (neg) + v = -v; + buf->f = v; + return PDF_TOK_REAL; + } + } +} + +static void +lex_name(fz_stream *f, pdf_lexbuf *buf) +{ + char *s = buf->scratch; + int n = buf->size; + + while (n > 1) + { + int c = fz_read_byte(f); + switch (c) + { + case IS_WHITE: + case IS_DELIM: + fz_unread_byte(f); + goto end; + case EOF: + goto end; + case '#': + { + int d; + c = fz_read_byte(f); + switch (c) + { + case RANGE_0_9: + d = (c - '0') << 4; + break; + case RANGE_a_f: + d = (c - 'a' + 10) << 4; + break; + case RANGE_A_F: + d = (c - 'A' + 10) << 4; + break; + default: + fz_unread_byte(f); + /* fallthrough */ + case EOF: + goto end; + } + c = fz_read_byte(f); + switch (c) + { + case RANGE_0_9: + c -= '0'; + break; + case RANGE_a_f: + c -= 'a' - 10; + break; + case RANGE_A_F: + c -= 'A' - 10; + break; + default: + fz_unread_byte(f); + /* fallthrough */ + case EOF: + *s++ = d; + n--; + goto end; + } + *s++ = d + c; + n--; + break; + } + default: + *s++ = c; + n--; + break; + } + } +end: + *s = '\0'; + buf->len = s - buf->scratch; +} + +static int +lex_string(fz_stream *f, pdf_lexbuf *lb) +{ + char *s = lb->scratch; + char *e = s + lb->size; + int bal = 1; + int oct; + int c; + + while (1) + { + if (s == e) + { + s += pdf_lexbuf_grow(lb); + e = lb->scratch + lb->size; + } + c = fz_read_byte(f); + switch (c) + { + case EOF: + goto end; + case '(': + bal++; + *s++ = c; + break; + case ')': + bal --; + if (bal == 0) + goto end; + *s++ = c; + break; + case '\\': + c = fz_read_byte(f); + switch (c) + { + case EOF: + goto end; + case 'n': + *s++ = '\n'; + break; + case 'r': + *s++ = '\r'; + break; + case 't': + *s++ = '\t'; + break; + case 'b': + *s++ = '\b'; + break; + case 'f': + *s++ = '\f'; + break; + case '(': + *s++ = '('; + break; + case ')': + *s++ = ')'; + break; + case '\\': + *s++ = '\\'; + break; + case RANGE_0_9: + oct = c - '0'; + c = fz_read_byte(f); + if (c >= '0' && c <= '9') + { + oct = oct * 8 + (c - '0'); + c = fz_read_byte(f); + if (c >= '0' && c <= '9') + oct = oct * 8 + (c - '0'); + else if (c != EOF) + fz_unread_byte(f); + } + else if (c != EOF) + fz_unread_byte(f); + *s++ = oct; + break; + case '\n': + break; + case '\r': + c = fz_read_byte(f); + if ((c != '\n') && (c != EOF)) + fz_unread_byte(f); + break; + default: + *s++ = c; + } + break; + default: + *s++ = c; + break; + } + } +end: + lb->len = s - lb->scratch; + return PDF_TOK_STRING; +} + +static int +lex_hex_string(fz_stream *f, pdf_lexbuf *lb) +{ + char *s = lb->scratch; + char *e = s + lb->size; + int a = 0, x = 0; + int c; + + while (1) + { + if (s == e) + { + s += pdf_lexbuf_grow(lb); + e = lb->scratch + lb->size; + } + c = fz_read_byte(f); + switch (c) + { + case IS_WHITE: + break; + case IS_HEX: + if (x) + { + *s++ = a * 16 + unhex(c); + x = !x; + } + else + { + a = unhex(c); + x = !x; + } + break; + case '>': + case EOF: + goto end; + default: + fz_warn(f->ctx, "ignoring invalid character in hex string"); + } + } +end: + lb->len = s - lb->scratch; + return PDF_TOK_STRING; +} + +static pdf_token +pdf_token_from_keyword(char *key) +{ + switch (*key) + { + case 'R': + if (!strcmp(key, "R")) return PDF_TOK_R; + break; + case 't': + if (!strcmp(key, "true")) return PDF_TOK_TRUE; + if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; + break; + case 'f': + if (!strcmp(key, "false")) return PDF_TOK_FALSE; + break; + case 'n': + if (!strcmp(key, "null")) return PDF_TOK_NULL; + break; + case 'o': + if (!strcmp(key, "obj")) return PDF_TOK_OBJ; + break; + case 'e': + if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; + if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; + break; + case 's': + if (!strcmp(key, "stream")) return PDF_TOK_STREAM; + if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; + break; + case 'x': + if (!strcmp(key, "xref")) return PDF_TOK_XREF; + break; + default: + break; + } + + return PDF_TOK_KEYWORD; +} + +void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) +{ + lb->size = lb->base_size = size; + lb->len = 0; + lb->ctx = ctx; + lb->scratch = &lb->buffer[0]; +} + +void pdf_lexbuf_fin(pdf_lexbuf *lb) +{ + if (lb && lb->size != lb->base_size) + fz_free(lb->ctx, lb->scratch); +} + +ptrdiff_t pdf_lexbuf_grow(pdf_lexbuf *lb) +{ + char *old = lb->scratch; + int newsize = lb->size * 2; + if (lb->size == lb->base_size) + { + lb->scratch = fz_malloc(lb->ctx, newsize); + memcpy(lb->scratch, lb->buffer, lb->size); + } + else + { + lb->scratch = fz_resize_array(lb->ctx, lb->scratch, newsize, 1); + } + lb->size = newsize; + return lb->scratch - old; +} + +pdf_token +pdf_lex(fz_stream *f, pdf_lexbuf *buf) +{ + while (1) + { + int c = fz_read_byte(f); + switch (c) + { + case EOF: + return PDF_TOK_EOF; + case IS_WHITE: + lex_white(f); + break; + case '%': + lex_comment(f); + break; + case '/': + lex_name(f, buf); + return PDF_TOK_NAME; + case '(': + return lex_string(f, buf); + case ')': + fz_warn(f->ctx, "lexical error (unexpected ')')"); + continue; + case '<': + c = fz_read_byte(f); + if (c == '<') + { + return PDF_TOK_OPEN_DICT; + } + else + { + fz_unread_byte(f); + return lex_hex_string(f, buf); + } + case '>': + c = fz_read_byte(f); + if (c == '>') + { + return PDF_TOK_CLOSE_DICT; + } + fz_warn(f->ctx, "lexical error (unexpected '>')"); + fz_unread_byte(f); + continue; + case '[': + return PDF_TOK_OPEN_ARRAY; + case ']': + return PDF_TOK_CLOSE_ARRAY; + case '{': + return PDF_TOK_OPEN_BRACE; + case '}': + return PDF_TOK_CLOSE_BRACE; + case IS_NUMBER: + return lex_number(f, buf, c); + default: /* isregular: !isdelim && !iswhite && c != EOF */ + fz_unread_byte(f); + lex_name(f, buf); + return pdf_token_from_keyword(buf->scratch); + } + } +} + +void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) +{ + switch (tok) + { + case PDF_TOK_NAME: + fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch); + break; + case PDF_TOK_STRING: + if (buf->len >= buf->size) + pdf_lexbuf_grow(buf); + buf->scratch[buf->len] = 0; + fz_buffer_cat_pdf_string(ctx, fzbuf, buf->scratch); + break; + case PDF_TOK_OPEN_DICT: + fz_buffer_printf(ctx, fzbuf, "<<"); + break; + case PDF_TOK_CLOSE_DICT: + fz_buffer_printf(ctx, fzbuf, ">>"); + break; + case PDF_TOK_OPEN_ARRAY: + fz_buffer_printf(ctx, fzbuf, "["); + break; + case PDF_TOK_CLOSE_ARRAY: + fz_buffer_printf(ctx, fzbuf, "]"); + break; + case PDF_TOK_OPEN_BRACE: + fz_buffer_printf(ctx, fzbuf, "{"); + break; + case PDF_TOK_CLOSE_BRACE: + fz_buffer_printf(ctx, fzbuf, "}"); + break; + case PDF_TOK_INT: + fz_buffer_printf(ctx, fzbuf, "%d", buf->i); + break; + case PDF_TOK_REAL: + { + char sbuf[256]; + sprintf(sbuf, "%g", buf->f); + if (strchr(sbuf, 'e')) /* bad news! */ + sprintf(sbuf, fabsf(buf->f) > 1 ? "%1.1f" : "%1.8f", buf->f); + fz_buffer_printf(ctx, fzbuf, "%s", sbuf); + } + break; + default: + fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch); + break; + } +} -- cgit v1.2.3