summaryrefslogtreecommitdiff
path: root/source/html/css-parse.c
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2014-09-08 15:49:59 +0200
committerTor Andersson <tor.andersson@artifex.com>2014-12-03 12:25:51 +0100
commit9d1482bc78d72ba330c2130170b49b4e18702623 (patch)
treee7f87a3266419e7bb238095e3fa146953dcb9e28 /source/html/css-parse.c
parent4b8638cfa35ecacf7418ec8933f971577652bb79 (diff)
downloadmupdf-9d1482bc78d72ba330c2130170b49b4e18702623.tar.xz
html: CSS lexer and parser.
Diffstat (limited to 'source/html/css-parse.c')
-rw-r--r--source/html/css-parse.c734
1 files changed, 734 insertions, 0 deletions
diff --git a/source/html/css-parse.c b/source/html/css-parse.c
new file mode 100644
index 00000000..5f7fd254
--- /dev/null
+++ b/source/html/css-parse.c
@@ -0,0 +1,734 @@
+#include "mupdf/fitz.h"
+#include "mupdf/html.h"
+
+/* State shared by the CSS lexer and the recursive-descent parser in this file. */
+struct lexbuf
+{
+ fz_context *ctx; /* context handed to fz_throw when an error is reported */
+ const char *s; /* read cursor into the source text */
+ int lookahead; /* most recent token returned by css_lex (stored by next()) */
+ int c; /* current input character; 0 is treated as end of input */
+ int color; /* numeric value of the last '#' colour token lexed */
+ int string_len; /* number of bytes pushed into string[] for the current token */
+ char string[1024]; /* text of the current token */
+};
+
+/*
+ * Load the next input byte into buf->c.
+ *
+ * The byte is read as an unsigned char so that values 128..255 stay positive
+ * on platforms where plain char is signed (isnmstart/isnmchar test for that
+ * range). Once the terminating NUL has been reached the cursor stays on it,
+ * so repeated calls at end of input keep yielding 0 instead of reading past
+ * the end of the source text.
+ *
+ * TODO: decode UTF-8 with fz_chartorune instead of reading single bytes.
+ */
+static void css_lex_next(struct lexbuf *buf)
+{
+	buf->c = (unsigned char)*buf->s;
+	if (buf->c != 0)
+		buf->s++;
+}
+
+/*
+ * Prepare buf for lexing the text at s: remember the context, reset the
+ * colour and token-text state, and load the first character into buf->c.
+ * buf->lookahead is not set here; callers prime it with next().
+ */
+static void css_lex_init(fz_context *ctx, struct lexbuf *buf, const char *s)
+{
+	buf->ctx = ctx;
+	buf->color = 0;
+	buf->string_len = 0;
+
+	buf->s = s;
+	buf->c = 0;
+	css_lex_next(buf);
+}
+
+/* Return 1 if c is a space, tab, carriage return, newline or form feed; else 0. */
+static int iswhite(int c)
+{
+	switch (c)
+	{
+	case ' ':
+	case '\t':
+	case '\r':
+	case '\n':
+	case '\f':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Return non-zero if c may start a CSS name: a backslash (escape), an
+ * underscore, an ASCII letter of either case, or a byte in 128..255.
+ * Upper-case letters are accepted because CSS identifiers may contain them.
+ */
+static int isnmstart(int c)
+{
+	return c == '\\' || c == '_' ||
+		(c >= 'a' && c <= 'z') ||
+		(c >= 'A' && c <= 'Z') ||
+		(c >= 128 && c <= 255);
+}
+
+/*
+ * Return non-zero if c may continue a CSS name: everything isnmstart allows
+ * (backslash, underscore, ASCII letters of either case, bytes 128..255) plus
+ * decimal digits and '-'.
+ */
+static int isnmchar(int c)
+{
+	return c == '\\' || c == '_' || c == '-' ||
+		(c >= 'a' && c <= 'z') ||
+		(c >= 'A' && c <= 'Z') ||
+		(c >= '0' && c <= '9') ||
+		(c >= 128 && c <= 255);
+}
+
+/*
+ * Append c to the current token text. Throws "token too long" when adding
+ * the byte would reach the end of buf->string.
+ */
+static void css_push_char(struct lexbuf *buf, int c)
+{
+	if (buf->string_len + 1 < nelem(buf->string))
+	{
+		buf->string[buf->string_len++] = c;
+		return;
+	}
+	fz_throw(buf->ctx, FZ_ERROR_GENERIC, "token too long");
+}
+
+/*
+ * If the current character equals t, consume it and return 1;
+ * otherwise leave the input untouched and return 0.
+ */
+static int css_lex_accept(struct lexbuf *buf, int t)
+{
+	if (buf->c != t)
+		return 0;
+	css_lex_next(buf);
+	return 1;
+}
+
+static void css_lex_expect(struct lexbuf *buf, int t)
+{
+ if (!css_lex_accept(buf, t))
+ fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected '%c'", t);
+}
+
+/*
+ * Test whether c is a hexadecimal digit (0-9, a-f, A-F).
+ * On success store its value (0..15) in *v and return 1.
+ * On failure return 0 and leave *v untouched.
+ */
+static int ishex(int c, int *v)
+{
+	int digit;
+
+	if (c >= '0' && c <= '9')
+		digit = c - '0';
+	else if (c >= 'a' && c <= 'f')
+		digit = c - 'a' + 0xA;
+	else if (c >= 'A' && c <= 'F')
+		digit = c - 'A' + 0xA;
+	else
+		return 0;
+
+	*v = digit;
+	return 1;
+}
+
+/*
+ * If the current character is a hex digit, store its value in *v, consume
+ * it and return 1; otherwise return 0 without consuming anything.
+ */
+static int css_lex_accept_hex(struct lexbuf *buf, int *v)
+{
+	if (!ishex(buf->c, v))
+		return 0;
+	css_lex_next(buf);
+	return 1;
+}
+
+/*
+ * Lex a numeric token into buf->string (appending to whatever sign or
+ * leading '.' the caller already pushed): digits, an optional '.' with more
+ * digits, then an optional suffix. Returns CSS_PERCENT when followed by '%',
+ * CSS_LENGTH when followed by a name (the unit is kept in the token text),
+ * and CSS_NUMBER otherwise. The token text is NUL-terminated.
+ */
+static int css_lex_number(struct lexbuf *buf)
+{
+	int token = CSS_NUMBER;
+
+	for (; buf->c >= '0' && buf->c <= '9'; css_lex_next(buf))
+		css_push_char(buf, buf->c);
+
+	if (css_lex_accept(buf, '.'))
+	{
+		css_push_char(buf, '.');
+		for (; buf->c >= '0' && buf->c <= '9'; css_lex_next(buf))
+			css_push_char(buf, buf->c);
+	}
+
+	if (css_lex_accept(buf, '%'))
+	{
+		css_push_char(buf, '%');
+		token = CSS_PERCENT;
+	}
+	else if (isnmstart(buf->c))
+	{
+		/* the first unit character only has to be a name start */
+		do
+		{
+			css_push_char(buf, buf->c);
+			css_lex_next(buf);
+		}
+		while (isnmchar(buf->c));
+		token = CSS_LENGTH;
+	}
+
+	css_push_char(buf, 0);
+	return token;
+}
+
+/*
+ * Append the remaining name characters of a keyword to buf->string (the
+ * caller has already pushed the leading characters), NUL-terminate it and
+ * return CSS_KEYWORD.
+ */
+static int css_lex_keyword(struct lexbuf *buf)
+{
+	for (; isnmchar(buf->c); css_lex_next(buf))
+		css_push_char(buf, buf->c);
+	css_push_char(buf, 0);
+	return CSS_KEYWORD;
+}
+
+/*
+ * Lex the body of a quoted string into buf->string. The opening quote q has
+ * already been consumed by the caller; this consumes up to and including the
+ * closing quote, NUL-terminates the token text and returns CSS_STRING.
+ *
+ * Backslash handling: \n, \r and \f produce the corresponding control
+ * characters; a backslash followed by a form feed, newline, or CR (with an
+ * optional LF) is a line continuation and produces nothing; any other
+ * escaped character is copied literally.
+ *
+ * Throws a syntax error if the input ends before the closing quote,
+ * including when the last character of the input is a lone backslash.
+ */
+static int css_lex_string(struct lexbuf *buf, int q)
+{
+	while (buf->c && buf->c != q)
+	{
+		if (css_lex_accept(buf, '\\'))
+		{
+			/* Backslash at end of input: stop here so we never push the
+			 * terminator or advance past it; the expect below reports
+			 * the missing closing quote. */
+			if (buf->c == 0)
+				break;
+
+			if (css_lex_accept(buf, 'n'))
+				css_push_char(buf, '\n');
+			else if (css_lex_accept(buf, 'r'))
+				css_push_char(buf, '\r');
+			else if (css_lex_accept(buf, 'f'))
+				css_push_char(buf, '\f');
+			else if (css_lex_accept(buf, '\f'))
+				/* line continuation */ ;
+			else if (css_lex_accept(buf, '\n'))
+				/* line continuation */ ;
+			else if (css_lex_accept(buf, '\r'))
+				css_lex_accept(buf, '\n'); /* CR LF line continuation */
+			else
+			{
+				css_push_char(buf, buf->c);
+				css_lex_next(buf);
+			}
+		}
+		else
+		{
+			css_push_char(buf, buf->c);
+			css_lex_next(buf);
+		}
+	}
+	css_lex_expect(buf, q);
+	css_push_char(buf, 0);
+	return CSS_STRING;
+}
+
+/*
+ * Return the next token from the input.
+ *
+ * Whitespace, comments and the HTML comment delimiters <!-- and --> are
+ * skipped. Tokens with text (CSS_KEYWORD, CSS_NUMBER, CSS_LENGTH,
+ * CSS_PERCENT, CSS_STRING, CSS_COLOR, CSS_URI) leave that text
+ * NUL-terminated in buf->string. Any other character is returned as itself.
+ * EOF is returned at end of input.
+ *
+ * Colours: '#' followed by exactly three or six hex digits (and not by a
+ * further name character) yields CSS_COLOR; three-digit colours are expanded
+ * by duplicating each digit (#abc == #aabbcc). Any other '#' is returned as
+ * the plain character '#' with nothing else consumed, so the parser can
+ * treat it as the start of an id condition. NOTE: an id that happens to be
+ * exactly three or six hex digits is still lexed as a colour.
+ *
+ * URIs: url(...) accepts a quoted string or a run of characters up to
+ * whitespace or ')', with optional surrounding whitespace.
+ *
+ * Throws on unterminated comments and on malformed CDO/CDC/url tokens.
+ */
+static int css_lex(struct lexbuf *buf)
+{
+	int t;
+
+	// TODO: keyword escape sequences
+
+	buf->string_len = 0;
+
+	while (buf->c)
+	{
+		while (iswhite(buf->c))
+			css_lex_next(buf);
+
+		if (buf->c == 0)
+			break;
+
+		if (css_lex_accept(buf, '/'))
+		{
+			if (css_lex_accept(buf, '*'))
+			{
+				int closed = 0;
+				while (buf->c && !closed)
+				{
+					if (css_lex_accept(buf, '*'))
+					{
+						while (buf->c == '*')
+							css_lex_next(buf);
+						if (css_lex_accept(buf, '/'))
+							closed = 1;
+					}
+					else
+					{
+						css_lex_next(buf);
+					}
+				}
+				if (!closed)
+					fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: unterminated comment");
+				continue; /* comment skipped: restart the token scan */
+			}
+			return '/';
+		}
+
+		if (css_lex_accept(buf, '<'))
+		{
+			if (css_lex_accept(buf, '!'))
+			{
+				css_lex_expect(buf, '-');
+				css_lex_expect(buf, '-');
+				continue; /* ignore CDO */
+			}
+			return '<';
+		}
+
+		if (css_lex_accept(buf, '-'))
+		{
+			if (css_lex_accept(buf, '-'))
+			{
+				css_lex_expect(buf, '>');
+				continue; /* ignore CDC */
+			}
+			if (buf->c >= '0' && buf->c <= '9')
+			{
+				css_push_char(buf, '-');
+				return css_lex_number(buf);
+			}
+			if (isnmstart(buf->c))
+			{
+				css_push_char(buf, '-');
+				css_push_char(buf, buf->c);
+				css_lex_next(buf);
+				return css_lex_keyword(buf);
+			}
+			return '-';
+		}
+
+		if (css_lex_accept(buf, '.'))
+		{
+			if (buf->c >= '0' && buf->c <= '9')
+			{
+				css_push_char(buf, '.');
+				return css_lex_number(buf);
+			}
+			return '.';
+		}
+
+		if (css_lex_accept(buf, '#'))
+		{
+			/* Remember the position just after '#' so we can back out
+			 * if what follows is not a well-formed colour. */
+			const char *save_s = buf->s;
+			int save_c = buf->c;
+			int a, b, c, d, e, f;
+			int ok = 0;
+
+			if (css_lex_accept_hex(buf, &a) &&
+				css_lex_accept_hex(buf, &b) &&
+				css_lex_accept_hex(buf, &c))
+			{
+				if (css_lex_accept_hex(buf, &d))
+				{
+					if (css_lex_accept_hex(buf, &e) &&
+						css_lex_accept_hex(buf, &f))
+					{
+						buf->color = (a << 20) | (b << 16) | (c << 12) | (d << 8) | (e << 4) | f;
+						ok = 1;
+					}
+				}
+				else
+				{
+					/* #rgb is shorthand for #rrggbb */
+					buf->color = (a << 20) | (a << 16) | (b << 12) | (b << 8) | (c << 4) | c;
+					ok = 1;
+				}
+			}
+
+			if (ok && !isnmchar(buf->c))
+			{
+				/* at most six hex digits plus NUL: always fits */
+				sprintf(buf->string, "%06x", buf->color); // XXX
+				return CSS_COLOR;
+			}
+
+			/* Not a colour: rewind and hand '#' to the parser. */
+			buf->s = save_s;
+			buf->c = save_c;
+			return '#';
+		}
+
+		if (css_lex_accept(buf, '"'))
+			return css_lex_string(buf, '"');
+		if (css_lex_accept(buf, '\''))
+			return css_lex_string(buf, '\'');
+
+		if (buf->c >= '0' && buf->c <= '9')
+			return css_lex_number(buf);
+
+		if (css_lex_accept(buf, 'u'))
+		{
+			if (css_lex_accept(buf, 'r'))
+			{
+				if (css_lex_accept(buf, 'l'))
+				{
+					if (css_lex_accept(buf, '('))
+					{
+						while (iswhite(buf->c))
+							css_lex_next(buf);
+						if (css_lex_accept(buf, '"'))
+						{
+							css_lex_string(buf, '"');
+						}
+						else if (css_lex_accept(buf, '\''))
+						{
+							css_lex_string(buf, '\'');
+						}
+						else
+						{
+							/* unquoted url: runs to whitespace or ')' */
+							while (buf->c && buf->c != ')' && !iswhite(buf->c))
+							{
+								css_push_char(buf, buf->c);
+								css_lex_next(buf);
+							}
+							css_push_char(buf, 0);
+						}
+						while (iswhite(buf->c))
+							css_lex_next(buf);
+						css_lex_expect(buf, ')');
+						return CSS_URI;
+					}
+					css_push_char(buf, 'u');
+					css_push_char(buf, 'r');
+					css_push_char(buf, 'l');
+					return css_lex_keyword(buf);
+				}
+				css_push_char(buf, 'u');
+				css_push_char(buf, 'r');
+				return css_lex_keyword(buf);
+			}
+			css_push_char(buf, 'u');
+			return css_lex_keyword(buf);
+		}
+
+		if (isnmstart(buf->c))
+		{
+			css_push_char(buf, buf->c);
+			css_lex_next(buf);
+			return css_lex_keyword(buf);
+		}
+
+		t = buf->c;
+		css_lex_next(buf);
+		return t;
+	}
+	return EOF;
+}
+
+/* Advance the parser: lex one token and store it as the new lookahead. */
+static void next(struct lexbuf *buf)
+{
+	int token = css_lex(buf);
+	buf->lookahead = token;
+}
+
+/*
+ * If the lookahead token is t, consume it (fetching the next token) and
+ * return 1; otherwise return 0 and leave the lookahead in place.
+ */
+static int accept(struct lexbuf *buf, int t)
+{
+	if (buf->lookahead != t)
+		return 0;
+	next(buf);
+	return 1;
+}
+
+/*
+ * Consume the token t or throw a syntax error. Token values below 256 are
+ * reported as the character they represent; others get a generic message.
+ */
+static void expect(struct lexbuf *buf, int t)
+{
+	if (!accept(buf, t))
+	{
+		if (t < 256)
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected '%c'", t);
+		else
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: unexpected token");
+	}
+}
+
+/* Return 1 if token t starts a selector condition (':', '.', '#' or '['); else 0. */
+static int iscond(int t)
+{
+	switch (t)
+	{
+	case ':':
+	case '.':
+	case '#':
+	case '[':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static struct value *parse_value_list(struct lexbuf *buf);
+
+/*
+ * Parse a single value: a keyword (optionally followed by a parenthesised
+ * argument list, in which case the value's type becomes '('), a number,
+ * length, percentage, string, colour or URI token, or one of the separators
+ * ',' and '/'. Throws if the lookahead is none of these.
+ */
+static struct value *parse_value(struct lexbuf *buf)
+{
+	struct value *val;
+
+	switch (buf->lookahead)
+	{
+	case CSS_KEYWORD:
+		val = new_value(CSS_KEYWORD, buf->string);
+		next(buf);
+		if (accept(buf, '('))
+		{
+			/* keyword followed by '(' is a function call */
+			val->type = '(';
+			val->args = parse_value_list(buf);
+			expect(buf, ')');
+		}
+		return val;
+
+	case CSS_NUMBER:
+	case CSS_LENGTH:
+	case CSS_PERCENT:
+	case CSS_STRING:
+	case CSS_COLOR:
+	case CSS_URI:
+		val = new_value(buf->lookahead, buf->string);
+		next(buf);
+		return val;
+	}
+
+	if (accept(buf, ','))
+		return new_value(',', ",");
+
+	if (accept(buf, '/'))
+		return new_value('/', "/");
+
+	fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected value");
+}
+
+/*
+ * Parse values until '}', ';', '!', ')' or end of input. Each new value is
+ * linked in front of the previous ones, so the returned list is in reverse
+ * source order. Returns NULL when there are no values.
+ */
+static struct value *parse_value_list(struct lexbuf *buf)
+{
+	struct value *head = NULL;
+	struct value *item;
+
+	for (;;)
+	{
+		int t = buf->lookahead;
+		if (t == '}' || t == ';' || t == '!' || t == ')' || t == EOF)
+			break;
+
+		item = parse_value(buf);
+		item->next = head;
+		head = item;
+	}
+
+	return head;
+}
+
+/*
+ * Parse one declaration: a property name keyword, ':', a value list, and an
+ * optional '!' followed by a keyword. The keyword after '!' is consumed but
+ * neither checked nor recorded here.
+ */
+static struct property *parse_declaration(struct lexbuf *buf)
+{
+	struct property *prop;
+
+	if (buf->lookahead != CSS_KEYWORD)
+		fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected keyword in property");
+
+	/* create the property from the name before moving off the token */
+	prop = new_property(buf->string, NULL, 0);
+	next(buf);
+
+	expect(buf, ':');
+	prop->value = parse_value_list(buf);
+
+	/* !important */
+	if (accept(buf, '!'))
+	{
+		expect(buf, CSS_KEYWORD);
+	}
+
+	return prop;
+}
+
+/*
+ * Parse a ';'-separated list of declarations up to (not including) the
+ * closing '}'. Empty declarations after a ';' are tolerated. New properties
+ * are linked in front of earlier ones, so the result is in reverse source
+ * order. Returns NULL for an empty block.
+ */
+static struct property *parse_declaration_list(struct lexbuf *buf)
+{
+	struct property *head;
+	struct property *item;
+
+	if (buf->lookahead == '}')
+		return NULL;
+
+	head = parse_declaration(buf);
+
+	while (accept(buf, ';'))
+	{
+		/* skip stray ';' and a trailing ';' before '}' */
+		if (buf->lookahead == '}' || buf->lookahead == ';')
+			continue;
+
+		item = parse_declaration(buf);
+		item->next = head;
+		head = item;
+	}
+
+	return head;
+}
+
+/*
+ * Parse the value of an attribute condition: a keyword or a string token.
+ * Returns a copy of the token text made with strdup (the pointer is handed
+ * back to the caller). Throws if the copy cannot be allocated or if the
+ * lookahead is neither a keyword nor a string.
+ */
+static const char *parse_attrib_value(struct lexbuf *buf)
+{
+	const char *s;
+
+	if (buf->lookahead == CSS_KEYWORD || buf->lookahead == CSS_STRING)
+	{
+		s = strdup(buf->string);
+		/* strdup returns NULL on allocation failure; do not hand that
+		 * back as if it were a valid attribute value */
+		if (s == NULL)
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "out of memory: cannot copy attribute value");
+		next(buf);
+		return s;
+	}
+
+	fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected attribute value");
+}
+
+/*
+ * Parse one selector condition:
+ *   :keyword   -> type ':', key "pseudo"
+ *   .keyword   -> type '.', key "class"
+ *   #keyword   -> type '#', key "id"
+ *   [keyword]  -> type '[', key is the attribute name
+ *   [keyword=value], [keyword|=value], [keyword~=value]
+ *              -> type '=', '|' or '~' with the parsed value in val
+ * Throws a syntax error for anything else.
+ */
+static struct condition *parse_condition(struct lexbuf *buf)
+{
+	struct condition *cond;
+	int op;
+
+	if (accept(buf, ':'))
+	{
+		if (buf->lookahead != CSS_KEYWORD)
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected keyword after ':'");
+		cond = new_condition(':', "pseudo", buf->string);
+		next(buf);
+		return cond;
+	}
+
+	if (accept(buf, '.'))
+	{
+		if (buf->lookahead != CSS_KEYWORD)
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected keyword after '.'");
+		cond = new_condition('.', "class", buf->string);
+		next(buf);
+		return cond;
+	}
+
+	if (accept(buf, '#'))
+	{
+		if (buf->lookahead != CSS_KEYWORD)
+			fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected keyword after '#'");
+		cond = new_condition('#', "id", buf->string);
+		next(buf);
+		return cond;
+	}
+
+	if (!accept(buf, '['))
+		fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected condition");
+
+	if (buf->lookahead != CSS_KEYWORD)
+		fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected keyword after '['");
+
+	cond = new_condition('[', buf->string, NULL);
+	next(buf);
+
+	/* optional match operator: '=', '|=' or '~=' */
+	op = 0;
+	if (accept(buf, '='))
+	{
+		op = '=';
+	}
+	else if (accept(buf, '|'))
+	{
+		expect(buf, '=');
+		op = '|';
+	}
+	else if (accept(buf, '~'))
+	{
+		expect(buf, '=');
+		op = '~';
+	}
+
+	if (op)
+	{
+		cond->type = op;
+		cond->val = parse_attrib_value(buf);
+	}
+
+	expect(buf, ']');
+
+	return cond;
+}
+
+/*
+ * Parse one or more consecutive conditions. Each new condition is linked in
+ * front of the earlier ones, so the returned list is in reverse source order.
+ */
+static struct condition *parse_condition_list(struct lexbuf *buf)
+{
+	struct condition *head = parse_condition(buf);
+
+	while (iscond(buf->lookahead))
+	{
+		struct condition *item = parse_condition(buf);
+		item->next = head;
+		head = item;
+	}
+
+	return head;
+}
+
+/*
+ * Parse a simple selector: '*' or an element name, each optionally followed
+ * by conditions, or a bare list of conditions. '*' and the bare-condition
+ * form create the selector with a NULL name. Throws if none of these match.
+ */
+static struct selector *parse_simple_selector(struct lexbuf *buf)
+{
+	struct selector *sel;
+
+	if (accept(buf, '*'))
+	{
+		sel = new_selector(NULL);
+	}
+	else if (buf->lookahead == CSS_KEYWORD)
+	{
+		sel = new_selector(buf->string);
+		next(buf);
+	}
+	else if (iscond(buf->lookahead))
+	{
+		sel = new_selector(NULL);
+	}
+	else
+	{
+		fz_throw(buf->ctx, FZ_ERROR_GENERIC, "syntax error: expected selector");
+	}
+
+	if (iscond(buf->lookahead))
+		sel->cond = parse_condition_list(buf);
+
+	return sel;
+}
+
+/*
+ * Parse a chain of simple selectors joined by the adjacent-sibling
+ * combinator '+'. For "a + b" a new unnamed selector is returned with
+ * combine set to '+', left = a and right = the rest of the chain (the
+ * chain nests to the right). Without a '+' the simple selector itself is
+ * returned.
+ */
+static struct selector *parse_adjacent_selector(struct lexbuf *buf)
+{
+	struct selector *s, *a, *b;
+
+	a = parse_simple_selector(buf);
+	if (accept(buf, '+'))
+	{
+		b = parse_adjacent_selector(buf);
+		s = new_selector(NULL);
+		/* record '+', not '>': otherwise adjacent-sibling selectors
+		 * are indistinguishable from child selectors */
+		s->combine = '+';
+		s->left = a;
+		s->right = b;
+		return s;
+	}
+	return a;
+}
+
+/*
+ * Parse a chain of adjacent-selectors joined by the child combinator '>'.
+ * For "a > b" a new unnamed selector is returned with combine '>',
+ * left = a and right = the rest of the chain; otherwise the single
+ * adjacent-selector is returned unchanged.
+ */
+static struct selector *parse_child_selector(struct lexbuf *buf)
+{
+	struct selector *lhs, *rhs, *node;
+
+	lhs = parse_adjacent_selector(buf);
+	if (!accept(buf, '>'))
+		return lhs;
+
+	rhs = parse_child_selector(buf);
+	node = new_selector(NULL);
+	node->combine = '>';
+	node->left = lhs;
+	node->right = rhs;
+	return node;
+}
+
+/*
+ * Parse a chain of child-selectors separated only by whitespace (the
+ * descendant combinator). Anything other than ',', '{' or end of input
+ * after a child-selector is taken as the start of another one; the pair is
+ * joined by a new unnamed selector with combine ' '.
+ */
+static struct selector *parse_descendant_selector(struct lexbuf *buf)
+{
+	struct selector *lhs, *rhs, *node;
+
+	lhs = parse_child_selector(buf);
+	if (buf->lookahead == ',' || buf->lookahead == '{' || buf->lookahead == EOF)
+		return lhs;
+
+	rhs = parse_descendant_selector(buf);
+	node = new_selector(NULL);
+	node->combine = ' ';
+	node->left = lhs;
+	node->right = rhs;
+	return node;
+}
+
+/*
+ * Parse a ','-separated list of selectors. Each new selector is linked in
+ * front of the earlier ones, so the result is in reverse source order.
+ */
+static struct selector *parse_selector_list(struct lexbuf *buf)
+{
+	struct selector *head = parse_descendant_selector(buf);
+
+	while (accept(buf, ','))
+	{
+		struct selector *item = parse_descendant_selector(buf);
+		item->next = head;
+		head = item;
+	}
+
+	return head;
+}
+
+/*
+ * Parse one rule: a selector list followed by a '{ ... }' declaration
+ * block, and wrap both in a new rule.
+ */
+static struct rule *parse_rule(struct lexbuf *buf)
+{
+	struct selector *selectors;
+	struct property *declarations;
+
+	selectors = parse_selector_list(buf);
+
+	expect(buf, '{');
+	declarations = parse_declaration_list(buf);
+	expect(buf, '}');
+
+	return new_rule(selectors, declarations);
+}
+
+/*
+ * Parse the rules inside an @media block up to (not including) the closing
+ * '}' or end of input. The parsed rules are currently discarded without
+ * being freed.
+ */
+static void parse_media_list(struct lexbuf *buf)
+{
+	struct rule *r;
+
+	for (;;)
+	{
+		if (buf->lookahead == '}' || buf->lookahead == EOF)
+			break;
+
+		r = parse_rule(buf);
+		// TODO: free_rule(r);
+	}
+}
+
+/*
+ * Parse an at-rule after the '@' has been consumed. Three shapes are
+ * handled:
+ *   @keyword { declarations }           (e.g. @page)
+ *   @keyword values { rules }           (e.g. @media)
+ *   @keyword values ;                   (e.g. @import)
+ * Everything parsed is currently discarded without being freed.
+ */
+static void parse_at_rule(struct lexbuf *buf)
+{
+	struct property *p;
+	struct value *v;
+
+	expect(buf, CSS_KEYWORD);
+
+	if (accept(buf, '{')) /* @page */
+	{
+		p = parse_declaration_list(buf);
+		// TODO: free_properties(p);
+		expect(buf, '}');
+		return;
+	}
+
+	v = parse_value_list(buf);
+	// TODO: free_value_list(v);
+
+	if (!accept(buf, '{')) /* @import */
+	{
+		expect(buf, ';');
+		return;
+	}
+
+	/* @media */
+	parse_media_list(buf);
+	expect(buf, '}');
+}
+
+/*
+ * Parse a whole stylesheet. At-rules are parsed and dropped; every ordinary
+ * rule is linked in front of chain. Returns the new head of the chain
+ * (chain itself if no rules were found).
+ */
+static struct rule *parse_stylesheet(struct lexbuf *buf, struct rule *chain)
+{
+	struct rule *head = chain;
+
+	while (buf->lookahead != EOF)
+	{
+		struct rule *r;
+
+		if (accept(buf, '@'))
+		{
+			parse_at_rule(buf);
+			continue;
+		}
+
+		r = parse_rule(buf);
+		r->next = head;
+		head = r;
+	}
+
+	return head;
+}
+
+/*
+ * Parse the CSS text in source and return the rule chain with the newly
+ * parsed rules linked in front of chain. Syntax errors are reported by
+ * throwing through ctx.
+ */
+struct rule *fz_parse_css(fz_context *ctx, struct rule *chain, const char *source)
+{
+	struct lexbuf buf;
+	struct rule *rules;
+
+	css_lex_init(ctx, &buf, source);
+	next(&buf); /* prime the lookahead token */
+	rules = parse_stylesheet(&buf, chain);
+	return rules;
+}