summaryrefslogtreecommitdiff
path: root/source
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2015-05-06 15:03:33 +0200
committerTor Andersson <tor.andersson@artifex.com>2015-05-06 15:06:36 +0200
commit9fe8e91556693c90cdb83932f130ceefc518539b (patch)
tree10a33e38c6e2cb6cab80dd0d79a1fa5cbc8ec1a9 /source
parenta3422a76d10fb82edb5a1a50edd9024280c31ef6 (diff)
downloadmupdf-9fe8e91556693c90cdb83932f130ceefc518539b.tar.xz
epub: Rejig CSS parser -- whitespace is significant!
The selectors ".foo.bar" and ".foo .bar" are not equivalent! They should parse as "*.foo.bar" and "*.foo *.bar" but we parsed both as "*.foo.bar" due to us ignoring significant whitespace.
Diffstat (limited to 'source')
-rw-r--r--source/html/css-parse.c355
1 files changed, 206 insertions, 149 deletions
diff --git a/source/html/css-parse.c b/source/html/css-parse.c
index 1090d515..59d305d4 100644
--- a/source/html/css-parse.c
+++ b/source/html/css-parse.c
@@ -12,6 +12,9 @@ struct lexbuf
char string[1024];
};
+static fz_css_value *parse_expr(struct lexbuf *buf);
+static fz_css_selector *parse_selector(struct lexbuf *buf);
+
FZ_NORETURN static void fz_css_error(struct lexbuf *buf, const char *msg)
{
fz_throw(buf->ctx, FZ_ERROR_SYNTAX, "css syntax error: %s (%s:%d)", msg, buf->file, buf->line);
@@ -59,6 +62,16 @@ static fz_css_property *fz_new_css_property(fz_context *ctx, const char *name, f
return prop;
}
+static fz_css_value *fz_new_css_value_x(fz_context *ctx, int type)
+{
+ fz_css_value *val = fz_malloc_struct(ctx, fz_css_value);
+ val->type = type;
+ val->data = NULL;
+ val->args = NULL;
+ val->next = NULL;
+ return val;
+}
+
static fz_css_value *fz_new_css_value(fz_context *ctx, int type, const char *data)
{
fz_css_value *val = fz_malloc_struct(ctx, fz_css_value);
@@ -330,148 +343,132 @@ static int css_lex(struct lexbuf *buf)
buf->string_len = 0;
- while (buf->c)
- {
restart:
+ if (buf->c == 0)
+ return EOF;
+
+ if (iswhite(buf->c))
+ {
while (iswhite(buf->c))
css_lex_next(buf);
+ return ' ';
+ }
- if (buf->c == 0)
- break;
-
- if (css_lex_accept(buf, '/'))
+ if (css_lex_accept(buf, '/'))
+ {
+ if (css_lex_accept(buf, '*'))
{
- if (css_lex_accept(buf, '*'))
+ while (buf->c)
{
- while (buf->c)
+ if (css_lex_accept(buf, '*'))
{
- if (css_lex_accept(buf, '*'))
- {
- while (buf->c == '*')
- css_lex_next(buf);
- if (css_lex_accept(buf, '/'))
- goto restart;
- }
- css_lex_next(buf);
+ while (buf->c == '*')
+ css_lex_next(buf);
+ if (css_lex_accept(buf, '/'))
+ goto restart;
}
- fz_css_error(buf, "unterminated comment");
+ css_lex_next(buf);
}
- return '/';
+ fz_css_error(buf, "unterminated comment");
}
+ return '/';
+ }
- if (css_lex_accept(buf, '<'))
+ if (css_lex_accept(buf, '<'))
+ {
+ if (css_lex_accept(buf, '!'))
{
- if (css_lex_accept(buf, '!'))
- {
- css_lex_expect(buf, '-');
- css_lex_expect(buf, '-');
- continue; /* ignore CDO */
- }
- return '<';
+ css_lex_expect(buf, '-');
+ css_lex_expect(buf, '-');
+ goto restart; /* ignore CDO */
}
+ return '<';
+ }
+ if (css_lex_accept(buf, '-'))
+ {
if (css_lex_accept(buf, '-'))
{
- if (css_lex_accept(buf, '-'))
- {
- css_lex_expect(buf, '>');
- continue; /* ignore CDC */
- }
- if (css_lex_accept(buf, '.'))
- {
- css_push_char(buf, '-');
- css_push_char(buf, '.');
- if (buf->c >= '0' && buf->c <= '9')
- return css_lex_number(buf);
- return css_lex_keyword(buf);
- }
- if (buf->c >= '0' && buf->c <= '9')
- {
- css_push_char(buf, '-');
- return css_lex_number(buf);
- }
- if (isnmstart(buf->c))
- {
- css_push_char(buf, '-');
- return css_lex_keyword(buf);
- }
- return '-';
+ css_lex_expect(buf, '>');
+ goto restart; /* ignore CDC */
}
-
- if (css_lex_accept(buf, '+'))
+ if (isnmstart(buf->c))
{
- if (buf->c >= '0' && buf->c <= '9')
- return css_lex_number(buf);
- return '+';
+ css_push_char(buf, '-');
+ return css_lex_keyword(buf);
}
+ return '-';
+ }
- if (css_lex_accept(buf, '.'))
+ if (css_lex_accept(buf, '.'))
+ {
+ if (buf->c >= '0' && buf->c <= '9')
{
- if (buf->c >= '0' && buf->c <= '9')
- {
- css_push_char(buf, '.');
- return css_lex_number(buf);
- }
- return '.';
+ css_push_char(buf, '.');
+ return css_lex_number(buf);
}
+ return '.';
+ }
- if (css_lex_accept(buf, '#'))
+ if (css_lex_accept(buf, '#'))
+ {
+ if (isnmchar(buf->c))
return css_lex_hash(buf);
+ return '#';
+ }
- if (css_lex_accept(buf, '"'))
- return css_lex_string(buf, '"');
- if (css_lex_accept(buf, '\''))
- return css_lex_string(buf, '\'');
+ if (css_lex_accept(buf, '"'))
+ return css_lex_string(buf, '"');
+ if (css_lex_accept(buf, '\''))
+ return css_lex_string(buf, '\'');
- if (buf->c >= '0' && buf->c <= '9')
- return css_lex_number(buf);
+ if (buf->c >= '0' && buf->c <= '9')
+ return css_lex_number(buf);
- if (css_lex_accept(buf, 'u'))
+ if (css_lex_accept(buf, 'u'))
+ {
+ if (css_lex_accept(buf, 'r'))
{
- if (css_lex_accept(buf, 'r'))
+ if (css_lex_accept(buf, 'l'))
{
- if (css_lex_accept(buf, 'l'))
+ if (css_lex_accept(buf, '('))
{
- if (css_lex_accept(buf, '('))
- {
- while (iswhite(buf->c))
- css_lex_next(buf);
- if (css_lex_accept(buf, '"'))
- css_lex_string(buf, '"');
- else if (css_lex_accept(buf, '\''))
- css_lex_string(buf, '\'');
- else
- css_lex_uri(buf);
- while (iswhite(buf->c))
- css_lex_next(buf);
- css_lex_expect(buf, ')');
- return CSS_URI;
- }
- css_push_char(buf, 'u');
- css_push_char(buf, 'r');
- css_push_char(buf, 'l');
- return css_lex_keyword(buf);
+ while (iswhite(buf->c))
+ css_lex_next(buf);
+ if (css_lex_accept(buf, '"'))
+ css_lex_string(buf, '"');
+ else if (css_lex_accept(buf, '\''))
+ css_lex_string(buf, '\'');
+ else
+ css_lex_uri(buf);
+ while (iswhite(buf->c))
+ css_lex_next(buf);
+ css_lex_expect(buf, ')');
+ return CSS_URI;
}
css_push_char(buf, 'u');
css_push_char(buf, 'r');
+ css_push_char(buf, 'l');
return css_lex_keyword(buf);
}
css_push_char(buf, 'u');
+ css_push_char(buf, 'r');
return css_lex_keyword(buf);
}
+ css_push_char(buf, 'u');
+ return css_lex_keyword(buf);
+ }
- if (isnmstart(buf->c))
- {
- css_push_char(buf, buf->c);
- css_lex_next(buf);
- return css_lex_keyword(buf);
- }
-
- t = buf->c;
+ if (isnmstart(buf->c))
+ {
+ css_push_char(buf, buf->c);
css_lex_next(buf);
- return t;
+ return css_lex_keyword(buf);
}
- return EOF;
+
+ t = buf->c;
+ css_lex_next(buf);
+ return t;
}
static void next(struct lexbuf *buf)
@@ -496,66 +493,99 @@ static void expect(struct lexbuf *buf, int t)
fz_css_error(buf, "unexpected token");
}
+static void white(struct lexbuf *buf)
+{
+ accept(buf, ' ');
+}
+
static int iscond(int t)
{
return t == ':' || t == '.' || t == '[' || t == CSS_HASH;
}
-static fz_css_value *parse_value_list(struct lexbuf *buf);
-
-static fz_css_value *parse_value(struct lexbuf *buf)
+static fz_css_value *parse_term(struct lexbuf *buf)
{
fz_css_value *v;
+ if (buf->c == '+' || buf->c == '-')
+ {
+ float sign = buf->c == '-' ? -1 : 1;
+ next(buf);
+ if (buf->lookahead != CSS_NUMBER || buf->lookahead != CSS_LENGTH || buf->lookahead != CSS_PERCENT)
+ fz_css_error(buf, "expected number");
+ if (sign < 0)
+ {
+ v = fz_new_css_value_x(buf->ctx, buf->lookahead);
+ v->data = fz_malloc(buf->ctx, strlen(buf->string) + 2);
+ v->data[0] = '-';
+ strcpy(v->data + 1, buf->string);
+ }
+ else
+ {
+ v = fz_new_css_value(buf->ctx, buf->lookahead, buf->string);
+ }
+ next(buf);
+ white(buf);
+ return v;
+ }
+
if (buf->lookahead == CSS_KEYWORD)
{
v = fz_new_css_value(buf->ctx, CSS_KEYWORD, buf->string);
next(buf);
-
if (accept(buf, '('))
{
+ white(buf);
v->type = '(';
- v->args = parse_value_list(buf);
+ v->args = parse_expr(buf);
expect(buf, ')');
}
-
+ white(buf);
return v;
}
switch (buf->lookahead)
{
case CSS_HASH:
+ case CSS_STRING:
+ case CSS_URI:
case CSS_NUMBER:
case CSS_LENGTH:
case CSS_PERCENT:
- case CSS_STRING:
- case CSS_URI:
v = fz_new_css_value(buf->ctx, buf->lookahead, buf->string);
next(buf);
+ white(buf);
return v;
}
- if (accept(buf, ','))
- return fz_new_css_value(buf->ctx, ',', ",");
- if (accept(buf, '/'))
- return fz_new_css_value(buf->ctx, '/', "/");
-
fz_css_error(buf, "expected value");
}
-static fz_css_value *parse_value_list(struct lexbuf *buf)
+static fz_css_value *parse_expr(struct lexbuf *buf)
{
fz_css_value *head, *tail;
- head = tail = NULL;
+ head = tail = parse_term(buf);
while (buf->lookahead != '}' && buf->lookahead != ';' && buf->lookahead != '!' &&
buf->lookahead != ')' && buf->lookahead != EOF)
{
- if (!head)
- head = tail = parse_value(buf);
+ if (accept(buf, ','))
+ {
+ white(buf);
+ tail = tail->next = fz_new_css_value(buf->ctx, ',', ",");
+ tail = tail->next = parse_term(buf);
+ }
+ else if (accept(buf, '/'))
+ {
+ white(buf);
+ tail = tail->next = fz_new_css_value(buf->ctx, '/', "/");
+ tail = tail->next = parse_term(buf);
+ }
else
- tail = tail->next = parse_value(buf);
+ {
+ tail = tail->next = parse_term(buf);
+ }
}
return head;
@@ -571,12 +601,16 @@ static fz_css_property *parse_declaration(struct lexbuf *buf)
next(buf);
expect(buf, ':');
+ white(buf);
- p->value = parse_value_list(buf);
+ p->value = parse_expr(buf);
/* !important */
if (accept(buf, '!'))
+ {
expect(buf, CSS_KEYWORD);
+ white(buf);
+ }
return p;
}
@@ -585,6 +619,8 @@ static fz_css_property *parse_declaration_list(struct lexbuf *buf)
{
fz_css_property *head, *tail;
+ white(buf);
+
if (buf->lookahead == '}' || buf->lookahead == EOF)
return NULL;
@@ -592,6 +628,8 @@ static fz_css_property *parse_declaration_list(struct lexbuf *buf)
while (accept(buf, ';'))
{
+ white(buf);
+
if (buf->lookahead != '}' && buf->lookahead != ';' && buf->lookahead != EOF)
{
tail = tail->next = parse_declaration(buf);
@@ -609,6 +647,7 @@ static char *parse_attrib_value(struct lexbuf *buf)
{
s = fz_strdup(buf->ctx, buf->string);
next(buf);
+ white(buf);
return s;
}
@@ -626,6 +665,13 @@ static fz_css_condition *parse_condition(struct lexbuf *buf)
fz_css_error(buf, "expected keyword after ':'");
c = fz_new_css_condition(buf->ctx, ':', "pseudo", buf->string);
next(buf);
+ if (accept(buf, '('))
+ {
+ white(buf);
+ if (accept(buf, CSS_KEYWORD))
+ white(buf);
+ expect(buf, ')');
+ }
return c;
}
@@ -640,12 +686,15 @@ static fz_css_condition *parse_condition(struct lexbuf *buf)
if (accept(buf, '['))
{
+ white(buf);
+
if (buf->lookahead != CSS_KEYWORD)
fz_css_error(buf, "expected keyword after '['");
-
c = fz_new_css_condition(buf->ctx, '[', buf->string, NULL);
next(buf);
+ white(buf);
+
if (accept(buf, '='))
{
c->type = '=';
@@ -720,31 +769,23 @@ static fz_css_selector *parse_simple_selector(struct lexbuf *buf)
fz_css_error(buf, "expected selector");
}
-static fz_css_selector *parse_adjacent_selector(struct lexbuf *buf)
+static fz_css_selector *parse_combinator(struct lexbuf *buf, fz_css_selector *a)
{
- fz_css_selector *s, *a, *b;
-
- a = parse_simple_selector(buf);
+ fz_css_selector *s, *b;
if (accept(buf, '+'))
{
- b = parse_adjacent_selector(buf);
+ white(buf);
+ b = parse_selector(buf);
s = fz_new_css_selector(buf->ctx, NULL);
s->combine = '+';
s->left = a;
s->right = b;
return s;
}
- return a;
-}
-
-static fz_css_selector *parse_child_selector(struct lexbuf *buf)
-{
- fz_css_selector *s, *a, *b;
-
- a = parse_adjacent_selector(buf);
if (accept(buf, '>'))
{
- b = parse_child_selector(buf);
+ white(buf);
+ b = parse_selector(buf);
s = fz_new_css_selector(buf->ctx, NULL);
s->combine = '>';
s->left = a;
@@ -754,36 +795,45 @@ static fz_css_selector *parse_child_selector(struct lexbuf *buf)
return a;
}
-static fz_css_selector *parse_descendant_selector(struct lexbuf *buf)
+static fz_css_selector *parse_selector(struct lexbuf *buf)
{
fz_css_selector *s, *a, *b;
- a = parse_child_selector(buf);
- if (buf->lookahead != ',' && buf->lookahead != '{' && buf->lookahead != EOF)
+ a = parse_simple_selector(buf);
+ if (accept(buf, ' '))
{
- b = parse_descendant_selector(buf);
- s = fz_new_css_selector(buf->ctx, NULL);
- s->combine = ' ';
- s->left = a;
- s->right = b;
- return s;
+ s = parse_combinator(buf, a);
+ if (s == a && buf->lookahead != ',' && buf->lookahead != '{' && buf->lookahead != EOF)
+ {
+ b = parse_selector(buf);
+ s = fz_new_css_selector(buf->ctx, NULL);
+ s->combine = ' ';
+ s->left = a;
+ s->right = b;
+ return s;
+ }
}
- return a;
+ else
+ {
+ s = parse_combinator(buf, a);
+ }
+ return s;
}
static fz_css_selector *parse_selector_list(struct lexbuf *buf)
{
fz_css_selector *head, *tail;
- head = tail = parse_descendant_selector(buf);
+ head = tail = parse_selector(buf);
while (accept(buf, ','))
{
- tail = tail->next = parse_descendant_selector(buf);
+ white(buf);
+ tail = tail->next = parse_selector(buf);
}
return head;
}
-static fz_css_rule *parse_rule(struct lexbuf *buf)
+static fz_css_rule *parse_ruleset(struct lexbuf *buf)
{
fz_css_selector *s = NULL;
fz_css_property *p = NULL;
@@ -794,6 +844,7 @@ static fz_css_rule *parse_rule(struct lexbuf *buf)
expect(buf, '{');
p = parse_declaration_list(buf);
expect(buf, '}');
+ white(buf);
}
fz_catch(buf->ctx)
{
@@ -802,7 +853,10 @@ static fz_css_rule *parse_rule(struct lexbuf *buf)
while (buf->lookahead != EOF)
{
if (accept(buf, '}'))
+ {
+ white(buf);
break;
+ }
next(buf);
}
return NULL;
@@ -854,6 +908,8 @@ static fz_css_rule *parse_stylesheet(struct lexbuf *buf, fz_css_rule *chain)
nextp = &tail;
}
+ white(buf);
+
while (buf->lookahead != EOF)
{
if (accept(buf, '@'))
@@ -862,13 +918,14 @@ static fz_css_rule *parse_stylesheet(struct lexbuf *buf, fz_css_rule *chain)
}
else
{
- fz_css_rule *x = parse_rule(buf);
+ fz_css_rule *x = parse_ruleset(buf);
if (x)
{
rule = *nextp = x;
nextp = &rule->next;
}
}
+ white(buf);
}
return chain ? chain : tail;