summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Tor Andersson <tor.andersson@artifex.com> 2014-09-02 20:34:48 +0200
committer: Tor Andersson <tor.andersson@artifex.com> 2014-09-17 12:06:07 +0200
commit: 13e335fd06789a42369d03b2cad65818014f3f79 (patch)
tree: 3ad0982c256b58e675b73e66427d4d2a6c44d16d
parent: 758a9a75fd34bd8312b07f1be72aeee63522d2a0 (diff)
download: mupdf-13e335fd06789a42369d03b2cad65818014f3f79.tar.xz
Improve XML parser.
Add a whitespace preserving mode, for future use with XHTML. Also parse XHTML entities. This is not strictly according to spec, but for properly formed XML documents it should not matter.
-rw-r--r-- include/mupdf/fitz/xml.h | 14
-rw-r--r-- source/fitz/xml.c | 140
-rw-r--r-- source/xps/xps-doc.c | 4
-rw-r--r-- source/xps/xps-outline.c | 2
-rw-r--r-- source/xps/xps-resource.c | 2
5 files changed, 146 insertions, 16 deletions
diff --git a/include/mupdf/fitz/xml.h b/include/mupdf/fitz/xml.h
index ce391029..85ed4a2a 100644
--- a/include/mupdf/fitz/xml.h
+++ b/include/mupdf/fitz/xml.h
@@ -12,8 +12,15 @@ typedef struct fz_xml_s fz_xml;
/*
fz_parse_xml: Parse a zero-terminated string into a tree of xml nodes.
+
+ preserve_white: whether to keep or delete all-whitespace nodes.
+*/
+fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len, int preserve_white);
+
+/*
+ fz_xml_prev: Return previous sibling of XML node.
*/
-fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len);
+fz_xml *fz_xml_prev(fz_xml *item);
/*
fz_xml_next: Return next sibling of XML node.
@@ -21,6 +28,11 @@ fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len);
fz_xml *fz_xml_next(fz_xml *item);
/*
+ fz_xml_up: Return parent of XML node.
+*/
+fz_xml *fz_xml_up(fz_xml *item);
+
+/*
fz_xml_down: Return first child of XML node.
*/
fz_xml *fz_xml_down(fz_xml *item);
diff --git a/source/fitz/xml.c b/source/fitz/xml.c
index 1dd3512f..d916f205 100644
--- a/source/fitz/xml.c
+++ b/source/fitz/xml.c
@@ -1,9 +1,74 @@
#include "mupdf/fitz.h"
+static const struct { const char *ent; int ucs; } html_entities[] = {
+ {"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
+ {"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
+ {"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
+ {"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
+ {"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
+ {"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
+ {"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
+ {"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
+ {"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
+ {"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
+ {"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
+ {"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
+ {"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
+ {"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
+ {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
+ {"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
+ {"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
+ {"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
+ {"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
+ {"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
+ {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
+ {"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
+ {"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
+ {"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
+ {"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
+ {"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
+ {"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
+ {"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
+ {"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
+ {"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
+ {"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
+ {"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
+ {"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
+ {"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
+ {"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
+ {"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
+ {"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
+ {"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
+ {"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
+ {"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
+ {"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
+ {"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
+ {"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
+ {"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
+ {"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
+ {"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
+ {"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
+ {"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
+ {"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
+ {"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
+ {"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
+ {"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
+ {"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
+ {"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
+ {"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
+ {"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
+ {"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
+ {"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
+ {"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
+ {"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
+};
+
struct parser
{
fz_xml *head;
fz_context *ctx;
+ int preserve_white;
+ int depth;
};
struct attribute
@@ -18,7 +83,7 @@ struct fz_xml_s
char name[40];
char *text;
struct attribute *atts;
- fz_xml *up, *down, *next;
+ fz_xml *up, *down, *prev, *next;
};
static inline void indent(int n)
@@ -56,11 +121,21 @@ void fz_debug_xml(fz_xml *item, int level)
}
}
+fz_xml *fz_xml_prev(fz_xml *item)
+{
+ return item->prev;
+}
+
fz_xml *fz_xml_next(fz_xml *item)
{
return item->next;
}
+fz_xml *fz_xml_up(fz_xml *item)
+{
+ return item->up;
+}
+
fz_xml *fz_xml_down(fz_xml *item)
{
return item->down;
@@ -121,6 +196,8 @@ void fz_detach_xml(fz_xml *node)
static int xml_parse_entity(int *c, char *a)
{
char *b;
+ int i;
+
if (a[1] == '#') {
if (a[2] == 'x')
*c = strtol(a + 3, &b, 16);
@@ -149,6 +226,16 @@ static int xml_parse_entity(int *c, char *a)
*c = '"';
return 6;
}
+
+ /* We should only be doing this for XHTML, but it shouldn't be a problem. */
+ for (i = 0; i < nelem(html_entities); ++i) {
+ unsigned int n = strlen(html_entities[i].ent);
+ if (!memcmp(a+1, html_entities[i].ent, n) && a[1+n] == ';') {
+ *c = html_entities[i].ucs;
+ return n + 2;
+ }
+ }
+
*c = *a;
return 1;
}
@@ -186,6 +273,7 @@ static void xml_emit_open_tag(struct parser *parser, char *a, char *b)
head->text = NULL;
head->up = parser->head;
head->down = NULL;
+ head->prev = NULL;
head->next = NULL;
if (!parser->head->down) {
@@ -196,9 +284,11 @@ static void xml_emit_open_tag(struct parser *parser, char *a, char *b)
while (tail->next)
tail = tail->next;
tail->next = head;
+ head->prev = tail;
}
parser->head = head;
+ parser->depth++;
}
static void xml_emit_att_name(struct parser *parser, char *a, char *b)
@@ -239,6 +329,7 @@ static void xml_emit_att_value(struct parser *parser, char *a, char *b)
static void xml_emit_close_tag(struct parser *parser)
{
+ parser->depth--;
if (parser->head->up)
parser->head = parser->head->up;
}
@@ -250,13 +341,20 @@ static void xml_emit_text(struct parser *parser, char *a, char *b)
char *s;
int c;
- /* Skip all-whitespace text nodes */
- for (s = a; s < b; s++)
- if (!iswhite(*s))
- break;
- if (s == b)
+ /* Skip text outside the root tag */
+ if (parser->depth == 0)
return;
+ /* Skip all-whitespace text nodes */
+ if (!parser->preserve_white)
+ {
+ for (s = a; s < b; s++)
+ if (!iswhite(*s))
+ break;
+ if (s == b)
+ return;
+ }
+
xml_emit_open_tag(parser, empty, empty);
head = parser->head;
@@ -276,6 +374,23 @@ static void xml_emit_text(struct parser *parser, char *a, char *b)
xml_emit_close_tag(parser);
}
+static void xml_emit_cdata(struct parser *parser, char *a, char *b)
+{
+ static char *empty = "";
+ fz_xml *head;
+ char *s;
+
+ xml_emit_open_tag(parser, empty, empty);
+ head = parser->head;
+
+ s = head->text = fz_malloc(parser->ctx, b - a + 1);
+ while (a < b)
+ *s++ = *a++;
+ *s = 0;
+
+ xml_emit_close_tag(parser);
+}
+
static char *xml_parse_document_imp(struct parser *x, char *p)
{
char *mark;
@@ -302,7 +417,6 @@ parse_comment:
if (*p == 'D' && !memcmp(p, "DOCTYPE", 7)) goto parse_declaration;
if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
- /* mark = p; */
while (*p) {
if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
p += 3;
@@ -320,9 +434,10 @@ parse_cdata:
if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
return "syntax error in CDATA section";
p += 7;
- /* mark = p; */
+ mark = p;
while (*p) {
if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
+ xml_emit_cdata(x, mark, p);
p += 3;
goto parse_text;
}
@@ -342,7 +457,6 @@ parse_processing_instruction:
parse_closing_element:
while (iswhite(*p)) ++p;
- /* mark = p; */
while (isname(*p)) ++p;
while (iswhite(*p)) ++p;
if (*p != '>')
@@ -440,10 +554,10 @@ static char *convert_to_utf8(fz_context *doc, unsigned char *s, int n, int *dofr
}
fz_xml *
-fz_parse_xml(fz_context *ctx, unsigned char *s, int n)
+fz_parse_xml(fz_context *ctx, unsigned char *s, int n, int preserve_white)
{
struct parser parser;
- fz_xml root;
+ fz_xml root, *node;
char *p, *error;
int dofree;
@@ -452,6 +566,8 @@ fz_parse_xml(fz_context *ctx, unsigned char *s, int n)
memset(&root, 0, sizeof(root));
parser.head = &root;
parser.ctx = ctx;
+ parser.preserve_white = preserve_white;
+ parser.depth = 0;
p = convert_to_utf8(ctx, s, n, &dofree);
@@ -472,5 +588,7 @@ fz_parse_xml(fz_context *ctx, unsigned char *s, int n)
fz_rethrow(ctx);
}
+ for (node = root.down; node; node = node->next)
+ node->up = NULL;
return root.down;
}
diff --git a/source/xps/xps-doc.c b/source/xps/xps-doc.c
index ee3d9736..e2058fbc 100644
--- a/source/xps/xps-doc.c
+++ b/source/xps/xps-doc.c
@@ -360,7 +360,7 @@ xps_parse_metadata(xps_document *doc, xps_part *part, xps_fixdoc *fixdoc)
doc->base_uri = buf;
doc->part_uri = part->name;
- root = fz_parse_xml(doc->ctx, part->data, part->size);
+ root = fz_parse_xml(doc->ctx, part->data, part->size, 0);
xps_parse_metadata_imp(doc, root, fixdoc);
fz_free_xml(doc->ctx, root);
@@ -439,7 +439,7 @@ xps_load_fixed_page(xps_document *doc, xps_page *page)
part = xps_read_part(doc, page->name);
fz_try(ctx)
{
- root = fz_parse_xml(doc->ctx, part->data, part->size);
+ root = fz_parse_xml(doc->ctx, part->data, part->size, 0);
}
fz_always(ctx)
{
diff --git a/source/xps/xps-outline.c b/source/xps/xps-outline.c
index d8b573a9..3a10ff41 100644
--- a/source/xps/xps-outline.c
+++ b/source/xps/xps-outline.c
@@ -86,7 +86,7 @@ xps_load_document_structure(xps_document *doc, xps_fixdoc *fixdoc)
part = xps_read_part(doc, fixdoc->outline);
fz_try(doc->ctx)
{
- root = fz_parse_xml(doc->ctx, part->data, part->size);
+ root = fz_parse_xml(doc->ctx, part->data, part->size, 0);
}
fz_always(doc->ctx)
{
diff --git a/source/xps/xps-resource.c b/source/xps/xps-resource.c
index ef699619..85fd7c49 100644
--- a/source/xps/xps-resource.c
+++ b/source/xps/xps-resource.c
@@ -67,7 +67,7 @@ xps_parse_remote_resource_dictionary(xps_document *doc, char *base_uri, char *so
part = xps_read_part(doc, part_name);
fz_try(ctx)
{
- xml = fz_parse_xml(doc->ctx, part->data, part->size);
+ xml = fz_parse_xml(doc->ctx, part->data, part->size, 0);
}
fz_always(ctx)
{