diff options
-rw-r--r-- | include/mupdf/fitz/xml.h | 14 |
-rw-r--r-- | source/fitz/xml.c | 140 |
-rw-r--r-- | source/xps/xps-doc.c | 4 |
-rw-r--r-- | source/xps/xps-outline.c | 2 |
-rw-r--r-- | source/xps/xps-resource.c | 2 |
5 files changed, 146 insertions, 16 deletions
diff --git a/include/mupdf/fitz/xml.h b/include/mupdf/fitz/xml.h index ce391029..85ed4a2a 100644 --- a/include/mupdf/fitz/xml.h +++ b/include/mupdf/fitz/xml.h @@ -12,8 +12,15 @@ typedef struct fz_xml_s fz_xml; /* fz_parse_xml: Parse a zero-terminated string into a tree of xml nodes. + + preserve_white: whether to keep or delete all-whitespace nodes. +*/ +fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len, int preserve_white); + +/* + fz_xml_prev: Return previous sibling of XML node. */ -fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len); +fz_xml *fz_xml_prev(fz_xml *item); /* fz_xml_next: Return next sibling of XML node. @@ -21,6 +28,11 @@ fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len); fz_xml *fz_xml_next(fz_xml *item); /* + fz_xml_up: Return parent of XML node. +*/ +fz_xml *fz_xml_up(fz_xml *item); + +/* fz_xml_down: Return first child of XML node. */ fz_xml *fz_xml_down(fz_xml *item); diff --git a/source/fitz/xml.c b/source/fitz/xml.c index 1dd3512f..d916f205 100644 --- a/source/fitz/xml.c +++ b/source/fitz/xml.c @@ -1,9 +1,74 @@ #include "mupdf/fitz.h" +static const struct { const char *ent; int ucs; } html_entities[] = { + {"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163}, + {"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167}, + {"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171}, + {"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176}, + {"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180}, + {"micro",181}, {"para",182}, {"middot",183}, {"cedil",184}, + {"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188}, + {"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192}, + {"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196}, + {"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200}, + {"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204}, + {"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208}, + {"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, 
{"Ocirc",212}, + {"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216}, + {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220}, + {"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224}, + {"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228}, + {"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232}, + {"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236}, + {"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240}, + {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244}, + {"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248}, + {"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252}, + {"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62}, + {"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339}, + {"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710}, + {"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201}, + {"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207}, + {"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217}, + {"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222}, + {"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249}, + {"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913}, + {"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917}, + {"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922}, + {"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927}, + {"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933}, + {"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945}, + {"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949}, + {"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954}, + {"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959}, + {"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964}, + {"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969}, + {"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226}, 
+ {"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254}, + {"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476}, + {"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593}, + {"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629}, + {"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659}, + {"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707}, + {"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713}, + {"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722}, + {"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734}, + {"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746}, + {"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773}, + {"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805}, + {"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838}, + {"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869}, + {"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970}, + {"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674}, + {"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830}, +}; + struct parser { fz_xml *head; fz_context *ctx; + int preserve_white; + int depth; }; struct attribute @@ -18,7 +83,7 @@ struct fz_xml_s char name[40]; char *text; struct attribute *atts; - fz_xml *up, *down, *next; + fz_xml *up, *down, *prev, *next; }; static inline void indent(int n) @@ -56,11 +121,21 @@ void fz_debug_xml(fz_xml *item, int level) } } +fz_xml *fz_xml_prev(fz_xml *item) +{ + return item->prev; +} + fz_xml *fz_xml_next(fz_xml *item) { return item->next; } +fz_xml *fz_xml_up(fz_xml *item) +{ + return item->up; +} + fz_xml *fz_xml_down(fz_xml *item) { return item->down; @@ -121,6 +196,8 @@ void fz_detach_xml(fz_xml *node) static int xml_parse_entity(int *c, char *a) { char *b; + int i; + if (a[1] == '#') { if (a[2] == 'x') *c = strtol(a + 3, &b, 16); @@ -149,6 +226,16 @@ static int xml_parse_entity(int *c, char *a) *c = '"'; return 6; } + + /* We should only 
be doing this for XHTML, but it shouldn't be a problem. */ + for (i = 0; i < nelem(html_entities); ++i) { + unsigned int n = strlen(html_entities[i].ent); + if (!memcmp(a+1, html_entities[i].ent, n) && a[1+n] == ';') { + *c = html_entities[i].ucs; + return n + 2; + } + } + *c = *a; return 1; } @@ -186,6 +273,7 @@ static void xml_emit_open_tag(struct parser *parser, char *a, char *b) head->text = NULL; head->up = parser->head; head->down = NULL; + head->prev = NULL; head->next = NULL; if (!parser->head->down) { @@ -196,9 +284,11 @@ static void xml_emit_open_tag(struct parser *parser, char *a, char *b) while (tail->next) tail = tail->next; tail->next = head; + head->prev = tail; } parser->head = head; + parser->depth++; } static void xml_emit_att_name(struct parser *parser, char *a, char *b) @@ -239,6 +329,7 @@ static void xml_emit_att_value(struct parser *parser, char *a, char *b) static void xml_emit_close_tag(struct parser *parser) { + parser->depth--; if (parser->head->up) parser->head = parser->head->up; } @@ -250,13 +341,20 @@ static void xml_emit_text(struct parser *parser, char *a, char *b) char *s; int c; - /* Skip all-whitespace text nodes */ - for (s = a; s < b; s++) - if (!iswhite(*s)) - break; - if (s == b) + /* Skip text outside the root tag */ + if (parser->depth == 0) return; + /* Skip all-whitespace text nodes */ + if (!parser->preserve_white) + { + for (s = a; s < b; s++) + if (!iswhite(*s)) + break; + if (s == b) + return; + } + xml_emit_open_tag(parser, empty, empty); head = parser->head; @@ -276,6 +374,23 @@ static void xml_emit_text(struct parser *parser, char *a, char *b) xml_emit_close_tag(parser); } +static void xml_emit_cdata(struct parser *parser, char *a, char *b) +{ + static char *empty = ""; + fz_xml *head; + char *s; + + xml_emit_open_tag(parser, empty, empty); + head = parser->head; + + s = head->text = fz_malloc(parser->ctx, b - a + 1); + while (a < b) + *s++ = *a++; + *s = 0; + + xml_emit_close_tag(parser); +} + static char 
*xml_parse_document_imp(struct parser *x, char *p) { char *mark; @@ -302,7 +417,6 @@ parse_comment: if (*p == 'D' && !memcmp(p, "DOCTYPE", 7)) goto parse_declaration; if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; if (*p++ != '-') return "syntax error in comment (<!- not followed by -)"; - /* mark = p; */ while (*p) { if (p[0] == '-' && p[1] == '-' && p[2] == '>') { p += 3; @@ -320,9 +434,10 @@ parse_cdata: if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') return "syntax error in CDATA section"; p += 7; - /* mark = p; */ + mark = p; while (*p) { if (p[0] == ']' && p[1] == ']' && p[2] == '>') { + xml_emit_cdata(x, mark, p); p += 3; goto parse_text; } @@ -342,7 +457,6 @@ parse_processing_instruction: parse_closing_element: while (iswhite(*p)) ++p; - /* mark = p; */ while (isname(*p)) ++p; while (iswhite(*p)) ++p; if (*p != '>') @@ -440,10 +554,10 @@ static char *convert_to_utf8(fz_context *doc, unsigned char *s, int n, int *dofr } fz_xml * -fz_parse_xml(fz_context *ctx, unsigned char *s, int n) +fz_parse_xml(fz_context *ctx, unsigned char *s, int n, int preserve_white) { struct parser parser; - fz_xml root; + fz_xml root, *node; char *p, *error; int dofree; @@ -452,6 +566,8 @@ fz_parse_xml(fz_context *ctx, unsigned char *s, int n) memset(&root, 0, sizeof(root)); parser.head = &root; parser.ctx = ctx; + parser.preserve_white = preserve_white; + parser.depth = 0; p = convert_to_utf8(ctx, s, n, &dofree); @@ -472,5 +588,7 @@ fz_parse_xml(fz_context *ctx, unsigned char *s, int n) fz_rethrow(ctx); } + for (node = root.down; node; node = node->next) + node->up = NULL; return root.down; } diff --git a/source/xps/xps-doc.c b/source/xps/xps-doc.c index ee3d9736..e2058fbc 100644 --- a/source/xps/xps-doc.c +++ b/source/xps/xps-doc.c @@ -360,7 +360,7 @@ xps_parse_metadata(xps_document *doc, xps_part *part, xps_fixdoc *fixdoc) doc->base_uri = buf; doc->part_uri = part->name; - root = fz_parse_xml(doc->ctx, 
part->data, part->size); + root = fz_parse_xml(doc->ctx, part->data, part->size, 0); xps_parse_metadata_imp(doc, root, fixdoc); fz_free_xml(doc->ctx, root); @@ -439,7 +439,7 @@ xps_load_fixed_page(xps_document *doc, xps_page *page) part = xps_read_part(doc, page->name); fz_try(ctx) { - root = fz_parse_xml(doc->ctx, part->data, part->size); + root = fz_parse_xml(doc->ctx, part->data, part->size, 0); } fz_always(ctx) { diff --git a/source/xps/xps-outline.c b/source/xps/xps-outline.c index d8b573a9..3a10ff41 100644 --- a/source/xps/xps-outline.c +++ b/source/xps/xps-outline.c @@ -86,7 +86,7 @@ xps_load_document_structure(xps_document *doc, xps_fixdoc *fixdoc) part = xps_read_part(doc, fixdoc->outline); fz_try(doc->ctx) { - root = fz_parse_xml(doc->ctx, part->data, part->size); + root = fz_parse_xml(doc->ctx, part->data, part->size, 0); } fz_always(doc->ctx) { diff --git a/source/xps/xps-resource.c b/source/xps/xps-resource.c index ef699619..85fd7c49 100644 --- a/source/xps/xps-resource.c +++ b/source/xps/xps-resource.c @@ -67,7 +67,7 @@ xps_parse_remote_resource_dictionary(xps_document *doc, char *base_uri, char *so part = xps_read_part(doc, part_name); fz_try(ctx) { - xml = fz_parse_xml(doc->ctx, part->data, part->size); + xml = fz_parse_xml(doc->ctx, part->data, part->size, 0); } fz_always(ctx) { |