From a980d3d030b6cad801110a36ff9b09d5ec38b7d1 Mon Sep 17 00:00:00 2001 From: Tor Andersson Date: Sat, 24 Nov 2012 15:42:10 +0100 Subject: Move XML parser into fitz directory. --- fitz/base_xml.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++++++ fitz/fitz.h | 53 ++++++ win32/libmupdf.vcproj | 8 +- xps/muxps.h | 16 -- xps/xps_xml.c | 439 -------------------------------------------------- 5 files changed, 496 insertions(+), 459 deletions(-) create mode 100644 fitz/base_xml.c delete mode 100644 xps/xps_xml.c diff --git a/fitz/base_xml.c b/fitz/base_xml.c new file mode 100644 index 00000000..ac63659a --- /dev/null +++ b/fitz/base_xml.c @@ -0,0 +1,439 @@ +#include "fitz.h" + +struct parser +{ + fz_xml *head; + fz_context *ctx; +}; + +struct attribute +{ + char name[40]; + char *value; + struct attribute *next; +}; + +struct fz_xml_s +{ + char name[40]; + char *text; + struct attribute *atts; + fz_xml *up, *down, *next; +}; + +static inline void indent(int n) +{ + while (n--) putchar(' '); +} + +void fz_debug_xml(fz_xml *item, int level) +{ + while (item) { + if (item->text) { + printf("%s\n", item->text); + } else { + struct attribute *att; + indent(level); + printf("<%s", item->name); + for (att = item->atts; att; att = att->next) + printf(" %s=\"%s\"", att->name, att->value); + if (item->down) { + printf(">\n"); + fz_debug_xml(item->down, level + 1); + indent(level); + printf("</%s>\n", item->name); + } + else { + printf("/>\n"); + } + item = item->next; + } + } +} + +fz_xml *fz_xml_next(fz_xml *item) +{ + return item->next; +} + +fz_xml *fz_xml_down(fz_xml *item) +{ + return item->down; +} + +char *fz_xml_text(fz_xml *item) +{ + return item->text; +} + +char *fz_xml_tag(fz_xml *item) +{ + return item->name; +} + +char *fz_xml_att(fz_xml *item, const char *name) +{ + struct attribute *att; + for (att = item->atts; att; att = att->next) + if (!strcmp(att->name, name)) + return att->value; + return NULL; +} + +static void xml_free_attribute(fz_context *ctx, struct 
attribute *att) +{ + while (att) { + struct attribute *next = att->next; + if (att->value) + fz_free(ctx, att->value); + fz_free(ctx, att); + att = next; + } +} + +void fz_free_xml(fz_context *ctx, fz_xml *item) +{ + while (item) { + fz_xml *next = item->next; + if (item->text) + fz_free(ctx, item->text); + if (item->atts) + xml_free_attribute(ctx, item->atts); + if (item->down) + fz_free_xml(ctx, item->down); + fz_free(ctx, item); + item = next; + } +} + +void fz_detach_xml(fz_xml *node) +{ + if (node->up) + node->up->down = NULL; +} + +static int xml_parse_entity(int *c, char *a) +{ + char *b; + if (a[1] == '#') { + if (a[2] == 'x') + *c = strtol(a + 3, &b, 16); + else + *c = strtol(a + 2, &b, 10); + if (*b == ';') + return b - a + 1; + } + else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { + *c = '<'; + return 4; + } + else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { + *c = '>'; + return 4; + } + else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { + *c = '&'; + return 5; + } + else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { + *c = '\''; + return 6; + } + else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { + *c = '"'; + return 6; + } + *c = *a++; + return 1; +} + +static inline int isname(int c) +{ + return c == '.' 
|| c == '-' || c == '_' || c == ':' || + (c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z'); +} + +static inline int iswhite(int c) +{ + return c == ' ' || c == '\r' || c == '\n' || c == '\t'; +} + +static void xml_emit_open_tag(struct parser *parser, char *a, char *b) +{ + fz_xml *head, *tail; + + head = fz_malloc_struct(parser->ctx, fz_xml); + if (b - a > sizeof(head->name) - 1) + b = a + sizeof(head->name) - 1; + memcpy(head->name, a, b - a); + head->name[b - a] = 0; + + head->atts = NULL; + head->text = NULL; + head->up = parser->head; + head->down = NULL; + head->next = NULL; + + if (!parser->head->down) { + parser->head->down = head; + } + else { + tail = parser->head->down; + while (tail->next) + tail = tail->next; + tail->next = head; + } + + parser->head = head; +} + +static void xml_emit_att_name(struct parser *parser, char *a, char *b) +{ + fz_xml *head = parser->head; + struct attribute *att; + + att = fz_malloc_struct(parser->ctx, struct attribute); + if (b - a > sizeof(att->name) - 1) + b = a + sizeof(att->name) - 1; + memcpy(att->name, a, b - a); + att->name[b - a] = 0; + att->value = NULL; + att->next = head->atts; + head->atts = att; +} + +static void xml_emit_att_value(struct parser *parser, char *a, char *b) +{ + fz_xml *head = parser->head; + struct attribute *att = head->atts; + char *s; + int c; + + /* entities are all longer than UTFmax so runetochar is safe */ + s = att->value = fz_malloc(parser->ctx, b - a + 1); + while (a < b) { + if (*a == '&') { + a += xml_parse_entity(&c, a); + s += fz_runetochar(s, c); + } + else { + *s++ = *a++; + } + } + *s = 0; +} + +static void xml_emit_close_tag(struct parser *parser) +{ + if (parser->head->up) + parser->head = parser->head->up; +} + +static void xml_emit_text(struct parser *parser, char *a, char *b) +{ + static char *empty = ""; + fz_xml *head; + char *s; + int c; + + /* Skip all-whitespace text nodes */ + for (s = a; s < b; s++) + if (!iswhite(*s)) + break; + if (s == 
b) + return; + + xml_emit_open_tag(parser, empty, empty); + head = parser->head; + + /* entities are all longer than UTFmax so runetochar is safe */ + s = head->text = fz_malloc(parser->ctx, b - a + 1); + while (a < b) { + if (*a == '&') { + a += xml_parse_entity(&c, a); + s += fz_runetochar(s, c); + } + else { + *s++ = *a++; + } + } + *s = 0; + + xml_emit_close_tag(parser); +} + +static char *xml_parse_document_imp(struct parser *x, char *p) +{ + char *mark; + int quote; + +parse_text: + mark = p; + while (*p && *p != '<') ++p; + xml_emit_text(x, mark, p); + if (*p == '<') { ++p; goto parse_element; } + return NULL; + +parse_element: + if (*p == '/') { ++p; goto parse_closing_element; } + if (*p == '!') { ++p; goto parse_comment; } + if (*p == '?') { ++p; goto parse_processing_instruction; } + while (iswhite(*p)) ++p; + if (isname(*p)) + goto parse_element_name; + return "syntax error in element"; + +parse_comment: + if (*p == '[') goto parse_cdata; + if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; + if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; + mark = p; + while (*p) { + if (p[0] == '-' && p[1] == '-' && p[2] == '>') { + p += 3; + goto parse_text; + } + ++p; + } + return "end of data in comment"; + +parse_cdata: + if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') + return "syntax error in CDATA section"; + p += 7; + mark = p; + while (*p) { + if (p[0] == ']' && p[1] == ']' && p[2] == '>') { + p += 3; + goto parse_text; + } + ++p; + } + return "end of data in CDATA section"; + +parse_processing_instruction: + while (*p) { + if (p[0] == '?' 
&& p[1] == '>') { + p += 2; + goto parse_text; + } + ++p; + } + return "end of data in processing instruction"; + +parse_closing_element: + while (iswhite(*p)) ++p; + mark = p; + while (isname(*p)) ++p; + while (iswhite(*p)) ++p; + if (*p != '>') + return "syntax error in closing element"; + xml_emit_close_tag(x); + ++p; + goto parse_text; + +parse_element_name: + mark = p; + while (isname(*p)) ++p; + xml_emit_open_tag(x, mark, p); + if (*p == '>') { ++p; goto parse_text; } + if (p[0] == '/' && p[1] == '>') { + xml_emit_close_tag(x); + p += 2; + goto parse_text; + } + if (iswhite(*p)) + goto parse_attributes; + return "syntax error after element name"; + +parse_attributes: + while (iswhite(*p)) ++p; + if (isname(*p)) + goto parse_attribute_name; + if (*p == '>') { ++p; goto parse_text; } + if (p[0] == '/' && p[1] == '>') { + xml_emit_close_tag(x); + p += 2; + goto parse_text; + } + return "syntax error in attributes"; + +parse_attribute_name: + mark = p; + while (isname(*p)) ++p; + xml_emit_att_name(x, mark, p); + while (iswhite(*p)) ++p; + if (*p == '=') { ++p; goto parse_attribute_value; } + return "syntax error after attribute name"; + +parse_attribute_value: + while (iswhite(*p)) ++p; + quote = *p++; + if (quote != '"' && quote != '\'') + return "missing quote character"; + mark = p; + while (*p && *p != quote) ++p; + if (*p == quote) { + xml_emit_att_value(x, mark, p++); + goto parse_attributes; + } + return "end of data in attribute value"; +} + +static char *convert_to_utf8(fz_context *doc, unsigned char *s, int n) +{ + unsigned char *e = s + n; + char *dst, *d; + int c; + + if (s[0] == 0xFE && s[1] == 0xFF) { + dst = d = fz_malloc(doc, n * 2); + while (s + 1 < e) { + c = s[0] << 8 | s[1]; + d += fz_runetochar(d, c); + s += 2; + } + *d = 0; + return dst; + } + + if (s[0] == 0xFF && s[1] == 0xFE) { + dst = d = fz_malloc(doc, n * 2); + while (s + 1 < e) { + c = s[0] | s[1] << 8; + d += fz_runetochar(d, c); + s += 2; + } + *d = 0; + return dst; + } + + return 
(char*)s; +} + +fz_xml * +fz_parse_xml(fz_context *ctx, unsigned char *s, int n) +{ + struct parser parser; + fz_xml root; + char *p, *error; + + /* s is already null-terminated (see xps_new_part) */ + + memset(&root, 0, sizeof(root)); + parser.head = &root; + parser.ctx = ctx; + + p = convert_to_utf8(ctx, s, n); + + error = xml_parse_document_imp(&parser, p); + if (error) + fz_throw(ctx, "%s", error); + + if (p != (char*)s) + fz_free(ctx, p); + + return root.down; +} diff --git a/fitz/fitz.h b/fitz/fitz.h index 77e690e7..0ea3cb2c 100644 --- a/fitz/fitz.h +++ b/fitz/fitz.h @@ -655,6 +655,59 @@ extern int fz_getopt(int nargc, char * const *nargv, const char *ostr); extern int fz_optind; extern char *fz_optarg; +/* + XML document model +*/ + +typedef struct fz_xml_s fz_xml; + +/* + fz_parse_xml: Parse a zero-terminated string into a tree of xml nodes. +*/ +fz_xml *fz_parse_xml(fz_context *ctx, unsigned char *buf, int len); + +/* + fz_xml_next: Return next sibling of XML node. +*/ +fz_xml *fz_xml_next(fz_xml *item); + +/* + fz_xml_down: Return first child of XML node. +*/ +fz_xml *fz_xml_down(fz_xml *item); + +/* + fz_xml_tag: Return tag of XML node. Return the empty string for text nodes. +*/ +char *fz_xml_tag(fz_xml *item); + +/* + fz_xml_att: Return the value of an attribute of an XML node. + NULL if the attribute doesn't exist. +*/ +char *fz_xml_att(fz_xml *item, const char *att); + +/* + fz_xml_text: Return the text content of an XML node. + NULL if the node is a tag. +*/ +char *fz_xml_text(fz_xml *item); + +/* + fz_free_xml: Free the XML node and all its children and siblings. +*/ +void fz_free_xml(fz_context *doc, fz_xml *item); + +/* + fz_detach_xml: Detach a node from the tree, unlinking it from its parent. +*/ +void fz_detach_xml(fz_xml *node); + +/* + fz_debug_xml: Pretty-print an XML tree to stdout. +*/ +void fz_debug_xml(fz_xml *item, int level); + /* fz_point is a point in a two-dimensional space. 
*/ diff --git a/win32/libmupdf.vcproj b/win32/libmupdf.vcproj index e563fc80..1195ccee 100644 --- a/win32/libmupdf.vcproj +++ b/win32/libmupdf.vcproj @@ -401,6 +401,10 @@ RelativePath="..\fitz\base_trans.c" > + + @@ -669,10 +673,6 @@ RelativePath="..\xps\xps_util.c" > - - diff --git a/xps/muxps.h b/xps/muxps.h index c784a01a..007f4452 100644 --- a/xps/muxps.h +++ b/xps/muxps.h @@ -6,22 +6,6 @@ typedef struct xps_document_s xps_document; typedef struct xps_page_s xps_page; -/* - * XML document model - */ - -typedef struct fz_xml_s fz_xml; - -fz_xml *fz_parse_xml(fz_context *doc, unsigned char *buf, int len); -fz_xml *fz_xml_next(fz_xml *item); -fz_xml *fz_xml_down(fz_xml *item); -char *fz_xml_tag(fz_xml *item); -char *fz_xml_att(fz_xml *item, const char *att); -char *fz_xml_text(fz_xml *item); -void fz_free_xml(fz_context *doc, fz_xml *item); -void fz_print_xml(fz_xml *item, int level); -void fz_detach_xml(fz_xml *node); - /* xps_open_document: Open a document. diff --git a/xps/xps_xml.c b/xps/xps_xml.c deleted file mode 100644 index 739c00ec..00000000 --- a/xps/xps_xml.c +++ /dev/null @@ -1,439 +0,0 @@ -#include "muxps.h" - -struct parser -{ - fz_xml *head; - fz_context *ctx; -}; - -struct attribute -{ - char name[40]; - char *value; - struct attribute *next; -}; - -struct fz_xml_s -{ - char name[40]; - char *text; - struct attribute *atts; - fz_xml *up, *down, *next; -}; - -static inline void indent(int n) -{ - while (n--) putchar(' '); -} - -void fz_print_xml(fz_xml *item, int level) -{ - while (item) { - if (item->text) { - printf("%s\n", item->text); - } else { - struct attribute *att; - indent(level); - printf("<%s", item->name); - for (att = item->atts; att; att = att->next) - printf(" %s=\"%s\"", att->name, att->value); - if (item->down) { - printf(">\n"); - fz_print_xml(item->down, level + 1); - indent(level); - printf("</%s>\n", item->name); - } - else { - printf("/>\n"); - } - item = item->next; - } - } -} - -fz_xml *fz_xml_next(fz_xml *item) -{ - return 
item->next; -} - -fz_xml *fz_xml_down(fz_xml *item) -{ - return item->down; -} - -char *fz_xml_text(fz_xml *item) -{ - return item->text; -} - -char *fz_xml_tag(fz_xml *item) -{ - return item->name; -} - -char *fz_xml_att(fz_xml *item, const char *name) -{ - struct attribute *att; - for (att = item->atts; att; att = att->next) - if (!strcmp(att->name, name)) - return att->value; - return NULL; -} - -static void xml_free_attribute(fz_context *ctx, struct attribute *att) -{ - while (att) { - struct attribute *next = att->next; - if (att->value) - fz_free(ctx, att->value); - fz_free(ctx, att); - att = next; - } -} - -void fz_free_xml(fz_context *ctx, fz_xml *item) -{ - while (item) { - fz_xml *next = item->next; - if (item->text) - fz_free(ctx, item->text); - if (item->atts) - xml_free_attribute(ctx, item->atts); - if (item->down) - fz_free_xml(ctx, item->down); - fz_free(ctx, item); - item = next; - } -} - -void fz_detach_xml(fz_xml *node) -{ - if (node->up) - node->up->down = NULL; -} - -static int xml_parse_entity(int *c, char *a) -{ - char *b; - if (a[1] == '#') { - if (a[2] == 'x') - *c = strtol(a + 3, &b, 16); - else - *c = strtol(a + 2, &b, 10); - if (*b == ';') - return b - a + 1; - } - else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { - *c = '<'; - return 4; - } - else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { - *c = '>'; - return 4; - } - else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { - *c = '&'; - return 5; - } - else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { - *c = '\''; - return 6; - } - else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { - *c = '"'; - return 6; - } - *c = *a++; - return 1; -} - -static inline int isname(int c) -{ - return c == '.' 
|| c == '-' || c == '_' || c == ':' || - (c >= '0' && c <= '9') || - (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z'); -} - -static inline int iswhite(int c) -{ - return c == ' ' || c == '\r' || c == '\n' || c == '\t'; -} - -static void xml_emit_open_tag(struct parser *parser, char *a, char *b) -{ - fz_xml *head, *tail; - - head = fz_malloc_struct(parser->ctx, fz_xml); - if (b - a > sizeof(head->name) - 1) - b = a + sizeof(head->name) - 1; - memcpy(head->name, a, b - a); - head->name[b - a] = 0; - - head->atts = NULL; - head->text = NULL; - head->up = parser->head; - head->down = NULL; - head->next = NULL; - - if (!parser->head->down) { - parser->head->down = head; - } - else { - tail = parser->head->down; - while (tail->next) - tail = tail->next; - tail->next = head; - } - - parser->head = head; -} - -static void xml_emit_att_name(struct parser *parser, char *a, char *b) -{ - fz_xml *head = parser->head; - struct attribute *att; - - att = fz_malloc_struct(parser->ctx, struct attribute); - if (b - a > sizeof(att->name) - 1) - b = a + sizeof(att->name) - 1; - memcpy(att->name, a, b - a); - att->name[b - a] = 0; - att->value = NULL; - att->next = head->atts; - head->atts = att; -} - -static void xml_emit_att_value(struct parser *parser, char *a, char *b) -{ - fz_xml *head = parser->head; - struct attribute *att = head->atts; - char *s; - int c; - - /* entities are all longer than UTFmax so runetochar is safe */ - s = att->value = fz_malloc(parser->ctx, b - a + 1); - while (a < b) { - if (*a == '&') { - a += xml_parse_entity(&c, a); - s += fz_runetochar(s, c); - } - else { - *s++ = *a++; - } - } - *s = 0; -} - -static void xml_emit_close_tag(struct parser *parser) -{ - if (parser->head->up) - parser->head = parser->head->up; -} - -static void xml_emit_text(struct parser *parser, char *a, char *b) -{ - static char *empty = ""; - fz_xml *head; - char *s; - int c; - - /* Skip all-whitespace text nodes */ - for (s = a; s < b; s++) - if (!iswhite(*s)) - break; - if (s == 
b) - return; - - xml_emit_open_tag(parser, empty, empty); - head = parser->head; - - /* entities are all longer than UTFmax so runetochar is safe */ - s = head->text = fz_malloc(parser->ctx, b - a + 1); - while (a < b) { - if (*a == '&') { - a += xml_parse_entity(&c, a); - s += fz_runetochar(s, c); - } - else { - *s++ = *a++; - } - } - *s = 0; - - xml_emit_close_tag(parser); -} - -static char *xml_parse_document_imp(struct parser *x, char *p) -{ - char *mark; - int quote; - -parse_text: - mark = p; - while (*p && *p != '<') ++p; - xml_emit_text(x, mark, p); - if (*p == '<') { ++p; goto parse_element; } - return NULL; - -parse_element: - if (*p == '/') { ++p; goto parse_closing_element; } - if (*p == '!') { ++p; goto parse_comment; } - if (*p == '?') { ++p; goto parse_processing_instruction; } - while (iswhite(*p)) ++p; - if (isname(*p)) - goto parse_element_name; - return "syntax error in element"; - -parse_comment: - if (*p == '[') goto parse_cdata; - if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; - if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; - mark = p; - while (*p) { - if (p[0] == '-' && p[1] == '-' && p[2] == '>') { - p += 3; - goto parse_text; - } - ++p; - } - return "end of data in comment"; - -parse_cdata: - if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') - return "syntax error in CDATA section"; - p += 7; - mark = p; - while (*p) { - if (p[0] == ']' && p[1] == ']' && p[2] == '>') { - p += 3; - goto parse_text; - } - ++p; - } - return "end of data in CDATA section"; - -parse_processing_instruction: - while (*p) { - if (p[0] == '?' 
&& p[1] == '>') { - p += 2; - goto parse_text; - } - ++p; - } - return "end of data in processing instruction"; - -parse_closing_element: - while (iswhite(*p)) ++p; - mark = p; - while (isname(*p)) ++p; - while (iswhite(*p)) ++p; - if (*p != '>') - return "syntax error in closing element"; - xml_emit_close_tag(x); - ++p; - goto parse_text; - -parse_element_name: - mark = p; - while (isname(*p)) ++p; - xml_emit_open_tag(x, mark, p); - if (*p == '>') { ++p; goto parse_text; } - if (p[0] == '/' && p[1] == '>') { - xml_emit_close_tag(x); - p += 2; - goto parse_text; - } - if (iswhite(*p)) - goto parse_attributes; - return "syntax error after element name"; - -parse_attributes: - while (iswhite(*p)) ++p; - if (isname(*p)) - goto parse_attribute_name; - if (*p == '>') { ++p; goto parse_text; } - if (p[0] == '/' && p[1] == '>') { - xml_emit_close_tag(x); - p += 2; - goto parse_text; - } - return "syntax error in attributes"; - -parse_attribute_name: - mark = p; - while (isname(*p)) ++p; - xml_emit_att_name(x, mark, p); - while (iswhite(*p)) ++p; - if (*p == '=') { ++p; goto parse_attribute_value; } - return "syntax error after attribute name"; - -parse_attribute_value: - while (iswhite(*p)) ++p; - quote = *p++; - if (quote != '"' && quote != '\'') - return "missing quote character"; - mark = p; - while (*p && *p != quote) ++p; - if (*p == quote) { - xml_emit_att_value(x, mark, p++); - goto parse_attributes; - } - return "end of data in attribute value"; -} - -static char *convert_to_utf8(fz_context *doc, unsigned char *s, int n) -{ - unsigned char *e = s + n; - char *dst, *d; - int c; - - if (s[0] == 0xFE && s[1] == 0xFF) { - dst = d = fz_malloc(doc, n * 2); - while (s + 1 < e) { - c = s[0] << 8 | s[1]; - d += fz_runetochar(d, c); - s += 2; - } - *d = 0; - return dst; - } - - if (s[0] == 0xFF && s[1] == 0xFE) { - dst = d = fz_malloc(doc, n * 2); - while (s + 1 < e) { - c = s[0] | s[1] << 8; - d += fz_runetochar(d, c); - s += 2; - } - *d = 0; - return dst; - } - - return 
(char*)s; -} - -fz_xml * -fz_parse_xml(fz_context *ctx, unsigned char *s, int n) -{ - struct parser parser; - fz_xml root; - char *p, *error; - - /* s is already null-terminated (see xps_new_part) */ - - memset(&root, 0, sizeof(root)); - parser.head = &root; - parser.ctx = ctx; - - p = convert_to_utf8(ctx, s, n); - - error = xml_parse_document_imp(&parser, p); - if (error) - fz_throw(ctx, "%s", error); - - if (p != (char*)s) - fz_free(ctx, p); - - return root.down; -} -- cgit v1.2.3