#include "mupdf/html.h"

/* Indices into the four-element margin/border/padding arrays: top, right, bottom, left. */
enum { T, R, B, L };

/* Built-in style rules, written as CSS source text. */
static const char *default_css =
"html,address,blockquote,body,dd,div,dl,dt,h1,h2,h3,h4,h5,h6,ol,p,ul,center,hr,pre{display:block}"
"span{display:inline}"
"li{display:list-item}"
"head{display:none}"
"body{margin:1em}"
"h1{font-size:2em;margin:.67em 0}"
"h2{font-size:1.5em;margin:.75em 0}"
"h3{font-size:1.17em;margin:.83em 0}"
"h4,p,blockquote,ul,ol,dl,dir,menu{margin:1.12em 0}"
"h5{font-size:.83em;margin:1.5em 0}"
"h6{font-size:.67em;margin:1.67em 0}"
"h1,h2,h3,h4,h5,h6,b,strong{font-weight:bold}"
"blockquote{margin-left:40px;margin-right:40px}"
"i,cite,em,var,address{font-style:italic}"
"pre,tt,code,kbd,samp{font-family:monospace}"
"pre{white-space:pre}"
"big{font-size:1.17em}"
"small,sub,sup{font-size:.83em}"
"sub{vertical-align:sub}"
"sup{vertical-align:super}"
"s,strike,del{text-decoration:line-through}"
"hr{border-width:thin;border-color:black;border-style:solid;margin:.5em 0}"
"ol,ul,dir,menu,dd{margin-left:40px}"
"ol{list-style-type:decimal}"
"ol ul,ul ol,ul ul,ol ol{margin-top:0;margin-bottom:0}"
"u,ins{text-decoration:underline}"
"center{text-align:center}"
"svg{display:none}"
"a{color:blue}"
;

/* Return non-zero if c is a space, tab, carriage return or newline. */
static int iswhite(int c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

/*
 * Free a whole list of flow nodes, following the 'next' links.
 * Word nodes own their text (allocated in add_flow_word) and image nodes
 * own an image reference (taken in add_flow_image). Glue nodes point at
 * string literals (see add_flow_space), so their text is not freed.
 */
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
{
	while (flow)
	{
		fz_html_flow *next = flow->next;
		if (flow->type == FLOW_WORD)
			fz_free(ctx, flow->text);
		if (flow->type == FLOW_IMAGE)
			fz_drop_image(ctx, flow->image);
		fz_free(ctx, flow);
		flow = next;
	}
}

/*
 * Allocate a flow node of the given type and append it to the flow list
 * of 'top' through its tail pointer. The node refers to 'style' by
 * pointer; the style is not copied. Returns the new node.
 */
static fz_html_flow *add_flow(fz_context *ctx, fz_html *top, fz_css_style *style, int type)
{
	fz_html_flow *flow = fz_malloc_struct(ctx, fz_html_flow);
	flow->type = type;
	flow->style = style;
	*top->flow_tail = flow;
	top->flow_tail = &flow->next;
	return flow;
}

/* Append a glue (space) node to 'top', unless the flow list is still empty. */
static void add_flow_space(fz_context *ctx, fz_html *top, fz_css_style *style)
{
	fz_html_flow *flow;

	/* delete space at the beginning of the line */
	if (!top->flow_head)
		return;

	flow = add_flow(ctx, top, style, FLOW_GLUE);
	flow->text = " ";
	flow->broken_text = "";
}

/* Append a word node holding a NUL-terminated copy of the bytes in [a, b). */
static void add_flow_word(fz_context *ctx, fz_html *top, fz_css_style *style, const char *a, const char *b)
{
	fz_html_flow *flow = add_flow(ctx, top, style, FLOW_WORD);
	flow->text = fz_malloc(ctx, b - a + 1);
	memcpy(flow->text, a, b - a);
	flow->text[b - a] = 0;
}

/*
 * Append an image node to 'top'. The node takes its own reference to
 * 'img'; the caller keeps (and must eventually drop) the one it holds.
 */
static void add_flow_image(fz_context *ctx, fz_html *top, fz_css_style *style, fz_image *img)
{
	fz_html_flow *flow = add_flow(ctx, top, style, FLOW_IMAGE);
	flow->image = fz_keep_image(ctx, img);
}

/*
 * Split 'text' into words and spaces and append them to the nearest
 * enclosing flow box of 'box' (found by walking 'up'). Each run of
 * whitespace becomes a single glue node. All nodes use box's style.
 */
static void generate_text(fz_context *ctx, fz_html *box, const char *text)
{
	fz_html *flow = box;
	while (flow->type != BOX_FLOW)
		flow = flow->up;

	while (*text)
	{
		if (iswhite(*text))
		{
			++text;
			while (iswhite(*text))
				++text;
			add_flow_space(ctx, flow, &box->style);
		}
		if (*text)
		{
			const char *mark = text++;
			while (*text && !iswhite(*text))
				++text;
			add_flow_word(ctx, flow, &box->style, mark, text);
		}
	}
}

/*
 * Load the image at base_uri "/" src from the archive and append it to
 * the nearest enclosing flow box of 'box'. If anything fails, a warning
 * is issued and the word "[image]" is appended instead.
 */
static void generate_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_html *box, const char *src)
{
	fz_image *img = NULL;
	fz_buffer *buf = NULL;
	char path[2048];

	fz_html *flow = box;
	while (flow->type != BOX_FLOW)
		flow = flow->up;

	fz_strlcpy(path, base_uri, sizeof path);
	fz_strlcat(path, "/", sizeof path);
	fz_strlcat(path, src, sizeof path);
	fz_cleanname(path);

	/* Both are assigned inside fz_try and read in fz_always. */
	fz_var(buf);
	fz_var(img);

	fz_try(ctx)
	{
		buf = fz_read_archive_entry(ctx, zip, path);
		img = fz_new_image_from_buffer(ctx, buf);
		add_flow_image(ctx, flow, &box->style, img);
	}
	fz_always(ctx)
	{
		/*
		 * The flow node holds its own image reference (add_flow_image),
		 * so our references are released on success and failure alike.
		 */
		fz_drop_image(ctx, img);
		fz_drop_buffer(ctx, buf);
	}
	fz_catch(ctx)
	{
		const char *alt = "[image]";
		fz_warn(ctx, "html: cannot add image src='%s'", src);
		add_flow_word(ctx, flow, &box->style, alt, alt + 7);
	}
}

/* Reset 'box' to an unlinked, zero-sized block box with the default style and an empty flow list. */
static void init_box(fz_context *ctx, fz_html *box)
{
	box->type = BOX_BLOCK;
	box->x = box->y = 0;
	box->w = box->h = 0;

	box->up = NULL;
	box->last = NULL;
	box->down = NULL;
	box->next = NULL;

	box->flow_head = NULL;
	box->flow_tail = &box->flow_head;

	fz_default_css_style(ctx, &box->style);
}

/*
 * Free 'box', all of its following siblings, and (recursively) all of
 * their children and flow lists. Accepts NULL.
 */
void fz_drop_html(fz_context *ctx, fz_html *box)
{
	while (box)
	{
		fz_html *next = box->next;
		fz_drop_html_flow(ctx, box->flow_head);
		fz_drop_html(ctx, box->down);
		fz_free(ctx, box);
		box = next;
	}
}

/* Allocate a box and initialize it with init_box. */
static fz_html *new_box(fz_context *ctx)
{
	fz_html *box = fz_malloc_struct(ctx, fz_html);
	init_box(ctx, box);
	return box;
}

/* Set the type and parent of 'box' and, if 'top' is non-NULL, append it as the last child of 'top'. */
static void insert_box(fz_context *ctx, fz_html *box, int type, fz_html *top)
{
	box->type = type;

	box->up = top;

	if (top)
	{
		if (!top->last)
		{
			top->down = top->last = box;
		}
		else
		{
			top->last->next = box;
			top->last = box;
		}
	}
}

/*
 * Insert 'box' as a block box. If 'top' is a flow or inline box, the box
 * is attached to the nearest enclosing block box instead. Returns the
 * box that actually became the parent.
 */
static fz_html *insert_block_box(fz_context *ctx, fz_html *box, fz_html *top)
{
	if (top->type == BOX_BLOCK)
	{
		insert_box(ctx, box, BOX_BLOCK, top);
	}
	else if (top->type == BOX_FLOW)
	{
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BLOCK, top);
	}
	else if (top->type == BOX_INLINE)
	{
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BLOCK, top);
	}
	return top;
}

/*
 * Insert 'box' as a break box. The parent is chosen exactly as in
 * insert_block_box, and is returned.
 */
static fz_html *insert_break_box(fz_context *ctx, fz_html *box, fz_html *top)
{
	if (top->type == BOX_BLOCK)
	{
		insert_box(ctx, box, BOX_BREAK, top);
	}
	else if (top->type == BOX_FLOW)
	{
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BREAK, top);
	}
	else if (top->type == BOX_INLINE)
	{
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BREAK, top);
	}
	return top;
}

/*
 * Insert 'box' as an inline box. Under a block box it goes into that
 * block's last child if that child is a flow box; otherwise a new flow
 * box is created to hold it. Under a flow or inline box it becomes a
 * direct child.
 */
static void insert_inline_box(fz_context *ctx, fz_html *box, fz_html *top)
{
	if (top->type == BOX_BLOCK)
	{
		if (top->last && top->last->type == BOX_FLOW)
		{
			insert_box(ctx, box, BOX_INLINE, top->last);
		}
		else
		{
			fz_html *flow = new_box(ctx);
			/* Set only when the flow box is the block's first child. */
			flow->is_first_flow = !top->last;
			insert_box(ctx, flow, BOX_FLOW, top);
			insert_box(ctx, box, BOX_INLINE, flow);
		}
	}
	else if (top->type == BOX_FLOW)
	{
		insert_box(ctx, box, BOX_INLINE, top);
	}
	else if (top->type == BOX_INLINE)
	{
		insert_box(ctx, box, BOX_INLINE, top);
	}
}

/*
 * Build boxes for 'node' and its following siblings, recursing into
 * children. Tagged nodes are matched against 'rule': "br" makes a break
 * box, "img" with a "src" attribute makes an inline box holding an
 * image, and any other tag whose display is not DIS_NONE makes a block
 * or inline box. Nodes without a tag contribute their text to the flow.
 * 'top' is the current parent; inserting a block or break box may move
 * it to an enclosing block box for the siblings that follow.
 */
static void generate_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_xml *node, fz_html *top, fz_css_rule *rule, fz_css_match *up_match)
{
	fz_css_match match;
	fz_html *box;
	const char *tag;
	int display;

	while (node)
	{
		match.up = up_match;
		match.count = 0;

		tag = fz_xml_tag(node);
		if (tag)
		{
			fz_match_css(ctx, &match, rule, node);

			display = fz_get_css_match_display(&match);

			if (!strcmp(tag, "br"))
			{
				box = new_box(ctx);
				fz_apply_css_style(ctx, set, &box->style, &match);
				top = insert_break_box(ctx, box, top);
			}

			else if (!strcmp(tag, "img"))
			{
				const char *src = fz_xml_att(node, "src");
				if (src)
				{
					box = new_box(ctx);
					fz_apply_css_style(ctx, set, &box->style, &match);
					insert_inline_box(ctx, box, top);
					generate_image(ctx, zip, base_uri, box, src);
				}
			}

			else if (display != DIS_NONE)
			{
				box = new_box(ctx);
				fz_apply_css_style(ctx, set, &box->style, &match);

				if (display == DIS_BLOCK)
				{
					top = insert_block_box(ctx, box, top);
				}
				else if (display == DIS_LIST_ITEM)
				{
					top = insert_block_box(ctx, box, top);
				}
				else if (display == DIS_INLINE)
				{
					insert_inline_box(ctx, box, top);
				}
				else
				{
					fz_warn(ctx, "unknown box display type");
					insert_box(ctx, box, BOX_BLOCK, top);
				}

				if (fz_xml_down(node))
					generate_boxes(ctx, set, zip, base_uri, fz_xml_down(node), box, rule, &match);

				// TODO: remove empty flow boxes
			}
		}
		else
		{
			if (top->type != BOX_INLINE)
			{
				/* Text outside any inline box gets its own inline box carrying a copy of the parent's style. */
				box = new_box(ctx);
				insert_inline_box(ctx, box, top);
				box->style = top->style;
				generate_text(ctx, box, fz_xml_text(node));
			}
			else
			{
				generate_text(ctx, top, fz_xml_text(node));
			}
		}

		node = fz_xml_next(node);
	}
}

/*
 * Set the size of an image node: the image's own size, scaled down
 * uniformly (never up) so that it fits within w by h.
 */
static void measure_image(fz_context *ctx, fz_html_flow *node, float w, float h)
{
	float xs = 1, ys = 1, s = 1;
	node->x = 0;
	node->y = 0;
	if (node->image->w > w)
		xs = w / node->image->w;
	if (node->image->h > h)
		ys = h / node->image->h;
	s = fz_min(xs, ys);
	node->w = node->image->w * s;
	node->h = node->image->h * s;
}

/*
 * Measure a text node (layout_flow calls this for every node that is not
 * an image, i.e. words and glue): resolve the node's font size against
 * 'em', set its height from the style's line height, and set its width
 * to the sum of the glyph advances of its text.
 */
static void measure_word(fz_context *ctx, fz_html_flow *node, float em)
{
	const char *s;
	int c, g;
	float w;

	em = fz_from_css_number(node->style->font_size, em, em);
	node->x = 0;
	node->y = 0;
	node->h = fz_from_css_number_scale(node->style->line_height, em, em, em);

	w = 0;
	s = node->text;
	while (*s)
	{
		s += fz_chartorune(&c, s);
		g = fz_encode_character(ctx, node->style->font, c);
		w += fz_advance_glyph(ctx, node->style->font, g) * em;
	}
	node->w = w;
	node->em = em;
}

/*
 * Compute the height of the line made of nodes [node, end) and store the
 * baseline offset from the top of the line in *baseline. Text counts as
 * 0.8 em above and 0.2 em below the baseline; an image sits entirely
 * above it. Any extra line height is split evenly above and below.
 */
static float measure_line(fz_html_flow *node, fz_html_flow *end, float *baseline)
{
	float max_a = 0, max_d = 0, h = 0;
	while (node != end)
	{
		if (node->type == FLOW_IMAGE)
		{
			if (node->h > max_a)
				max_a = node->h;
		}
		else
		{
			float a = node->em * 0.8;
			float d = node->em * 0.2;
			if (a > max_a) max_a = a;
			if (d > max_d) max_d = d;
		}
		if (node->h > h) h = node->h;
		if (max_a + max_d > h) h = max_a + max_d;
		node = node->next;
	}
	*baseline = max_a + (h - max_a - max_d) / 2;
	return h;
}

/*
 * Assign x/y positions to the nodes [node, end) of one line, which
 * starts at the current bottom of 'box' (box->y + box->h).
 *   indent   - extra x offset at the start of the line
 *   page_w   - available line width
 *   line_w   - width actually used by the line's content
 *   align    - TA_JUSTIFY, TA_RIGHT or TA_CENTER; anything else leaves
 *              the line starting at the indent
 *   baseline - baseline offset from the top of the line
 */
static void layout_line(fz_context *ctx, float indent, float page_w, float line_w, int align, fz_html_flow *node, fz_html_flow *end, fz_html *box, float baseline)
{
	float x = box->x + indent;
	float y = box->y + box->h;
	float slop = page_w - line_w;
	float justify = 0;
	float va;
	int n = 0;

	if (align == TA_JUSTIFY)
	{
		fz_html_flow *it;
		for (it = node; it != end; it = it->next)
			if (it->type == FLOW_GLUE)
				++n;
		/* A line with no glue has nothing to stretch; also avoids dividing by zero. */
		if (n > 0)
			justify = slop / n;
	}
	else if (align == TA_RIGHT)
		x += slop;
	else if (align == TA_CENTER)
		x += slop / 2;

	while (node != end)
	{
		switch (node->style->vertical_align)
		{
		default:
		case VA_BASELINE:
			va = 0;
			break;
		case VA_SUB:
			va = node->em * 0.2f;
			break;
		case VA_SUPER:
			va = node->em * -0.3f;
			break;
		}
		node->x = x;
		/* Text is positioned by its baseline; an image has its bottom edge on the baseline. */
		if (node->type == FLOW_IMAGE)
			node->y = y + baseline - node->h;
		else
			node->y = y + baseline + va;
		x += node->w;
		if (node->type == FLOW_GLUE)
			x += justify;
		node = node->next;
	}
}

/*
 * Skip any leading glue and then all following non-glue nodes, adding
 * every skipped node's width to *w. Returns the glue node that stopped
 * the scan, or NULL at the end of the list.
 */
static fz_html_flow *find_next_glue(fz_html_flow *node, float *w)
{
	while (node && node->type == FLOW_GLUE)
	{
		*w += node->w;
		node = node->next;
	}
	while (node && node->type != FLOW_GLUE)
	{
		*w += node->w;
		node = node->next;
	}
	return node;
}

/*
 * Skip leading glue nodes, adding their widths to *w. Returns the first
 * non-glue node, or NULL at the end of the list.
 */
static fz_html_flow *find_next_word(fz_html_flow *node, float *w)
{
	while (node && node->type == FLOW_GLUE)
	{
		*w += node->w;
		node = node->next;
	}
	return node;
}

/*
 * Lay out the flow box 'box' inside its parent 'top': measure every flow
 * node, break the list into lines no wider than top->w, and position
 * each line. The box is placed at the current bottom of 'top' and its
 * height grows by one line height per line. A line that is taller than
 * the space left before the next multiple of page_h is first pushed down
 * by that remaining space.
 */
static void layout_flow(fz_context *ctx, fz_html *box, fz_html *top, float em, float page_h)
{
	fz_html_flow *node, *line_start, *word_start, *word_end, *line_end;
	float glue_w;
	float word_w;
	float line_w;
	float indent;
	float avail, line_h;
	float baseline;
	int align;

	em = fz_from_css_number(box->style.font_size, em, em);
	indent = box->is_first_flow ? fz_from_css_number(top->style.text_indent, em, top->w) : 0;
	align = top->style.text_align;

	box->x = top->x;
	box->y = top->y + top->h;
	box->w = top->w;
	box->h = 0;

	if (!box->flow_head)
		return;

	for (node = box->flow_head; node; node = node->next)
		if (node->type == FLOW_IMAGE)
			measure_image(ctx, node, top->w, page_h);
		else
			measure_word(ctx, node, em);

	/* find_next_word accumulates into glue_w, so it must start from a defined value. */
	glue_w = 0;
	line_start = find_next_word(box->flow_head, &glue_w);
	line_end = NULL;

	line_w = indent;
	word_w = 0;
	word_start = line_start;
	while (word_start)
	{
		word_end = find_next_glue(word_start, &word_w);
		/*
		 * The first word of a line is always accepted, even if it is too
		 * wide. Otherwise an empty line would be flushed with
		 * line_end == NULL, which puts the whole rest of the flow on one
		 * line and stops wrapping.
		 */
		if (line_w + word_w <= top->w || word_start == line_start)
		{
			line_w += word_w;
			glue_w = 0;
			line_end = word_end;
			word_start = find_next_word(word_end, &glue_w);
			/* The glue before the next word counts towards that word's width. */
			word_w = glue_w;
		}
		else
		{
			avail = page_h - fmodf(box->y + box->h, page_h);
			line_h = measure_line(line_start, line_end, &baseline);
			if (line_h > avail)
				box->h += avail;
			layout_line(ctx, indent, top->w, line_w, align, line_start, line_end, box, baseline);
			box->h += line_h;
			word_start = find_next_word(line_end, &glue_w);
			line_start = word_start;
			line_end = NULL;
			indent = 0;
			line_w = 0;
			word_w = 0;
		}
	}

	/* don't justify the last line of a paragraph */
	if (align == TA_JUSTIFY)
		align = TA_LEFT;

	if (line_start)
	{
		avail = page_h - fmodf(box->y + box->h, page_h);
		line_h = measure_line(line_start, line_end, &baseline);
		if (line_h > avail)
			box->h += avail;
		layout_line(ctx, indent, top->w, line_w, align, line_start, line_end, box, baseline);
		box->h += line_h;
	}
}

/*
 * Lay out the block box 'box' inside its parent 'top', then lay out its
 * children in order. Margins, padding and (if the box has a border
 * style) border widths are resolved from the style against 'em' and
 * top->w. The box's x/y/w describe its content area and its height is
 * accumulated from its children.
 *   top_collapse_margin - margin already accounted for above this box;
 *                         it is subtracted from the box's own top margin
 *                         (clamped at zero).
 */
static void layout_block(fz_context *ctx, fz_html *box, fz_html *top, float em, float top_collapse_margin, float page_h)
{
	fz_html *child;
	float box_collapse_margin;
	int prev_br;

	float *margin = box->margin;
	float *border = box->border;
	float *padding = box->padding;

	em = fz_from_css_number(box->style.font_size, em, em);

	margin[0] = fz_from_css_number(box->style.margin[0], em, top->w);
	margin[1] = fz_from_css_number(box->style.margin[1], em, top->w);
	margin[2] = fz_from_css_number(box->style.margin[2], em, top->w);
	margin[3] = fz_from_css_number(box->style.margin[3], em, top->w);

	padding[0] = fz_from_css_number(box->style.padding[0], em, top->w);
	padding[1] = fz_from_css_number(box->style.padding[1], em, top->w);
	padding[2] = fz_from_css_number(box->style.padding[2], em, top->w);
	padding[3] = fz_from_css_number(box->style.padding[3], em, top->w);

	if (box->style.border_style)
	{
		border[0] = fz_from_css_number(box->style.border_width[0], em, top->w);
		border[1] = fz_from_css_number(box->style.border_width[1], em, top->w);
		border[2] = fz_from_css_number(box->style.border_width[2], em, top->w);
		border[3] = fz_from_css_number(box->style.border_width[3], em, top->w);
	}
	else
		border[0] = border[1] = border[2] = border[3] = 0;

	/* With no top padding or border, the box's top margin is handed to its first child to collapse against. */
	if (padding[T] == 0 && border[T] == 0)
		box_collapse_margin = margin[T];
	else
		box_collapse_margin = 0;

	if (margin[T] > top_collapse_margin)
		margin[T] -= top_collapse_margin;
	else
		margin[T] = 0;

	box->x = top->x + margin[L] + border[L] + padding[L];
	box->y = top->y + top->h + margin[T] + border[T] + padding[T];
	box->w = top->w - (margin[L] + margin[R] + border[L] + border[R] + padding[L] + padding[R]);
	box->h = 0;

	prev_br = 0;
	for (child = box->down; child; child = child->next)
	{
		if (child->type == BOX_BLOCK)
		{
			layout_block(ctx, child, box, em, box_collapse_margin, page_h);
			box->h += child->h +
				child->padding[T] + child->padding[B] +
				child->border[T] + child->border[B] +
				child->margin[T] + child->margin[B];
			box_collapse_margin = child->margin[B];
			prev_br = 0;
		}
		else if (child->type == BOX_BREAK)
		{
			/* TODO: interaction with page breaks */
			/* Only a break that directly follows another break adds a line of height. */
			if (prev_br)
				box->h += fz_from_css_number_scale(box->style.line_height, em, em, em);
			prev_br = 1;
		}
		else if (child->type == BOX_FLOW)
		{
			layout_flow(ctx, child, box, em, page_h);
			if (child->h > 0)
			{
				box->h += child->h;
				box_collapse_margin = 0;
				prev_br = 0;
			}
		}
	}

	/* With no bottom padding or border, the last child's bottom margin is folded into this box's bottom margin. */
	if (padding[B] == 0 && border[B] == 0)
	{
		if (margin[B] > 0)
		{
			box->h -= box_collapse_margin;
			if (margin[B] < box_collapse_margin)
				margin[B] = box_collapse_margin;
		}
	}
}

/*
 * Draw the words and images of a flow box that fall between page_top and
 * page_bot. Words are filled in the style's color; glue nodes draw
 * nothing.
 */
static void draw_flow_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm)
{
	fz_html_flow *node;
	fz_text *text;
	fz_matrix trm;
	const char *s;
	float color[3];
	float x, y;
	int c, g;

	for (node = box->flow_head; node; node = node->next)
	{
		/* Images are tested by their vertical extent, text by its y position alone. */
		if (node->type == FLOW_IMAGE)
		{
			if (node->y > page_bot || node->y + node->h < page_top)
				continue;
		}
		else
		{
			if (node->y > page_bot || node->y < page_top)
				continue;
		}

		if (node->type == FLOW_WORD)
		{
			/* Negative y scale: layout coordinates grow downwards. */
			fz_scale(&trm, node->em, -node->em);
			text = fz_new_text(ctx, node->style->font, &trm, 0);

			x = node->x;
			y = node->y;
			s = node->text;
			while (*s)
			{
				s += fz_chartorune(&c, s);
				g = fz_encode_character(ctx, node->style->font, c);
				fz_add_text(ctx, text, g, c, x, y);
				x += fz_advance_glyph(ctx, node->style->font, g) * node->em;
			}

			color[0] = node->style->color.r / 255.0f;
			color[1] = node->style->color.g / 255.0f;
			color[2] = node->style->color.b / 255.0f;

			fz_fill_text(ctx, dev, text, ctm, fz_device_rgb(ctx), color, 1);

			fz_drop_text(ctx, text);
		}
		else if (node->type == FLOW_IMAGE)
		{
			fz_matrix local_ctm = *ctm;
			fz_pre_translate(&local_ctm, node->x, node->y);
			fz_pre_scale(&local_ctm, node->w, node->h);
			fz_fill_image(ctx, dev, node->image, &local_ctm, 1);
		}
	}
}

/*
 * Fill the axis-aligned rectangle (x0,y0)-(x1,y1). rgba[0..2] is the RGB
 * color and rgba[3] the alpha.
 */
static void draw_rect(fz_context *ctx, fz_device *dev, const fz_matrix *ctm, float *rgba, float x0, float y0, float x1, float y1)
{
	fz_path *path = fz_new_path(ctx);

	fz_moveto(ctx, path, x0, y0);
	fz_lineto(ctx, path, x1, y0);
	fz_lineto(ctx, path, x1, y1);
	fz_lineto(ctx, path, x0, y1);
	fz_closepath(ctx, path);

	fz_fill_path(ctx, dev, path, 0, ctm, fz_device_rgb(ctx), rgba, rgba[3]);

	fz_drop_path(ctx, path);
}

/*
 * Draw a block box and, recursively, its block and flow children. The
 * background covers the padding area and each border side is a filled
 * rectangle outside it. Nothing (children included) is drawn if the
 * padding area lies entirely outside page_top..page_bot.
 */
static void draw_block_box(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *ctm)
{
	float x0, y0, x1, y1;
	float color[4];

	// TODO: background fill
	// TODO: border stroke

	float *border = box->border;
	float *padding = box->padding;

	x0 = box->x - padding[L];
	y0 = box->y - padding[T];
	x1 = box->x + box->w + padding[R];
	y1 = box->y + box->h + padding[B];

	if (y0 > page_bot || y1 < page_top)
		return;

	if (box->style.background_color.a > 0)
	{
		color[0] = box->style.background_color.r / 255.0f;
		color[1] = box->style.background_color.g / 255.0f;
		color[2] = box->style.background_color.b / 255.0f;
		color[3] = box->style.background_color.a / 255.0f;
		draw_rect(ctx, dev, ctm, color, x0, y0, x1, y1);
	}

	if (box->style.border_color.a > 0)
	{
		color[0] = box->style.border_color.r / 255.0f;
		color[1] = box->style.border_color.g / 255.0f;
		color[2] = box->style.border_color.b / 255.0f;
		color[3] = box->style.border_color.a / 255.0f;
		if (border[T] > 0)
			draw_rect(ctx, dev, ctm, color, x0 - border[L], y0 - border[T], x1 + border[R], y0);
		if (border[B] > 0)
			draw_rect(ctx, dev, ctm, color, x0 - border[L], y1, x1 + border[R], y1 + border[B]);
		if (border[L] > 0)
			draw_rect(ctx, dev, ctm, color, x0 - border[L], y0 - border[T], x0, y1 + border[B]);
		if (border[R] > 0)
			draw_rect(ctx, dev, ctm, color, x1, y0 - border[T], x1 + border[R], y1 + border[B]);
	}

	for (box = box->down; box; box = box->next)
	{
		switch (box->type)
		{
		case BOX_BLOCK: draw_block_box(ctx, box, page_top, page_bot, dev, ctm); break;
		case BOX_FLOW: draw_flow_box(ctx, box, page_top, page_bot, dev, ctm); break;
		default: break; /* other box types draw nothing themselves */
		}
	}
}

/*
 * Draw the part of the laid-out tree rooted at 'box' that lies between
 * page_top and page_bot. The transform is pre-translated by -page_top so
 * that page_top ends up at y = 0 of 'inctm'.
 */
void
fz_draw_html(fz_context *ctx, fz_html *box, float page_top, float page_bot, fz_device *dev, const fz_matrix *inctm)
{
	fz_matrix ctm = *inctm;
	fz_pre_translate(&ctm, 0, -page_top);
	draw_block_box(ctx, box, page_top, page_bot, dev, &ctm);
}

/*
 * Concatenate the text of all direct children of 'root' into a newly
 * allocated NUL-terminated string. Children without text are skipped.
 * The result comes from fz_malloc and belongs to the caller.
 */
static char *concat_text(fz_context *ctx, fz_xml *root)
{
	fz_xml *node;
	int i = 0, n = 1;
	char *s;
	/* First pass: total length; n starts at 1 for the terminator. */
	for (node = fz_xml_down(root); node; node = fz_xml_next(node))
	{
		const char *text = fz_xml_text(node);
		n += text ? strlen(text) : 0;
	}
	s = fz_malloc(ctx, n);
	/* Second pass: copy each piece; n is reused as the piece length. */
	for (node = fz_xml_down(root); node; node = fz_xml_next(node))
	{
		const char *text = fz_xml_text(node);
		if (text)
		{
			n = strlen(text);
			memcpy(s+i, text, n);
			i += n;
		}
	}
	s[i] = 0;
	return s;
}

static fz_css_rule *
html_load_css(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_css_rule *css, fz_xml *root)
{
	fz_xml *node;
	fz_buffer *buf;
	char path[2048];

	for (node = root; node; node = fz_xml_next(node))
	{
		const char *tag = fz_xml_tag(node);
		if (tag && !strcmp(tag, "link"))
		{
			char *rel = fz_xml_att(node, "rel");
			if (rel && !fz_strcasecmp(rel, "stylesheet"))
			{
				char *type = fz_xml_att(node, "type");
				if ((type && !strcmp(type, "text/css")) || !type)
				{
					char *href = fz_xml_att(node, "href");
					if (href)
					{
						fz_strlcpy(path, base_uri, sizeof path);
						fz_strlcat(path, "/", sizeof path);
						fz_strlcat(path, href, sizeof path);
						fz_cleanname(path);

						buf = fz_read_archive_entry(ctx, zip, path);
						fz_write_buffer_byte(ctx, buf, 0);
						css = fz_parse_css(ctx, css, (char*)buf->data, path);
						fz_drop_buffer(ctx, buf);
					}
				}
			}
		}
		if (tag && !strcmp(tag, "style"))
		{
			char *s = concat_text(ctx, node);
			css = fz_parse_css(ctx, css, s, "