diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2017-11-08 14:16:30 +0100 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2017-11-08 17:57:09 +0100 |
commit | 8b88d6f3fe9f80a11408d3e0773cbe290b1a4dba (patch) | |
tree | 7cbce1511d3101f24c10b572e097659a5f89dcef /source | |
parent | fb51a9744d6d356371b2ad73f2d3da972012be1f (diff) | |
download | mupdf-8b88d6f3fe9f80a11408d3e0773cbe290b1a4dba.tar.xz |
Clean up and speed up text searching.
Diffstat (limited to 'source')
-rw-r--r-- | source/fitz/stext-search.c | 252 | ||||
-rw-r--r-- | source/fitz/util.c | 62 |
2 files changed, 160 insertions, 154 deletions
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c index b2247b33..2f074180 100644 --- a/source/fitz/stext-search.c +++ b/source/fitz/stext-search.c @@ -2,21 +2,7 @@ #include <string.h> #include <limits.h> -#include <stdio.h> - -static inline int fz_tolower(int c) -{ - /* TODO: proper unicode case folding */ - /* TODO: character equivalence (a matches ä, etc) */ - if (c >= 'A' && c <= 'Z') - return c - 'A' + 'a'; - return c; -} - -static inline int iswhite(int c) -{ - return c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == 0xA0 || c == 0x2028 || c == 0x2029; -} +#include <assert.h> int fz_stext_char_count(fz_context *ctx, fz_stext_page *page) { @@ -71,95 +57,7 @@ const fz_stext_char *fz_stext_char_at(fz_context *ctx, fz_stext_page *page, int return &zero; } -static inline int charat(fz_context *ctx, fz_stext_page *page, int idx) -{ - return fz_stext_char_at(ctx, page, idx)->c; -} - -static fz_rect *bboxat(fz_context *ctx, fz_stext_page *page, int idx, fz_rect *bbox) -{ - /* FIXME: Nasty extra copy */ - *bbox = fz_stext_char_at(ctx, page, idx)->bbox; - return bbox; -} - -static int match_stext(fz_context *ctx, fz_stext_page *page, const char *s, int n) -{ - int orig = n; - int c; - while (*s) - { - s += fz_chartorune(&c, (char *)s); - if (iswhite(c) && iswhite(charat(ctx, page, n))) - { - const char *s_next; - - /* Skip over whitespace in the document */ - do - n++; - while (iswhite(charat(ctx, page, n))); - - /* Skip over multiple whitespace in the search string */ - while (s_next = s + fz_chartorune(&c, (char *)s), iswhite(c)) - s = s_next; - } - else - { - if (fz_tolower(c) != fz_tolower(charat(ctx, page, n))) - return 0; - n++; - } - } - return n - orig; -} - -int -fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max) -{ - int pos, len, i, n, hit_count; - - if (strlen(needle) == 0) - return 0; - - hit_count = 0; - len = fz_stext_char_count(ctx, text); - pos = 0; - while (pos < len) - { - n = match_stext(ctx, text, needle, pos); - if (n) - { - fz_rect linebox = fz_empty_rect; - for (i = 0; i < n; i++) - { - fz_rect charbox; - bboxat(ctx, text, pos + i, &charbox); - if (!fz_is_empty_rect(&charbox)) - { - if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5) - { - if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) - hit_bbox[hit_count++] = linebox; - linebox = charbox; - } - else - { - fz_union_rect(&linebox, &charbox); - } - } - } - if (!fz_is_empty_rect(&linebox) && hit_count < hit_max) - hit_bbox[hit_count++] = linebox; - pos += n; - } - else - { - pos += 1; - } - } - - return hit_count; -} +/* Enumerate marked selection */ static float dist2(float a, float b) { @@ -323,17 +221,20 @@ fz_enumerate_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin } } +/* Highlight selection */ + struct highlight { int len, cap; fz_rect *box; + float hfuzz, vfuzz; }; static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) { struct highlight *hits = arg; - float vfuzz = ch->size * 0.1f; - float hfuzz = ch->size * 0.5f; + float vfuzz = ch->size * hits->vfuzz; + float hfuzz = ch->size * hits->hfuzz; fz_rect bbox; if (line->dir.x > line->dir.y) @@ -423,6 +324,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin hits.len = 0; hits.cap = hit_max; hits.box = hit_bbox; + hits.hfuzz = 0.5f; + hits.vfuzz = 0.1f; cb.on_char = on_highlight_char; cb.on_line = on_highlight_line; @@ -433,6 +336,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin return hits.len; } +/* Copy selection */ + static void on_copy_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) { fz_buffer *buffer = arg; @@ -475,3 +380,138 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_drop_buffer(ctx, buffer); return (char*)s; } + +/* String search */ + +static inline int canon(int c) +{ + /* TODO: proper unicode case folding */ + /* TODO: character equivalence (a matches ä, etc) */ + if (c == 0xA0 || c == 0x2028 || c == 0x2029) + return ' '; + if (c == '\r' || c == '\n' || c == '\t') + return ' '; + if (c >= 'A' && c <= 'Z') + return c - 'A' + 'a'; + return c; +} + +static inline int chartocanon(int *c, const char *s) +{ + int n = fz_chartorune(c, s); + *c = canon(*c); + return n; +} + +static const char *match_string(const char *h, const char *n) +{ + int hc, nc; + const char *e = h; + h += chartocanon(&hc, h); + n += chartocanon(&nc, n); + while (hc == nc) + { + e = h; + if (hc == ' ') + do + h += chartocanon(&hc, h); + while (hc == ' '); + else + h += chartocanon(&hc, h); + if (nc == ' ') + do + n += chartocanon(&nc, n); + while (nc == ' '); + else + n += chartocanon(&nc, n); + } + return nc == 0 ? e : NULL; +} + +static const char *find_string(const char *s, const char *needle, const char **endp) +{ + const char *end; + while (*s) + { + end = match_string(s, needle); + if (end) + return *endp = end, s; + ++s; + } + return *endp = NULL, NULL; +} + +int +fz_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle, fz_rect *hit_bbox, int hit_max) +{ + struct highlight hits; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_buffer *buffer; + const char *haystack, *begin, *end; + int c, inside; + + if (strlen(needle) == 0) + return 0; + + hits.len = 0; + hits.cap = hit_max; + hits.box = hit_bbox; + hits.hfuzz = 0.1f; + hits.vfuzz = 0.1f; + + buffer = fz_new_buffer_from_stext_page(ctx, page); + fz_try(ctx) + { + haystack = fz_string_from_buffer(ctx, buffer); + begin = find_string(haystack, needle, &end); + if (!begin) + goto no_more_matches; + + inside = 0; + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + for (ch = line->first_char; ch; ch = ch->next) + { +try_new_match: + if (!inside) + { + if (haystack >= begin) + inside = 1; + } + if (inside) + { + if (haystack < end) + on_highlight_char(ctx, &hits, line, ch); + else + { + inside = 0; + begin = find_string(haystack, needle, &end); + if (!begin) + goto no_more_matches; + else + goto try_new_match; + } + } + haystack += fz_chartorune(&c, haystack); + } + assert(*haystack == '\n'); + ++haystack; + } + assert(*haystack == '\n'); + ++haystack; + } +no_more_matches:; + } + fz_always(ctx) + fz_drop_buffer(ctx, buffer); + fz_catch(ctx) + fz_rethrow(ctx); + + return hits.len; +} diff --git a/source/fitz/util.c b/source/fitz/util.c index f7567edc..02dc92c4 100644 --- a/source/fitz/util.c +++ b/source/fitz/util.c @@ -391,61 +391,27 @@ fz_search_page_number(fz_context *ctx, fz_document *doc, int number, const char } fz_buffer * -fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rect *sel, int crlf) +fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page) { - fz_buffer *buf; - float x0, y0, x1, y1; fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; - int need_newline; - - need_newline = 0; - - if (fz_is_infinite_rect(sel)) - { - x0 = y0 = -FLT_MAX; - x1 = y1 = FLT_MAX; - } - else - { - x0 = sel->x0; - y0 = sel->y0; - x1 = sel->x1; - y1 = sel->y1; - } + fz_buffer *buf; buf = fz_new_buffer(ctx, 256); fz_try(ctx) { for (block = page->first_block; block; block = block->next) { - if (block->type != FZ_STEXT_BLOCK_TEXT) - continue; - - for (line = block->u.t.first_line; line; line = line->next) + if (block->type == FZ_STEXT_BLOCK_TEXT) { - int saw_text = 0; - for (ch = line->first_char; ch; ch = ch->next) + for (line = block->u.t.first_line; line; line = line->next) { - int c = ch->c; - if (c < 32) - c = FZ_REPLACEMENT_CHARACTER; - if (ch->bbox.x1 >= x0 && ch->bbox.x0 <= x1 && ch->bbox.y1 >= y0 && ch->bbox.y0 <= y1) - { - saw_text = 1; - if (need_newline) - { - if (crlf) - fz_append_byte(ctx, buf, '\r'); - fz_append_byte(ctx, buf, '\n'); - need_newline = 0; - } - fz_append_rune(ctx, buf, c); - } + for (ch = line->first_char; ch; ch = ch->next) + fz_append_rune(ctx, buf, ch->c); + fz_append_byte(ctx, buf, '\n'); } - if (saw_text) - need_newline = 1; + fz_append_byte(ctx, buf, '\n'); } } } @@ -459,14 +425,14 @@ fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page, const fz_rec } fz_buffer * -fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_rect *sel, int crlf, const fz_stext_options *options) +fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz_stext_options *options) { fz_stext_page *text; fz_buffer *buf = NULL; text = fz_new_stext_page_from_display_list(ctx, list, options); fz_try(ctx) - buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); + buf = fz_new_buffer_from_stext_page(ctx, text); fz_always(ctx) fz_drop_stext_page(ctx, text); fz_catch(ctx) @@ -475,14 +441,14 @@ fz_new_buffer_from_display_list(fz_context *ctx, fz_display_list *list, const fz } fz_buffer * -fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int crlf, const fz_stext_options *options) +fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options) { fz_stext_page *text; fz_buffer *buf = NULL; text = fz_new_stext_page_from_page(ctx, page, options); fz_try(ctx) - buf = fz_new_buffer_from_stext_page(ctx, text, sel, crlf); + buf = fz_new_buffer_from_stext_page(ctx, text); fz_always(ctx) fz_drop_stext_page(ctx, text); fz_catch(ctx) @@ -491,14 +457,14 @@ fz_new_buffer_from_page(fz_context *ctx, fz_page *page, const fz_rect *sel, int } fz_buffer * -fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_rect *sel, int crlf, const fz_stext_options *options) +fz_new_buffer_from_page_number(fz_context *ctx, fz_document *doc, int number, const fz_stext_options *options) { fz_page *page; fz_buffer *buf = NULL; page = fz_load_page(ctx, doc, number); fz_try(ctx) - buf = fz_new_buffer_from_page(ctx, page, sel, crlf, options); + buf = fz_new_buffer_from_page(ctx, page, options); fz_always(ctx) fz_drop_page(ctx, page); fz_catch(ctx) |