summaryrefslogtreecommitdiff
path: root/source/fitz/stext-search.c
diff options
context:
space:
mode:
authorTor Andersson <tor.andersson@artifex.com>2017-11-08 14:16:30 +0100
committerTor Andersson <tor.andersson@artifex.com>2017-11-08 17:57:09 +0100
commit8b88d6f3fe9f80a11408d3e0773cbe290b1a4dba (patch)
tree7cbce1511d3101f24c10b572e097659a5f89dcef /source/fitz/stext-search.c
parentfb51a9744d6d356371b2ad73f2d3da972012be1f (diff)
downloadmupdf-8b88d6f3fe9f80a11408d3e0773cbe290b1a4dba.tar.xz
Clean up and speed up text searching.
Diffstat (limited to 'source/fitz/stext-search.c')
-rw-r--r--source/fitz/stext-search.c252
1 files changed, 146 insertions, 106 deletions
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c
index b2247b33..2f074180 100644
--- a/source/fitz/stext-search.c
+++ b/source/fitz/stext-search.c
@@ -2,21 +2,7 @@
#include <string.h>
#include <limits.h>
-#include <stdio.h>
-
-static inline int fz_tolower(int c)
-{
- /* TODO: proper unicode case folding */
- /* TODO: character equivalence (a matches ä, etc) */
- if (c >= 'A' && c <= 'Z')
- return c - 'A' + 'a';
- return c;
-}
-
-static inline int iswhite(int c)
-{
- return c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == 0xA0 || c == 0x2028 || c == 0x2029;
-}
+#include <assert.h>
int fz_stext_char_count(fz_context *ctx, fz_stext_page *page)
{
@@ -71,95 +57,7 @@ const fz_stext_char *fz_stext_char_at(fz_context *ctx, fz_stext_page *page, int
return &zero;
}
-static inline int charat(fz_context *ctx, fz_stext_page *page, int idx)
-{
- return fz_stext_char_at(ctx, page, idx)->c;
-}
-
-static fz_rect *bboxat(fz_context *ctx, fz_stext_page *page, int idx, fz_rect *bbox)
-{
- /* FIXME: Nasty extra copy */
- *bbox = fz_stext_char_at(ctx, page, idx)->bbox;
- return bbox;
-}
-
-static int match_stext(fz_context *ctx, fz_stext_page *page, const char *s, int n)
-{
- int orig = n;
- int c;
- while (*s)
- {
- s += fz_chartorune(&c, (char *)s);
- if (iswhite(c) && iswhite(charat(ctx, page, n)))
- {
- const char *s_next;
-
- /* Skip over whitespace in the document */
- do
- n++;
- while (iswhite(charat(ctx, page, n)));
-
- /* Skip over multiple whitespace in the search string */
- while (s_next = s + fz_chartorune(&c, (char *)s), iswhite(c))
- s = s_next;
- }
- else
- {
- if (fz_tolower(c) != fz_tolower(charat(ctx, page, n)))
- return 0;
- n++;
- }
- }
- return n - orig;
-}
-
-int
-fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max)
-{
- int pos, len, i, n, hit_count;
-
- if (strlen(needle) == 0)
- return 0;
-
- hit_count = 0;
- len = fz_stext_char_count(ctx, text);
- pos = 0;
- while (pos < len)
- {
- n = match_stext(ctx, text, needle, pos);
- if (n)
- {
- fz_rect linebox = fz_empty_rect;
- for (i = 0; i < n; i++)
- {
- fz_rect charbox;
- bboxat(ctx, text, pos + i, &charbox);
- if (!fz_is_empty_rect(&charbox))
- {
- if (charbox.y0 != linebox.y0 || fz_abs(charbox.x0 - linebox.x1) > 5)
- {
- if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
- hit_bbox[hit_count++] = linebox;
- linebox = charbox;
- }
- else
- {
- fz_union_rect(&linebox, &charbox);
- }
- }
- }
- if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
- hit_bbox[hit_count++] = linebox;
- pos += n;
- }
- else
- {
- pos += 1;
- }
- }
-
- return hit_count;
-}
+/* Enumerate marked selection */
static float dist2(float a, float b)
{
@@ -323,17 +221,20 @@ fz_enumerate_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin
}
}
+/* Highlight selection */
+
struct highlight
{
int len, cap;
fz_rect *box;
+ float hfuzz, vfuzz;
};
static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
{
struct highlight *hits = arg;
- float vfuzz = ch->size * 0.1f;
- float hfuzz = ch->size * 0.5f;
+ float vfuzz = ch->size * hits->vfuzz;
+ float hfuzz = ch->size * hits->hfuzz;
fz_rect bbox;
if (line->dir.x > line->dir.y)
@@ -423,6 +324,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin
hits.len = 0;
hits.cap = hit_max;
hits.box = hit_bbox;
+ hits.hfuzz = 0.5f;
+ hits.vfuzz = 0.1f;
cb.on_char = on_highlight_char;
cb.on_line = on_highlight_line;
@@ -433,6 +336,8 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_poin
return hits.len;
}
+/* Copy selection */
+
static void on_copy_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
{
fz_buffer *buffer = arg;
@@ -475,3 +380,138 @@ fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b,
fz_drop_buffer(ctx, buffer);
return (char*)s;
}
+
+/* String search */
+
+static inline int canon(int c)
+{
+ /* TODO: proper unicode case folding */
+ /* TODO: character equivalence (a matches ä, etc) */
+ if (c == 0xA0 || c == 0x2028 || c == 0x2029)
+ return ' ';
+ if (c == '\r' || c == '\n' || c == '\t')
+ return ' ';
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A' + 'a';
+ return c;
+}
+
+static inline int chartocanon(int *c, const char *s)
+{
+ int n = fz_chartorune(c, s);
+ *c = canon(*c);
+ return n;
+}
+
+static const char *match_string(const char *h, const char *n)
+{
+ int hc, nc;
+ const char *e = h;
+ h += chartocanon(&hc, h);
+ n += chartocanon(&nc, n);
+ while (hc == nc)
+ {
+ e = h;
+ if (hc == ' ')
+ do
+ h += chartocanon(&hc, h);
+ while (hc == ' ');
+ else
+ h += chartocanon(&hc, h);
+ if (nc == ' ')
+ do
+ n += chartocanon(&nc, n);
+ while (nc == ' ');
+ else
+ n += chartocanon(&nc, n);
+ }
+ return nc == 0 ? e : NULL;
+}
+
+static const char *find_string(const char *s, const char *needle, const char **endp)
+{
+ const char *end;
+ while (*s)
+ {
+ end = match_string(s, needle);
+ if (end)
+ return *endp = end, s;
+ ++s;
+ }
+ return *endp = NULL, NULL;
+}
+
+int
+fz_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle, fz_rect *hit_bbox, int hit_max)
+{
+ struct highlight hits;
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_char *ch;
+ fz_buffer *buffer;
+ const char *haystack, *begin, *end;
+ int c, inside;
+
+ if (strlen(needle) == 0)
+ return 0;
+
+ hits.len = 0;
+ hits.cap = hit_max;
+ hits.box = hit_bbox;
+ hits.hfuzz = 0.1f;
+ hits.vfuzz = 0.1f;
+
+ buffer = fz_new_buffer_from_stext_page(ctx, page);
+ fz_try(ctx)
+ {
+ haystack = fz_string_from_buffer(ctx, buffer);
+ begin = find_string(haystack, needle, &end);
+ if (!begin)
+ goto no_more_matches;
+
+ inside = 0;
+ for (block = page->first_block; block; block = block->next)
+ {
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
+ continue;
+ for (line = block->u.t.first_line; line; line = line->next)
+ {
+ for (ch = line->first_char; ch; ch = ch->next)
+ {
+try_new_match:
+ if (!inside)
+ {
+ if (haystack >= begin)
+ inside = 1;
+ }
+ if (inside)
+ {
+ if (haystack < end)
+ on_highlight_char(ctx, &hits, line, ch);
+ else
+ {
+ inside = 0;
+ begin = find_string(haystack, needle, &end);
+ if (!begin)
+ goto no_more_matches;
+ else
+ goto try_new_match;
+ }
+ }
+ haystack += fz_chartorune(&c, haystack);
+ }
+ assert(*haystack == '\n');
+ ++haystack;
+ }
+ assert(*haystack == '\n');
+ ++haystack;
+ }
+no_more_matches:;
+ }
+ fz_always(ctx)
+ fz_drop_buffer(ctx, buffer);
+ fz_catch(ctx)
+ fz_rethrow(ctx);
+
+ return hits.len;
+}