summary | refs | log | tree | commit | diff
path: root/source/fitz
diff options
context:
space:
mode:
author Tor Andersson <tor.andersson@artifex.com> 2017-11-07 11:51:40 +0100
committer Tor Andersson <tor.andersson@artifex.com> 2017-11-08 17:57:09 +0100
commit a0f531e2492bd6d19edab9ca2e9f19f2284b0796 (patch)
tree 0f8fcd1ab547df519dff0302fe88fe90214a9bbb /source/fitz
parent 44397fc9cbc795ec5acabcc4da1b08608431313b (diff)
download mupdf-a0f531e2492bd6d19edab9ca2e9f19f2284b0796.tar.xz
Select and copy structured text by lines.
Diffstat (limited to 'source/fitz')
-rw-r--r-- source/fitz/stext-device.c 13
-rw-r--r-- source/fitz/stext-output.c 2
-rw-r--r-- source/fitz/stext-search.c 320
3 files changed, 275 insertions, 60 deletions
diff --git a/source/fitz/stext-device.c b/source/fitz/stext-device.c
index cae3c583..cd657acc 100644
--- a/source/fitz/stext-device.c
+++ b/source/fitz/stext-device.c
@@ -643,14 +643,13 @@ fz_stext_close_device(fz_context *ctx, fz_device *dev)
for (block = page->first_block; block; block = block->next)
{
- if (block->type == FZ_STEXT_BLOCK_TEXT)
+ if (block->type != FZ_STEXT_BLOCK_TEXT)
+ continue;
+ for (line = block->u.t.first_line; line; line = line->next)
{
- for (line = block->u.t.first_line; line; line = line->next)
- {
- for (ch = line->first_char; ch; ch = ch->next)
- fz_union_rect(&line->bbox, &ch->bbox);
- fz_union_rect(&block->bbox, &line->bbox);
- }
+ for (ch = line->first_char; ch; ch = ch->next)
+ fz_union_rect(&line->bbox, &ch->bbox);
+ fz_union_rect(&block->bbox, &line->bbox);
}
}
diff --git a/source/fitz/stext-output.c b/source/fitz/stext-output.c
index 6a23cc6d..492885cb 100644
--- a/source/fitz/stext-output.c
+++ b/source/fitz/stext-output.c
@@ -244,8 +244,6 @@ static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_st
for (line = block->u.t.first_line; line; line = line->next)
{
- if (line != block->u.t.first_line)
- fz_write_string(ctx, out, "\n");
for (ch = line->first_char; ch; ch = ch->next)
{
int ch_sup = detect_super_script(line, ch);
diff --git a/source/fitz/stext-search.c b/source/fitz/stext-search.c
index c8fbdc0b..b2247b33 100644
--- a/source/fitz/stext-search.c
+++ b/source/fitz/stext-search.c
@@ -1,6 +1,8 @@
#include "mupdf/fitz.h"
#include <string.h>
+#include <limits.h>
+#include <stdio.h>
static inline int fz_tolower(int c)
{
@@ -159,21 +161,68 @@ fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, f
return hit_count;
}
-int
-fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max)
+static float dist2(float a, float b)
+{
+ return a * a + b * b;
+}
+
+static int line_length(fz_stext_line *line)
{
- fz_rect linebox;
- fz_stext_block *block;
- fz_stext_line *line;
fz_stext_char *ch;
- int hit_count;
+ int n = 0;
+ for (ch = line->first_char; ch; ch = ch->next)
+ ++n;
+ return n;
+}
- float x0 = rect.x0;
- float x1 = rect.x1;
- float y0 = rect.y0;
- float y1 = rect.y1;
+static int find_closest_in_line(fz_stext_line *line, int idx, fz_point p)
+{
+ fz_stext_char *ch;
+ float closest_dist = 1e30;
+ int closest_idx = idx;
- hit_count = 0;
+ if (line->dir.x > line->dir.y)
+ {
+ if (p.y < line->bbox.y0)
+ return idx;
+ if (p.y > line->bbox.y1)
+ return idx + line_length(line);
+ }
+ else
+ {
+ if (p.x < line->bbox.x0)
+ return idx + line_length(line);
+ if (p.x > line->bbox.x1)
+ return idx;
+ }
+
+ for (ch = line->first_char; ch; ch = ch->next)
+ {
+ float mid_x = (ch->bbox.x0 + ch->bbox.x1) / 2;
+ float mid_y = (ch->bbox.y0 + ch->bbox.y1) / 2;
+ float this_dist = dist2(p.x - mid_x, p.y - mid_y);
+ if (this_dist < closest_dist)
+ {
+ closest_dist = this_dist;
+ if (line->dir.x > line->dir.y)
+ closest_idx = (p.x < mid_x) ? idx : idx+1;
+ else
+ closest_idx = (p.y < mid_y) ? idx : idx+1;
+ }
+ ++idx;
+ }
+ return closest_idx;
+}
+
+static int find_closest_in_page(fz_stext_page *page, fz_point p)
+{
+ fz_stext_block *block;
+ fz_stext_line *line;
+ fz_stext_line *closest_line = NULL;
+ int closest_idx = 0;
+ float closest_dist = 1e30;
+ float this_dist;
+ int idx = 0;
for (block = page->first_block; block; block = block->next)
{
@@ -181,77 +230,246 @@ fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_re
continue;
for (line = block->u.t.first_line; line; line = line->next)
{
- linebox = fz_empty_rect;
- for (ch = line->first_char; ch; ch = ch->next)
+ fz_rect box = line->bbox;
+ if (p.x >= box.x0 && p.x <= box.x1)
{
- if (ch->bbox.x1 >= x0 && ch->bbox.x0 <= x1 && ch->bbox.y1 >= y0 && ch->bbox.y0 <= y1)
- {
- if (ch->bbox.y0 != linebox.y0 || fz_abs(ch->bbox.x0 - linebox.x1) > 5)
- {
- if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
- hit_bbox[hit_count++] = linebox;
- linebox = ch->bbox;
- }
- else
- {
- fz_union_rect(&linebox, &ch->bbox);
- }
- }
+ if (p.y < box.y0)
+ this_dist = dist2(box.y0 - p.y, 0);
+ else if (p.y > box.y1)
+ this_dist = dist2(p.y - box.y1, 0);
+ else
+ this_dist = 0;
}
- if (!fz_is_empty_rect(&linebox) && hit_count < hit_max)
- hit_bbox[hit_count++] = linebox;
+ else if (p.y >= box.y0 && p.y <= box.y1)
+ {
+ if (p.x < box.x0)
+ this_dist = dist2(box.x0 - p.x, 0);
+ else if (p.x > box.x1)
+ this_dist = dist2(p.x - box.x1, 0);
+ else
+ this_dist = 0;
+ }
+ else
+ {
+ float dul = dist2(p.x - box.x0, p.y - box.y0);
+ float dur = dist2(p.x - box.x1, p.y - box.y0);
+ float dll = dist2(p.x - box.x0, p.y - box.y1);
+ float dlr = dist2(p.x - box.x1, p.y - box.y1);
+ this_dist = fz_min(fz_min(dul, dur), fz_min(dll, dlr));
+ }
+ if (this_dist < closest_dist)
+ {
+ closest_dist = this_dist;
+ closest_line = line;
+ closest_idx = idx;
+ }
+ idx += line_length(line);
}
}
- return hit_count;
+ if (closest_line)
+ return find_closest_in_line(closest_line, closest_idx, p);
+ return 0;
}
-char *
-fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect)
+struct callbacks
+{
+ void (*on_char)(fz_context *ctx, void *arg, fz_stext_line *ln, fz_stext_char *ch);
+ void (*on_line)(fz_context *ctx, void *arg, fz_stext_line *ln);
+ void *arg;
+};
+
+static void
+fz_enumerate_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, struct callbacks *cb)
{
- fz_buffer *buffer;
- int c, seen = 0;
- unsigned char *s;
fz_stext_block *block;
fz_stext_line *line;
fz_stext_char *ch;
+ int idx, start, end;
+ int hit_count;
+ int inside;
- float x0 = rect.x0;
- float x1 = rect.x1;
- float y0 = rect.y0;
- float y1 = rect.y1;
+ start = find_closest_in_page(page, a);
+ end = find_closest_in_page(page, b);
- buffer = fz_new_buffer(ctx, 1024);
+ if (start > end)
+ idx = start, start = end, end = idx;
+
+ if (start == end)
+ return;
+ hit_count = 0;
+ inside = 0;
+ idx = 0;
for (block = page->first_block; block; block = block->next)
{
if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
for (line = block->u.t.first_line; line; line = line->next)
{
- if (seen)
+ for (ch = line->first_char; ch; ch = ch->next)
{
- fz_append_byte(ctx, buffer, '\n');
+ if (!inside)
+ if (idx == start)
+ inside = 1;
+ if (inside)
+ cb->on_char(ctx, cb->arg, line, ch);
+ if (++idx == end)
+ return;
}
+ if (inside)
+ cb->on_line(ctx, cb->arg, line);
+ }
+ }
+}
- seen = 0;
+struct highlight
+{
+ int len, cap;
+ fz_rect *box;
+};
- for (ch = line->first_char; ch; ch = ch->next)
+static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
+{
+ struct highlight *hits = arg;
+ float vfuzz = ch->size * 0.1f;
+ float hfuzz = ch->size * 0.5f;
+ fz_rect bbox;
+
+ if (line->dir.x > line->dir.y)
+ {
+ bbox.x0 = ch->bbox.x0;
+ bbox.x1 = ch->bbox.x1;
+ bbox.y0 = line->bbox.y0;
+ bbox.y1 = line->bbox.y1;
+ }
+ else
+ {
+ bbox.x0 = line->bbox.x0;
+ bbox.x1 = line->bbox.x1;
+ bbox.y0 = ch->bbox.y0;
+ bbox.y1 = ch->bbox.y1;
+ }
+
+ if (hits->len > 0)
+ {
+ fz_rect *end = &hits->box[hits->len-1];
+ if (fz_abs(bbox.y0 - end->y0) < vfuzz && fz_abs(bbox.y1 - end->y1) < vfuzz)
+ {
+ if (bbox.x1 < end->x0)
{
- c = ch->c;
- if (c < 32)
- c = FZ_REPLACEMENT_CHARACTER;
- if (ch->bbox.x1 >= x0 && ch->bbox.x0 <= x1 && ch->bbox.y1 >= y0 && ch->bbox.y0 <= y1)
+ if (end->x0 - bbox.x1 < hfuzz)
{
- fz_append_rune(ctx, buffer, c);
- seen = 1;
+ end->x0 = bbox.x0;
+ return;
}
}
-
- seen = (seen && line == block->u.t.last_line);
+ else if (bbox.x0 > end->x1)
+ {
+ if (bbox.x0 - end->x1 < hfuzz)
+ {
+ end->x1 = bbox.x1;
+ return;
+ }
+ }
+ else
+ {
+ end->x0 = fz_min(bbox.x0, end->x0);
+ end->x1 = fz_max(bbox.x1, end->x1);
+ return;
+ }
+ }
+ if (fz_abs(bbox.x0 - end->x0) < vfuzz && fz_abs(bbox.x1 - end->x1) < vfuzz)
+ {
+ if (bbox.y1 < end->y0)
+ {
+ if (end->y0 - bbox.y1 < hfuzz)
+ {
+ end->y0 = bbox.y0;
+ return;
+ }
+ }
+ else if (bbox.y0 > end->y1)
+ {
+ if (bbox.y0 - end->y1 < hfuzz)
+ {
+ end->y1 = bbox.y1;
+ return;
+ }
+ }
+ else
+ {
+ end->y0 = fz_min(bbox.y0, end->y0);
+ end->y1 = fz_max(bbox.y1, end->y1);
+ return;
+ }
}
}
+ if (hits->len < hits->cap)
+ hits->box[hits->len++] = bbox;
+}
+
+static void on_highlight_line(fz_context *ctx, void *arg, fz_stext_line *line)
+{
+}
+
+int
+fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_rect *hit_bbox, int hit_max)
+{
+ struct callbacks cb;
+ struct highlight hits;
+
+ hits.len = 0;
+ hits.cap = hit_max;
+ hits.box = hit_bbox;
+
+ cb.on_char = on_highlight_char;
+ cb.on_line = on_highlight_line;
+ cb.arg = &hits;
+
+ fz_enumerate_selection(ctx, page, a, b, &cb);
+
+ return hits.len;
+}
+
+static void on_copy_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
+{
+ fz_buffer *buffer = arg;
+ int c = ch->c;
+ if (c < 32)
+ c = FZ_REPLACEMENT_CHARACTER;
+ fz_append_rune(ctx, buffer, c);
+}
+
+static void on_copy_line_crlf(fz_context *ctx, void *arg, fz_stext_line *line)
+{
+ fz_buffer *buffer = arg;
+ fz_append_byte(ctx, buffer, '\r');
+ fz_append_byte(ctx, buffer, '\n');
+}
+
+static void on_copy_line_lf(fz_context *ctx, void *arg, fz_stext_line *line)
+{
+ fz_buffer *buffer = arg;
+ fz_append_byte(ctx, buffer, '\n');
+}
+
+char *
+fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf)
+{
+ struct callbacks cb;
+ fz_buffer *buffer;
+ unsigned char *s;
+
+ buffer = fz_new_buffer(ctx, 1024);
+
+ cb.on_char = on_copy_char;
+ cb.on_line = crlf ? on_copy_line_crlf : on_copy_line_lf;
+ cb.arg = buffer;
+
+ fz_enumerate_selection(ctx, page, a, b, &cb);
+
fz_terminate_buffer(ctx, buffer);
fz_buffer_extract(ctx, buffer, &s); /* take over the data */
fz_drop_buffer(ctx, buffer);