summaryrefslogtreecommitdiff
path: root/apps
diff options
context:
space:
mode:
author: Tor Andersson <tor.andersson@artifex.com> 2012-03-06 17:41:04 +0100
committer: Tor Andersson <tor.andersson@artifex.com> 2012-03-12 18:43:02 +0100
commit: 0b0e2af392428b5dbc88d6fbd2f6b5181e85165f (patch)
tree: 9668c284b68b912c103f0778ff298aafb4d6453b /apps
parent: bc1c06ddd335f899025470dde7e839a82d792972 (diff)
download: mupdf-0b0e2af392428b5dbc88d6fbd2f6b5181e85165f.tar.xz
Create style sheet and group extracted text into blocks, lines and spans.
Diffstat (limited to 'apps')
-rw-r--r-- apps/mudraw.c 58
-rw-r--r-- apps/pdfapp.c 160
-rw-r--r-- apps/pdfapp.h 3
-rw-r--r-- apps/x11_main.c 2
4 files changed, 134 insertions, 89 deletions
diff --git a/apps/mudraw.c b/apps/mudraw.c
index 19f359a1..31847665 100644
--- a/apps/mudraw.c
+++ b/apps/mudraw.c
@@ -10,6 +10,8 @@
#include <sys/time.h>
#endif
+enum { TEXT_PLAIN = 1, TEXT_HTML = 2, TEXT_XML = 3 }; /* text output formats; showtext is compared against these to pick the printer */
+
static char *output = NULL;
static float resolution = 72;
static float rotation = 0;
@@ -28,6 +30,7 @@ static int width = 0;
static int height = 0;
static int fit = 0;
+static fz_text_sheet *sheet = NULL;
static fz_colorspace *colorspace;
static char *filename;
@@ -157,42 +160,43 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum)
if (showtext)
{
- fz_text_span *text = NULL;
+ fz_text_page *text = NULL;
fz_var(text);
fz_try(ctx)
{
- text = fz_new_text_span(ctx);
- dev = fz_new_text_device(ctx, text);
+ text = fz_new_text_page(ctx, fz_bound_page(doc, page));
+ dev = fz_new_text_device(ctx, sheet, text);
if (list)
fz_run_display_list(list, dev, fz_identity, fz_infinite_bbox, NULL);
else
fz_run_page(doc, page, dev, fz_identity, NULL);
fz_free_device(dev);
dev = NULL;
- if (showtext > 1)
+ if (showtext == TEXT_XML)
{
- printf("<page number=\"%d\">\n", pagenum);
- fz_debug_text_span_xml(text);
- printf("</page>\n");
+ fz_print_text_page_xml(stdout, text);
}
- else
+ else if (showtext == TEXT_HTML)
+ {
+ fz_print_text_page_html(stdout, text);
+ }
+ else if (showtext == TEXT_PLAIN)
{
- printf("[Page %d]\n", pagenum);
- fz_debug_text_span(text);
+ fz_print_text_page(stdout, text);
+ printf("\f\n");
}
- printf("\n");
}
fz_catch(ctx)
{
fz_free_device(dev);
- fz_free_text_span(ctx, text);
+ fz_free_text_page(ctx, text);
fz_free_display_list(ctx, list);
fz_free_page(doc, page);
fz_rethrow(ctx);
}
- fz_free_text_span(ctx, text);
+ fz_free_text_page(ctx, text);
}
if (showmd5 || showtime)
@@ -456,9 +460,23 @@ int main(int argc, char **argv)
timing.minpage = 0;
timing.maxpage = 0;
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("<?xml version=\"1.0\"?>\n");
+ if (showtext)
+ sheet = fz_new_text_sheet(ctx);
+
+ if (showtext == TEXT_HTML)
+ {
+ /* Static layout rules for the HTML text output; all lengths use the CSS "pt" unit. */
+ printf("<style>\n");
+ printf("body{background-color:gray;margin:12pt;}\n");
+ printf("div.page{background-color:white;margin:6pt;padding:6pt;}\n");
+ printf("div.block{border:1px solid gray;margin:6pt;padding:6pt;}\n");
+ printf("p{margin:0;padding:0;}\n");
+ printf("</style>\n");
+ printf("<body>\n");
+ }
+
fz_try(ctx)
{
while (fz_optind < argc)
@@ -478,7 +496,7 @@ int main(int argc, char **argv)
if (!fz_authenticate_password(doc, password))
fz_throw(ctx, "cannot authenticate password: %s", filename);
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("<document name=\"%s\">\n", filename);
if (showoutline)
@@ -492,7 +510,7 @@ int main(int argc, char **argv)
drawrange(ctx, doc, argv[fz_optind++]);
}
- if (showxml || showtext > 1)
+ if (showxml || showtext == TEXT_XML)
printf("</document>\n");
fz_close_document(doc);
@@ -504,6 +522,14 @@ int main(int argc, char **argv)
fz_close_document(doc);
}
+ if (showtext == TEXT_HTML)
+ {
+ printf("</body>\n");
+ printf("<style>\n");
+ fz_print_text_sheet(stdout, sheet);
+ printf("</style>\n");
+ }
+
if (showtime)
{
printf("total %dms / %d pages for an average of %dms\n",
diff --git a/apps/pdfapp.c b/apps/pdfapp.c
index 4f093508..e5742645 100644
--- a/apps/pdfapp.c
+++ b/apps/pdfapp.c
@@ -157,7 +157,7 @@ void pdfapp_close(pdfapp_t *app)
app->page_list = NULL;
if (app->page_text)
- fz_free_text_span(app->ctx, app->page_text);
+ fz_free_text_page(app->ctx, app->page_text);
app->page_text = NULL;
if (app->page_links)
@@ -228,7 +228,7 @@ static void pdfapp_loadpage(pdfapp_t *app)
if (app->page_list)
fz_free_display_list(app->ctx, app->page_list);
if (app->page_text)
- fz_free_text_span(app->ctx, app->page_text);
+ fz_free_text_page(app->ctx, app->page_text);
if (app->page_links)
fz_drop_link(app->ctx, app->page_links);
if (app->page)
@@ -273,8 +273,9 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai
app->hitlen = 0;
/* Extract text */
- app->page_text = fz_new_text_span(app->ctx);
- tdev = fz_new_text_device(app->ctx, app->page_text);
+ app->page_sheet = fz_new_text_sheet(app->ctx);
+ app->page_text = fz_new_text_page(app->ctx, app->page_bbox);
+ tdev = fz_new_text_device(app->ctx, app->page_sheet, app->page_text);
fz_run_display_list(app->page_list, tdev, fz_identity, fz_infinite_bbox, NULL);
fz_free_device(tdev);
}
@@ -354,23 +355,61 @@ static void pdfapp_gotopage(pdfapp_t *app, int number)
pdfapp_showpage(app, 1, 1, 1);
}
-static inline fz_bbox bboxcharat(fz_text_span *span, int idx)
+/*
+ * Return the character at flat index idx of the page, counting through
+ * every span of every line of every block in order.
+ *
+ * After the last span of each line there is one pseudo-newline position,
+ * reported as a space with an all-zero bbox.
+ *
+ * Any index outside the text (negative, or past the final pseudo-newline)
+ * yields a character with c == 0, as the span-list charat() it replaces
+ * did. It must not be a space: match() skips runs of ' ' with
+ * "while (charat(page, n) == ' ') n++", which would never stop at the end
+ * of the page otherwise.
+ */
+static fz_text_char textcharat(fz_text_page *page, int idx)
{
+ static fz_text_char emptychar = { {0,0,0,0}, ' ' };
+ static fz_text_char endchar = { {0,0,0,0}, 0 };
+ fz_text_block *block;
+ fz_text_line *line;
+ fz_text_span *span;
int ofs = 0;
+ /* a negative index would read before span->text below */
+ if (idx < 0)
+ return endchar;
- while (span)
+ for (block = page->blocks; block < page->blocks + page->len; block++)
{
- if (idx < ofs + span->len)
- return span->text[idx - ofs].bbox;
- if (span->eol)
+ for (line = block->lines; line < block->lines + block->len; line++)
{
- if (idx == ofs + span->len)
- return fz_empty_bbox;
- ofs ++;
+ for (span = line->spans; span < line->spans + line->len; span++)
+ {
+ if (idx < ofs + span->len)
+ return span->text[idx - ofs];
+ /* pseudo-newline */
+ if (span + 1 == line->spans + line->len)
+ {
+ if (idx == ofs + span->len)
+ return emptychar;
+ ofs++;
+ }
+ ofs += span->len;
+ }
}
- ofs += span->len;
- span = span->next;
}
- return fz_empty_bbox;
+ /* idx is past the end of the text: report end-of-text, not a space */
+ return endchar;
+}
+
+/*
+ * Count the flat character positions on a text page: the sum of every
+ * span's len, plus one pseudo-newline for each line of each block.
+ */
+static int textlen(fz_text_page *page)
+{
+	int total = 0;
+	int b, l, s;
+
+	for (b = 0; b < page->len; b++)
+	{
+		fz_text_block *block = &page->blocks[b];
+		for (l = 0; l < block->len; l++)
+		{
+			fz_text_line *line = &block->lines[l];
+			for (s = 0; s < line->len; s++)
+				total += line->spans[s].len;
+			total += 1; /* pseudo-newline */
+		}
+	}
+	return total;
+}
+
+/* Return just the character code (the c member) found at flat index idx. */
+static inline int charat(fz_text_page *page, int idx)
+{
+	fz_text_char found = textcharat(page, idx);
+	return found.c;
+}
+
+/* Return the bbox of the character at flat index idx, passed through fz_round_rect. */
+static inline fz_bbox bboxcharat(fz_text_page *page, int idx)
+{
+	fz_text_char found = textcharat(page, idx);
+	return fz_round_rect(found.bbox);
}
void pdfapp_inverthit(pdfapp_t *app)
@@ -404,52 +443,20 @@ void pdfapp_inverthit(pdfapp_t *app)
pdfapp_invert(app, fz_transform_bbox(ctm, hitbox));
}
-static inline int charat(fz_text_span *span, int idx)
-{
- int ofs = 0;
- while (span)
- {
- if (idx < ofs + span->len)
- return span->text[idx - ofs].c;
- if (span->eol)
- {
- if (idx == ofs + span->len)
- return ' ';
- ofs ++;
- }
- ofs += span->len;
- span = span->next;
- }
- return 0;
-}
-
-static int textlen(fz_text_span *span)
-{
- int len = 0;
- while (span)
- {
- len += span->len;
- if (span->eol)
- len ++;
- span = span->next;
- }
- return len;
-}
-
-static int match(char *s, fz_text_span *span, int n)
+static int match(char *s, fz_text_page *page, int n)
{
int orig = n;
int c;
while ((c = *s++))
{
- if (c == ' ' && charat(span, n) == ' ')
+ if (c == ' ' && charat(page, n) == ' ')
{
- while (charat(span, n) == ' ')
+ while (charat(page, n) == ' ')
n++;
}
else
{
- if (tolower(c) != tolower(charat(span, n)))
+ if (tolower(c) != tolower(charat(page, n)))
return 0;
n++;
}
@@ -1067,6 +1074,9 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
{
fz_bbox hitbox;
fz_matrix ctm;
+ fz_text_page *page = app->page_text;
+ fz_text_block *block;
+ fz_text_line *line;
fz_text_span *span;
int c, i, p;
int seen;
@@ -1079,32 +1089,40 @@ void pdfapp_oncopy(pdfapp_t *app, unsigned short *ucsbuf, int ucslen)
ctm = pdfapp_viewctm(app);
p = 0;
- for (span = app->page_text; span; span = span->next)
- {
- seen = 0;
- for (i = 0; i < span->len; i++)
+ for (block = page->blocks; block < page->blocks + page->len; block++)
+ {
+ for (line = block->lines; line < block->lines + block->len; line++)
{
- hitbox = fz_transform_bbox(ctm, span->text[i].bbox);
- c = span->text[i].c;
- if (c < 32)
- c = '?';
- if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ for (span = line->spans; span < line->spans + line->len; span++)
{
- if (p < ucslen - 1)
- ucsbuf[p++] = c;
- seen = 1;
- }
- }
+ seen = 0;
- if (seen && span->eol)
- {
+ for (i = 0; i < span->len; i++)
+ {
+ hitbox = fz_round_rect(span->text[i].bbox);
+ hitbox = fz_transform_bbox(ctm, hitbox);
+ c = span->text[i].c;
+ if (c < 32)
+ c = '?';
+ if (hitbox.x1 >= x0 && hitbox.x0 <= x1 && hitbox.y1 >= y0 && hitbox.y0 <= y1)
+ {
+ if (p < ucslen - 1)
+ ucsbuf[p++] = c;
+ seen = 1;
+ }
+ }
+
+ if (seen && span + 1 == line->spans + line->len)
+ {
#ifdef _WIN32
- if (p < ucslen - 1)
- ucsbuf[p++] = '\r';
+ if (p < ucslen - 1)
+ ucsbuf[p++] = '\r';
#endif
- if (p < ucslen - 1)
- ucsbuf[p++] = '\n';
+ if (p < ucslen - 1)
+ ucsbuf[p++] = '\n';
+ }
+ }
}
}
diff --git a/apps/pdfapp.h b/apps/pdfapp.h
index 0c1b6ac4..db83335f 100644
--- a/apps/pdfapp.h
+++ b/apps/pdfapp.h
@@ -52,7 +52,8 @@ struct pdfapp_s
fz_page *page;
fz_rect page_bbox;
fz_display_list *page_list;
- fz_text_span *page_text;
+ fz_text_page *page_text;
+ fz_text_sheet *page_sheet;
fz_link *page_links;
/* snapback history */
diff --git a/apps/x11_main.c b/apps/x11_main.c
index 091f0ec6..fe0196fc 100644
--- a/apps/x11_main.c
+++ b/apps/x11_main.c
@@ -466,7 +466,7 @@ void windocopy(pdfapp_t *app)
{
ucs = ucs2[0];
- utf8 += runetochar(utf8, &ucs);
+ utf8 += fz_runetochar(utf8, ucs);
if (ucs < 256)
*latin1++ = ucs;