summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tor Andersson <tor.andersson@artifex.com> 2013-06-12 16:28:14 +0200
committer Tor Andersson <tor.andersson@artifex.com> 2013-06-12 16:28:14 +0200
commit b86aa63432a2436563bdcc398577ec4884883434 (patch)
tree 749002e3de9d52943898a47f8c57d34c6a09b188
parent b975f1b82a061db61124d1cf0cd55ab60c22dc8e (diff)
download mupdf-b86aa63432a2436563bdcc398577ec4884883434.tar.xz
Support begin/end page calls in text extraction device.
-rw-r--r-- android/jni/mupdf.c 16
-rw-r--r-- apps/mudraw.c 3
-rw-r--r-- apps/pdfapp.c 2
-rw-r--r-- fitz/fitz.h 3
-rw-r--r-- fitz/text_extract.c 61
-rw-r--r-- fitz/text_output.c 5
-rw-r--r-- ios/main.m 2
-rw-r--r-- winrt/mupdfwinrt/muctx.cpp 4
8 files changed, 53 insertions, 43 deletions
diff --git a/android/jni/mupdf.c b/android/jni/mupdf.c
index 8edc8c41..f436a65d 100644
--- a/android/jni/mupdf.c
+++ b/android/jni/mupdf.c
@@ -1104,17 +1104,13 @@ JNI_FN(MuPDFCore_searchPage)(JNIEnv * env, jobject thiz, jstring jtext)
fz_try(ctx)
{
- fz_rect mbrect;
-
if (glo->hit_bbox == NULL)
glo->hit_bbox = fz_malloc_array(ctx, MAX_SEARCH_HITS, sizeof(*glo->hit_bbox));
zoom = glo->resolution / 72;
fz_scale(&ctm, zoom, zoom);
- mbrect = pc->media_box;
- fz_transform_rect(&mbrect, &ctm);
sheet = fz_new_text_sheet(ctx);
- text = fz_new_text_page(ctx, &mbrect);
+ text = fz_new_text_page(ctx);
dev = fz_new_text_device(ctx, sheet, text);
fz_run_page(doc, pc->page, dev, &ctm, NULL);
fz_free_device(dev);
@@ -1199,15 +1195,12 @@ JNI_FN(MuPDFCore_text)(JNIEnv * env, jobject thiz)
fz_try(ctx)
{
- fz_rect mbrect;
int b, l, s, c;
zoom = glo->resolution / 72;
fz_scale(&ctm, zoom, zoom);
- mbrect = pc->media_box;
- fz_transform_rect(&mbrect, &ctm);
sheet = fz_new_text_sheet(ctx);
- text = fz_new_text_page(ctx, &mbrect);
+ text = fz_new_text_page(ctx);
dev = fz_new_text_device(ctx, sheet, text);
fz_run_page(doc, pc->page, dev, &ctm, NULL);
fz_free_device(dev);
@@ -1309,14 +1302,11 @@ JNI_FN(MuPDFCore_textAsHtml)(JNIEnv * env, jobject thiz)
fz_try(ctx)
{
- fz_rect mbrect;
int b, l, s, c;
ctm = fz_identity;
- mbrect = pc->media_box;
- fz_transform_rect(&mbrect, &ctm);
sheet = fz_new_text_sheet(ctx);
- text = fz_new_text_page(ctx, &mbrect);
+ text = fz_new_text_page(ctx);
dev = fz_new_text_device(ctx, sheet, text);
fz_run_page(doc, pc->page, dev, &ctm, NULL);
fz_free_device(dev);
diff --git a/apps/mudraw.c b/apps/mudraw.c
index e94b692d..38ac47d2 100644
--- a/apps/mudraw.c
+++ b/apps/mudraw.c
@@ -426,8 +426,7 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum)
fz_try(ctx)
{
- fz_rect bounds;
- text = fz_new_text_page(ctx, fz_bound_page(doc, page, &bounds));
+ text = fz_new_text_page(ctx);
dev = fz_new_text_device(ctx, sheet, text);
if (showtext == TEXT_HTML)
fz_disable_device_hints(dev, FZ_IGNORE_IMAGE);
diff --git a/apps/pdfapp.c b/apps/pdfapp.c
index 50ecaf6f..6e6e5920 100644
--- a/apps/pdfapp.c
+++ b/apps/pdfapp.c
@@ -601,7 +601,7 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai
/* Extract text */
app->page_sheet = fz_new_text_sheet(app->ctx);
- app->page_text = fz_new_text_page(app->ctx, &app->page_bbox);
+ app->page_text = fz_new_text_page(app->ctx);
if (app->page_list || app->annotations_list)
{
diff --git a/fitz/fitz.h b/fitz/fitz.h
index 6a7a0a12..1dc12ff9 100644
--- a/fitz/fitz.h
+++ b/fitz/fitz.h
@@ -2100,6 +2100,7 @@ struct fz_text_page_s
fz_rect mediabox;
int len, cap;
fz_page_block *blocks;
+ fz_text_page *next;
};
/*
@@ -2267,7 +2268,7 @@ void fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet);
The text page is filled out by the text device to contain the blocks,
lines and spans of text on the page.
*/
-fz_text_page *fz_new_text_page(fz_context *ctx, const fz_rect *mediabox);
+fz_text_page *fz_new_text_page(fz_context *ctx);
void fz_free_text_page(fz_context *ctx, fz_text_page *page);
void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page);
diff --git a/fitz/text_extract.c b/fitz/text_extract.c
index 45e0395a..bf50e6a4 100644
--- a/fitz/text_extract.c
+++ b/fitz/text_extract.c
@@ -429,13 +429,14 @@ fz_lookup_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, const
}
fz_text_page *
-fz_new_text_page(fz_context *ctx, const fz_rect *mediabox)
+fz_new_text_page(fz_context *ctx)
{
fz_text_page *page = fz_malloc(ctx, sizeof(*page));
- page->mediabox = *mediabox;
+ page->mediabox = fz_empty_rect;
page->len = 0;
page->cap = 0;
page->blocks = NULL;
+ page->next = NULL;
return page;
}
@@ -946,34 +947,48 @@ fz_bidi_reorder_text_page(fz_context *ctx, fz_text_page *page)
}
static void
-fz_text_free_user(fz_device *dev)
+fz_text_begin_page(fz_device *dev, const fz_rect *mediabox, const fz_matrix *ctm)
{
fz_context *ctx = dev->ctx;
fz_text_device *tdev = dev->user;
- fz_try(ctx)
+ if (tdev->page->len)
{
+ tdev->page->next = fz_new_text_page(ctx);
+ tdev->page = tdev->page->next;
+ }
- add_span_to_soup(tdev->spans, tdev->cur_span);
- tdev->cur_span = NULL;
+ tdev->page->mediabox = *mediabox;
+ fz_transform_rect(&tdev->page->mediabox, ctm);
- strain_soup(ctx, tdev);
+ tdev->spans = new_span_soup(ctx);
+}
- /* TODO: smart sorting of blocks in reading order */
- /* TODO: unicode NFC normalization */
+static void
+fz_text_end_page(fz_device *dev)
+{
+ fz_context *ctx = dev->ctx;
+ fz_text_device *tdev = dev->user;
- fz_bidi_reorder_text_page(ctx, tdev->page);
- }
- fz_always(ctx)
- {
- free_span_soup(tdev->spans);
- fz_free(dev->ctx, tdev);
- }
- fz_catch(ctx)
- {
- /* TODO: mark fz_free_device as "doesn't throw" (else rethrowing would
- have to be caught/rethrown again in fz_free_device) */
- }
+ add_span_to_soup(tdev->spans, tdev->cur_span);
+ tdev->cur_span = NULL;
+
+ strain_soup(ctx, tdev);
+ free_span_soup(tdev->spans);
+ tdev->spans = NULL;
+
+ /* TODO: smart sorting of blocks in reading order */
+ /* TODO: unicode NFC normalization */
+
+ fz_bidi_reorder_text_page(ctx, tdev->page);
+}
+
+static void
+fz_text_free_user(fz_device *dev)
+{
+ fz_text_device *tdev = dev->user;
+ free_span_soup(tdev->spans);
+ fz_free(dev->ctx, tdev);
}
fz_device *
@@ -984,12 +999,14 @@ fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device);
tdev->sheet = sheet;
tdev->page = page;
- tdev->spans = new_span_soup(ctx);
+ tdev->spans = NULL;
tdev->cur_span = NULL;
tdev->lastchar = ' ';
dev = fz_new_device(ctx, tdev);
dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE;
+ dev->begin_page = fz_text_begin_page;
+ dev->end_page = fz_text_end_page;
dev->free_user = fz_text_free_user;
dev->fill_text = fz_text_fill_text;
dev->stroke_text = fz_text_stroke_text;
diff --git a/fitz/text_output.c b/fitz/text_output.c
index 6942eb67..c4af4160 100644
--- a/fitz/text_output.c
+++ b/fitz/text_output.c
@@ -279,7 +279,10 @@ fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page)
{
int block_n;
- fz_printf(out, "<page>\n");
+ fz_printf(out, "<page width=\"%g\" height=\"%g\">\n",
+ page->mediabox.x1 - page->mediabox.x0,
+ page->mediabox.y1 - page->mediabox.y0);
+
for (block_n = 0; block_n < page->len; block_n++)
{
switch (page->blocks[block_n].type)
diff --git a/ios/main.m b/ios/main.m
index d1ca130e..5668d415 100644
--- a/ios/main.m
+++ b/ios/main.m
@@ -133,7 +133,7 @@ search_page(fz_document *doc, int number, char *needle, fz_cookie *cookie)
fz_page *page = fz_load_page(doc, number);
fz_text_sheet *sheet = fz_new_text_sheet(ctx);
- fz_text_page *text = fz_new_text_page(ctx, &fz_empty_rect);
+ fz_text_page *text = fz_new_text_page(ctx);
fz_device *dev = fz_new_text_device(ctx, sheet, text);
fz_run_page(doc, page, dev, &fz_identity, cookie);
fz_free_device(dev);
diff --git a/winrt/mupdfwinrt/muctx.cpp b/winrt/mupdfwinrt/muctx.cpp
index 9821039d..63f008cf 100644
--- a/winrt/mupdfwinrt/muctx.cpp
+++ b/winrt/mupdfwinrt/muctx.cpp
@@ -297,7 +297,7 @@ int muctx::GetTextSearch(int page_num, char* needle, sh_vector_text texts_vec)
{
page = fz_load_page(mu_doc, page_num);
sheet = fz_new_text_sheet(ctx_clone);
- text = fz_new_text_page(ctx_clone, &fz_empty_rect);
+ text = fz_new_text_page(ctx_clone);
dev = fz_new_text_device(ctx_clone, sheet, text);
fz_run_page(mu_doc, page, dev, &fz_identity, NULL);
fz_free_device(dev); /* Why does this need to be done here? Seems odd */
@@ -483,7 +483,7 @@ String^ muctx::GetHTML(int page_num)
{
page = fz_load_page(mu_doc, page_num);
sheet = fz_new_text_sheet(ctx_clone);
- text = fz_new_text_page(ctx_clone, &fz_empty_rect);
+ text = fz_new_text_page(ctx_clone);
dev = fz_new_text_device(ctx_clone, sheet, text);
fz_run_page(mu_doc, page, dev, &fz_identity, NULL);
fz_free_device(dev);