diff options
author | Tor Andersson <tor.andersson@artifex.com> | 2013-06-12 16:28:14 +0200 |
---|---|---|
committer | Tor Andersson <tor.andersson@artifex.com> | 2013-06-12 16:28:14 +0200 |
commit | b86aa63432a2436563bdcc398577ec4884883434 (patch) | |
tree | 749002e3de9d52943898a47f8c57d34c6a09b188 | |
parent | b975f1b82a061db61124d1cf0cd55ab60c22dc8e (diff) | |
download | mupdf-b86aa63432a2436563bdcc398577ec4884883434.tar.xz |
Support begin/end page calls in text extraction device.
-rw-r--r-- | android/jni/mupdf.c | 16 | ||||
-rw-r--r-- | apps/mudraw.c | 3 | ||||
-rw-r--r-- | apps/pdfapp.c | 2 | ||||
-rw-r--r-- | fitz/fitz.h | 3 | ||||
-rw-r--r-- | fitz/text_extract.c | 61 | ||||
-rw-r--r-- | fitz/text_output.c | 5 | ||||
-rw-r--r-- | ios/main.m | 2 | ||||
-rw-r--r-- | winrt/mupdfwinrt/muctx.cpp | 4 |
8 files changed, 53 insertions, 43 deletions
diff --git a/android/jni/mupdf.c b/android/jni/mupdf.c index 8edc8c41..f436a65d 100644 --- a/android/jni/mupdf.c +++ b/android/jni/mupdf.c @@ -1104,17 +1104,13 @@ JNI_FN(MuPDFCore_searchPage)(JNIEnv * env, jobject thiz, jstring jtext) fz_try(ctx) { - fz_rect mbrect; - if (glo->hit_bbox == NULL) glo->hit_bbox = fz_malloc_array(ctx, MAX_SEARCH_HITS, sizeof(*glo->hit_bbox)); zoom = glo->resolution / 72; fz_scale(&ctm, zoom, zoom); - mbrect = pc->media_box; - fz_transform_rect(&mbrect, &ctm); sheet = fz_new_text_sheet(ctx); - text = fz_new_text_page(ctx, &mbrect); + text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); fz_run_page(doc, pc->page, dev, &ctm, NULL); fz_free_device(dev); @@ -1199,15 +1195,12 @@ JNI_FN(MuPDFCore_text)(JNIEnv * env, jobject thiz) fz_try(ctx) { - fz_rect mbrect; int b, l, s, c; zoom = glo->resolution / 72; fz_scale(&ctm, zoom, zoom); - mbrect = pc->media_box; - fz_transform_rect(&mbrect, &ctm); sheet = fz_new_text_sheet(ctx); - text = fz_new_text_page(ctx, &mbrect); + text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); fz_run_page(doc, pc->page, dev, &ctm, NULL); fz_free_device(dev); @@ -1309,14 +1302,11 @@ JNI_FN(MuPDFCore_textAsHtml)(JNIEnv * env, jobject thiz) fz_try(ctx) { - fz_rect mbrect; int b, l, s, c; ctm = fz_identity; - mbrect = pc->media_box; - fz_transform_rect(&mbrect, &ctm); sheet = fz_new_text_sheet(ctx); - text = fz_new_text_page(ctx, &mbrect); + text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); fz_run_page(doc, pc->page, dev, &ctm, NULL); fz_free_device(dev); diff --git a/apps/mudraw.c b/apps/mudraw.c index e94b692d..38ac47d2 100644 --- a/apps/mudraw.c +++ b/apps/mudraw.c @@ -426,8 +426,7 @@ static void drawpage(fz_context *ctx, fz_document *doc, int pagenum) fz_try(ctx) { - fz_rect bounds; - text = fz_new_text_page(ctx, fz_bound_page(doc, page, &bounds)); + text = fz_new_text_page(ctx); dev = fz_new_text_device(ctx, sheet, text); if (showtext == TEXT_HTML) 
fz_disable_device_hints(dev, FZ_IGNORE_IMAGE); diff --git a/apps/pdfapp.c b/apps/pdfapp.c index 50ecaf6f..6e6e5920 100644 --- a/apps/pdfapp.c +++ b/apps/pdfapp.c @@ -601,7 +601,7 @@ static void pdfapp_showpage(pdfapp_t *app, int loadpage, int drawpage, int repai /* Extract text */ app->page_sheet = fz_new_text_sheet(app->ctx); - app->page_text = fz_new_text_page(app->ctx, &app->page_bbox); + app->page_text = fz_new_text_page(app->ctx); if (app->page_list || app->annotations_list) { diff --git a/fitz/fitz.h b/fitz/fitz.h index 6a7a0a12..1dc12ff9 100644 --- a/fitz/fitz.h +++ b/fitz/fitz.h @@ -2100,6 +2100,7 @@ struct fz_text_page_s fz_rect mediabox; int len, cap; fz_page_block *blocks; + fz_text_page *next; }; /* @@ -2267,7 +2268,7 @@ void fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet); The text page is filled out by the text device to contain the blocks, lines and spans of text on the page. */ -fz_text_page *fz_new_text_page(fz_context *ctx, const fz_rect *mediabox); +fz_text_page *fz_new_text_page(fz_context *ctx); void fz_free_text_page(fz_context *ctx, fz_text_page *page); void fz_analyze_text(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page); diff --git a/fitz/text_extract.c b/fitz/text_extract.c index 45e0395a..bf50e6a4 100644 --- a/fitz/text_extract.c +++ b/fitz/text_extract.c @@ -429,13 +429,14 @@ fz_lookup_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, const } fz_text_page * -fz_new_text_page(fz_context *ctx, const fz_rect *mediabox) +fz_new_text_page(fz_context *ctx) { fz_text_page *page = fz_malloc(ctx, sizeof(*page)); - page->mediabox = *mediabox; + page->mediabox = fz_empty_rect; page->len = 0; page->cap = 0; page->blocks = NULL; + page->next = NULL; return page; } @@ -946,34 +947,48 @@ fz_bidi_reorder_text_page(fz_context *ctx, fz_text_page *page) } static void -fz_text_free_user(fz_device *dev) +fz_text_begin_page(fz_device *dev, const fz_rect *mediabox, const fz_matrix *ctm) { fz_context *ctx = dev->ctx; 
fz_text_device *tdev = dev->user; - fz_try(ctx) + if (tdev->page->len) { + tdev->page->next = fz_new_text_page(ctx); + tdev->page = tdev->page->next; + } - add_span_to_soup(tdev->spans, tdev->cur_span); - tdev->cur_span = NULL; + tdev->page->mediabox = *mediabox; + fz_transform_rect(&tdev->page->mediabox, ctm); - strain_soup(ctx, tdev); + tdev->spans = new_span_soup(ctx); +} - /* TODO: smart sorting of blocks in reading order */ - /* TODO: unicode NFC normalization */ +static void +fz_text_end_page(fz_device *dev) +{ + fz_context *ctx = dev->ctx; + fz_text_device *tdev = dev->user; - fz_bidi_reorder_text_page(ctx, tdev->page); - } - fz_always(ctx) - { - free_span_soup(tdev->spans); - fz_free(dev->ctx, tdev); - } - fz_catch(ctx) - { - /* TODO: mark fz_free_device as "doesn't throw" (else rethrowing would - have to be caught/rethrown again in fz_free_device) */ - } + add_span_to_soup(tdev->spans, tdev->cur_span); + tdev->cur_span = NULL; + + strain_soup(ctx, tdev); + free_span_soup(tdev->spans); + tdev->spans = NULL; + + /* TODO: smart sorting of blocks in reading order */ + /* TODO: unicode NFC normalization */ + + fz_bidi_reorder_text_page(ctx, tdev->page); +} + +static void +fz_text_free_user(fz_device *dev) +{ + fz_text_device *tdev = dev->user; + free_span_soup(tdev->spans); + fz_free(dev->ctx, tdev); } fz_device * @@ -984,12 +999,14 @@ fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page) fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device); tdev->sheet = sheet; tdev->page = page; - tdev->spans = new_span_soup(ctx); + tdev->spans = NULL; tdev->cur_span = NULL; tdev->lastchar = ' '; dev = fz_new_device(ctx, tdev); dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE; + dev->begin_page = fz_text_begin_page; + dev->end_page = fz_text_end_page; dev->free_user = fz_text_free_user; dev->fill_text = fz_text_fill_text; dev->stroke_text = fz_text_stroke_text; diff --git a/fitz/text_output.c b/fitz/text_output.c index 6942eb67..c4af4160 100644 
--- a/fitz/text_output.c +++ b/fitz/text_output.c @@ -279,7 +279,10 @@ fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page) { int block_n; - fz_printf(out, "<page>\n"); + fz_printf(out, "<page width=\"%g\" height=\"%g\">\n", + page->mediabox.x1 - page->mediabox.x0, + page->mediabox.y1 - page->mediabox.y0); + for (block_n = 0; block_n < page->len; block_n++) { switch (page->blocks[block_n].type) diff --git a/ios/main.m b/ios/main.m --- a/ios/main.m +++ b/ios/main.m @@ -133,7 +133,7 @@ search_page(fz_document *doc, int number, char *needle, fz_cookie *cookie) fz_page *page = fz_load_page(doc, number); fz_text_sheet *sheet = fz_new_text_sheet(ctx); - fz_text_page *text = fz_new_text_page(ctx, &fz_empty_rect); + fz_text_page *text = fz_new_text_page(ctx); fz_device *dev = fz_new_text_device(ctx, sheet, text); fz_run_page(doc, page, dev, &fz_identity, cookie); fz_free_device(dev); diff --git a/winrt/mupdfwinrt/muctx.cpp b/winrt/mupdfwinrt/muctx.cpp index 9821039d..63f008cf 100644 --- a/winrt/mupdfwinrt/muctx.cpp +++ b/winrt/mupdfwinrt/muctx.cpp @@ -297,7 +297,7 @@ int muctx::GetTextSearch(int page_num, char* needle, sh_vector_text texts_vec) { page = fz_load_page(mu_doc, page_num); sheet = fz_new_text_sheet(ctx_clone); - text = fz_new_text_page(ctx_clone, &fz_empty_rect); + text = fz_new_text_page(ctx_clone); dev = fz_new_text_device(ctx_clone, sheet, text); fz_run_page(mu_doc, page, dev, &fz_identity, NULL); fz_free_device(dev); /* Why does this need to be done here? Seems odd */ @@ -483,7 +483,7 @@ String^ muctx::GetHTML(int page_num) { page = fz_load_page(mu_doc, page_num); sheet = fz_new_text_sheet(ctx_clone); - text = fz_new_text_page(ctx_clone, &fz_empty_rect); + text = fz_new_text_page(ctx_clone); dev = fz_new_text_device(ctx_clone, sheet, text); fz_run_page(mu_doc, page, dev, &fz_identity, NULL); fz_free_device(dev); |