From 27ff02a7aa99eeb93148bd23b2dd73362efda494 Mon Sep 17 00:00:00 2001 From: Andreas Politz Date: Thu, 12 Feb 2015 23:52:42 +0100 Subject: Added charlayout command. * server/epdfinfo.c (cmd_charlayout): Provides edges of character. * lisp/pdf-info.el (pdf-info-query--transform-response): Added transform. (pdf-info-charlayout): Added interface for the command. --- lisp/pdf-info.el | 29 ++++++++++++++++++++++ server/epdfinfo.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/lisp/pdf-info.el b/lisp/pdf-info.el index f4290b9..78f7fb3 100644 --- a/lisp/pdf-info.el +++ b/lisp/pdf-info.el @@ -371,6 +371,13 @@ interrupted." (open nil) (close (equal "1" (caar response))) (number-of-pages (string-to-number (caar response))) + (charlayout + (mapcar (lambda (elt) + (cl-assert (= 1 (length (cadr elt))) t) + `(,(aref (cadr elt) 0) + ,(mapcar 'string-to-number + (split-string (car elt) " " t)))) + response)) ((search-string search-regexp) (let ((matches (mapcar @@ -901,6 +908,28 @@ aforementioned function, when called with the same arguments." (pdf-info-getselection page '(0 0 1 1) 'glyph file-or-buffer)) +(defun pdf-info-charlayout (page edges-or-pos &optional file-or-buffer) + "Return the layout of characters of PAGE in/at EDGES-OR-POS. + +Returns a list of elements \(CHAR \(LEFT TOP RIGHT BOT\)\) of +character and corresponding bounding boxes. + +EDGES-OR-POS may be a region \(LEFT TOP RIGHT BOT\) restricting +the returned value to include only characters fully contained in +it. Or a cons \(LEFT . TOP\) which means to only include the +character at this position. In this case the return value +contains at most one element." 
+ + (when (numberp (cdr edges-or-pos)) + (setq edges-or-pos (list (car edges-or-pos) + (cdr edges-or-pos) + -1 -1))) + (pdf-info-query + 'charlayout + (pdf-info--normalize-file-or-buffer file-or-buffer) + page + (mapconcat 'number-to-string edges-or-pos " "))) + (defun pdf-info-pagesize (page &optional file-or-buffer) "Return the size of PAGE as a cons \(WIDTH . HEIGHT\) diff --git a/server/epdfinfo.c b/server/epdfinfo.c index 78f288a..b917e94 100644 --- a/server/epdfinfo.c +++ b/server/epdfinfo.c @@ -3119,6 +3119,79 @@ cmd_boundingbox (const epdfinfo_t *ctx, const command_arg_t *args) if (page) g_object_unref (page); } +const command_arg_type_t cmd_charlayout_spec[] = + { + ARG_DOC, + ARG_NATNUM, /* page number */ + ARG_EDGES_OR_POSITION, /* region or position */ + }; + +static void +cmd_charlayout(const epdfinfo_t *ctx, const command_arg_t *args) +{ /* Print the edges and text of every character of the page that contains the given position or lies entirely inside the given region; edges are reported relative to the page size. */ + PopplerDocument *doc = args[0].value.doc->pdf; + int pn = args[1].value.natnum; + PopplerRectangle region = args[2].value.rectangle; + double width, height; + PopplerPage *page = poppler_document_get_page(doc, pn - 1); + char *text = NULL; + char *text_p; + PopplerRectangle *rectangles = NULL; + guint nrectangles = 0; /* poppler_page_get_text_layout stores the count through a guint *; zero keeps the loop empty should the call fail */ + guint i; + gboolean have_position = region.y2 < 0; /* a bare position arrives with negative x2/y2 */ + + perror_if_not (page, "No such page %d", pn); + + text = poppler_page_get_text (page); + text_p = text; + poppler_page_get_text_layout (page, &rectangles, &nrectangles); + poppler_page_get_size (page, &width, &height); + region.x1 *= width; /* scale the relative region to page coordinates */ + region.x2 *= width; + region.y1 *= height; + region.y2 *= height; + + OK_BEGIN (); + for (i = 0; i < nrectangles && *text_p; ++i) /* one rectangle per UTF-8 character of the text */ + { + PopplerRectangle *r = &rectangles[i]; + char *nextc = g_utf8_offset_to_pointer (text_p, 1); + + if ((have_position + && region.x1 >= r->x1 + && region.x1 <= r->x2 + && region.y1 >= r->y1 + && region.y1 <= r->y2) + || (! 
have_position + && r->x1 >= region.x1 + && r->y1 >= region.y1 + && r->x2 <= region.x2 + && r->y2 <= region.y2)) + { + char endc = *nextc; + + printf ("%f %f %f %f:", + r->x1 / width, r->y1 / height, + r->x2 / width, r->y2 / height); + *nextc = '\0'; /* temporarily terminate the text right after this character */ + print_response_string (text_p, NEWLINE); + *nextc = endc; /* restore the overwritten byte */ + } + text_p = nextc; + } + OK_END (); + + g_free (rectangles); + g_object_unref (page); + g_free (text); + + error: + return; +} + + + /* ================================================================== * * Main -- cgit v1.0