From 5e2a817134aec2100c259734b178656c11cbd1bd Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Tue, 12 Jan 2021 01:40:08 +0200 Subject: [PATCH 1/8] Extract pdf text by areas --- src/pdfboxing/text.clj | 19 ++++++++++++++++++- test/pdfboxing/text_test.clj | 17 ++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj index bc837c2..e745618 100644 --- a/src/pdfboxing/text.clj +++ b/src/pdfboxing/text.clj @@ -1,6 +1,8 @@ (ns pdfboxing.text (:require [pdfboxing.common :as common]) - (:import org.apache.pdfbox.text.PDFTextStripper)) + (:import (org.apache.pdfbox.text PDFTextStripper + PDFTextStripperByArea) + (java.awt Rectangle))) (defn extract "get text from a PDF document" @@ -8,3 +10,18 @@ (with-open [doc (common/obtain-document pdfdoc)] (-> (PDFTextStripper.) (.getText doc)))) + +(defn- area-text [doc {:keys [x y w h page-number] :as area}] + (let [page-number (or page-number 0) + rectangle (Rectangle. x y w h) + pdpage (.getPage doc page-number) + textstripper (doto (PDFTextStripperByArea.) 
+ (.addRegion "region" rectangle) + (.extractRegions pdpage))] + (.getTextForRegion textstripper "region"))) + +(defn extract-by-areas + "get text from a specified area of a PDF document" + [pdfdoc areas] + (with-open [doc (common/obtain-document pdfdoc)] + (doall (map #(area-text doc %) areas)))) diff --git a/test/pdfboxing/text_test.clj b/test/pdfboxing/text_test.clj index d34b5b7..b23dc64 100644 --- a/test/pdfboxing/text_test.clj +++ b/test/pdfboxing/text_test.clj @@ -1,9 +1,24 @@ (ns pdfboxing.text-test (:require [clojure.test :refer [deftest is]] - [pdfboxing.text :refer [extract]])) + [pdfboxing.text :refer [extract extract-by-areas]])) (def line-separator (System/getProperty "line.separator")) (deftest text-extraction (is (= (str "Hello, this is pdfboxing.text" line-separator) (extract "test/pdfs/hello.pdf")))) + +(deftest text-extract-by-areas + (let [areas [{:x 150 + :y 100 + :w 260 + :h 40 + :page-number 0} + {:x 380 + :y 500 + :w 27 + :h 23 + :page-number 4}]] + (is (= ["Clojure 1.6 Cheat Sheet (v21)\n" + "*ns*\n"] + (extract-by-areas "test/pdfs/multi-page.pdf" areas))))) From 2516b5522851fda14e5b35f19ae602789d46c877 Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Tue, 12 Jan 2021 01:54:10 +0200 Subject: [PATCH 2/8] Appease the linter monster --- src/pdfboxing/text.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj index e745618..bd3122d 100644 --- a/src/pdfboxing/text.clj +++ b/src/pdfboxing/text.clj @@ -11,7 +11,7 @@ (-> (PDFTextStripper.) (.getText doc)))) -(defn- area-text [doc {:keys [x y w h page-number] :as area}] +(defn- area-text [doc {:keys [x y w h page-number]}] (let [page-number (or page-number 0) rectangle (Rectangle. 
x y w h) pdpage (.getPage doc page-number) From 05a146e650f1a9d9fe86bc01319ccc5a4cde248f Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Tue, 12 Jan 2021 02:12:03 +0200 Subject: [PATCH 3/8] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a59b71..14fff70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## WIP ### Added +- Added the ability to partially parse PDF content based on a vector of regions - Added the ability to merge multiple images into a single PDF - Added the ability to load PDFs from byte arrays From a66aff396d0fe82ab3dc95ea61b3f61ae2948331 Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Tue, 12 Jan 2021 02:21:19 +0200 Subject: [PATCH 4/8] Reorder changelog I had incorrectly added my entry at the beginning of the changelog's "Added" list. This commit moves it to the end of that list. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14fff70..c7b0188 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,9 @@ ## WIP ### Added -- Added the ability to partially parse PDF content based on a vector of regions - Added the ability to merge multiple images into a single PDF - Added the ability to load PDFs from byte arrays +- Added the ability to partially parse PDF content based on a vector of regions ### Changed - Using lists for :imports From 133eee2605977b8e15eb7c53ff4835f8c306d7a5 Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Thu, 14 Jan 2021 00:21:05 +0200 Subject: [PATCH 5/8] Update function docstring to reflect reality --- src/pdfboxing/text.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj index bd3122d..e129806 100644 --- a/src/pdfboxing/text.clj +++ b/src/pdfboxing/text.clj @@ -21,7 +21,7 @@ (.getTextForRegion textstripper "region"))) (defn extract-by-areas - "get text from a specified area of a PDF document" + "get text from specified 
areas of a PDF document" [pdfdoc areas] (with-open [doc (common/obtain-document pdfdoc)] (doall (map #(area-text doc %) areas)))) From af01aa0af9de1eeef2c0d429568577176dfddacd Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Sat, 16 Jan 2021 00:58:04 +0200 Subject: [PATCH 6/8] Make area-text function more robust * Missing coordinates are now assumed 0 * Added new test case with missing coords --- src/pdfboxing/text.clj | 6 +++--- test/pdfboxing/text_test.clj | 27 +++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj index e129806..36b2123 100644 --- a/src/pdfboxing/text.clj +++ b/src/pdfboxing/text.clj @@ -11,9 +11,9 @@ (-> (PDFTextStripper.) (.getText doc)))) -(defn- area-text [doc {:keys [x y w h page-number]}] - (let [page-number (or page-number 0) - rectangle (Rectangle. x y w h) +(defn- area-text [doc {:keys [x y w h page-number] + :or {x 0 y 0 w 0 h 0 page-number 0}}] + (let [rectangle (Rectangle. x y w h) pdpage (.getPage doc page-number) textstripper (doto (PDFTextStripperByArea.) 
(.addRegion "region" rectangle) diff --git a/test/pdfboxing/text_test.clj b/test/pdfboxing/text_test.clj index b23dc64..50c18f0 100644 --- a/test/pdfboxing/text_test.clj +++ b/test/pdfboxing/text_test.clj @@ -1,5 +1,5 @@ (ns pdfboxing.text-test - (:require [clojure.test :refer [deftest is]] + (:require [clojure.test :refer [deftest is testing]] [pdfboxing.text :refer [extract extract-by-areas]])) (def line-separator (System/getProperty "line.separator")) @@ -21,4 +21,27 @@ :page-number 4}]] (is (= ["Clojure 1.6 Cheat Sheet (v21)\n" "*ns*\n"] - (extract-by-areas "test/pdfs/multi-page.pdf" areas))))) + (extract-by-areas "test/pdfs/multi-page.pdf" areas)))) + + (testing "default coordinate value is 0" + (let [areas [{:x 150 + :y 100 + :w 260 + :h 40} + {:x 150 + :y 100 + :w 260 + :h 40 + :page-number 0} + {:x 0 + :y 0 + :w 280 + :h 100 + :page-number 0} + {:w 280 + :h 100}]] + (is (= ["Clojure 1.6 Cheat Sheet (v21)\n" + "Clojure 1.6 Cheat Sheet (v21)\n" + "5/23/2015\nClojure\n" + "5/23/2015\nClojure\n"] + (extract-by-areas "test/pdfs/multi-page.pdf" areas)))))) From 46b6aeea17031682b9b3f22ae9c69ea819f8efa1 Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Sat, 16 Jan 2021 01:45:39 +0200 Subject: [PATCH 7/8] Add documentation for extracting text from regions --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index edf9f32..215236d 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,36 @@ Clojure PDF manipulation library & wrapper for [PDFBox](http://pdfbox.apache.org (text/extract "test/pdfs/hello.pdf") ``` +### Extract text from specific regions + +```clojure +(require '[pdfboxing.text :as text]) +(let [areas [{:x 0 + :y 100 + :w 350 + :h 50 + :page-number 0} + {:x 0 + :y 580 + :w 540 + :h 100 + :page-number 0}]] + (text/extract-by-areas "test/pdfs/clojure-1.pdf" areas)) +``` + +results in +```clojure +=> ("Clojure is a dynamic programming language\n" "Rationale\nFeatures\nDownload\nGetting 
Started\nDocumentation\nClojureScript\nClojureCLR\n") +``` + +Then you can easily turn the result into a map using zipmap to get the following: + +```clojure +;; Result of (zipmap [:description :links] text-extract) + +{:description "Clojure is a dynamic programming language\n", :links "Rationale\nFeatures\nDownload\nGetting Started\nDocumentation\nClojureScript\nClojureCLR\n"} +``` + ### Merge multiple PDFs ```clojure From 5d769337ab784dc6bfb6a565b771f8411e15b35e Mon Sep 17 00:00:00 2001 From: Pavlos Melissinos Date: Mon, 11 Oct 2021 23:32:57 +0300 Subject: [PATCH 8/8] Make pdf area extraction eager with reduce --- src/pdfboxing/text.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj index 36b2123..54dc99c 100644 --- a/src/pdfboxing/text.clj +++ b/src/pdfboxing/text.clj @@ -24,4 +24,4 @@ "get text from specified areas of a PDF document" [pdfdoc areas] (with-open [doc (common/obtain-document pdfdoc)] - (doall (map #(area-text doc %) areas)))) + (reduce (fn [v area] (conj v (area-text doc area))) [] areas)))