dotemacs · PavlosMelissinos · Jan 11, 2021 · Jan 11, 2021 · Jan 12, 2021 · Jan 12, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Added the ability to merge multiple images into a single PDF
 - Added the ability to load PDFs from byte arrays
 - Added the ability to run tests automatically using GitHub actions [#64](https://github.com/dotemacs/pdfboxing/pull/64)
+- Added the ability to extract text from specific regions of a PDF, given a vector of areas (`text/extract-by-areas`)
 
 ### Changed
 - Using lists for :imports

diff --git a/README.md b/README.md
@@ -18,6 +18,36 @@ Clojure PDF manipulation library & wrapper for [PDFBox](http://pdfbox.apache.org
 (text/extract "test/pdfs/hello.pdf")
 ```
 
+### Extract text from specific regions
+
+```clojure
+(require '[pdfboxing.text :as text])
+(let [areas [{:x           0
+              :y           100
+              :w           350
+              :h           50
+              :page-number 0}
+             {:x           0
+              :y           580
+              :w           540
+              :h           100
+              :page-number 0}]]
+  (text/extract-by-areas "test/pdfs/clojure-1.pdf" areas))
+```
+
+results in
+```clojure
+=> ["Clojure is a dynamic programming language\n" "Rationale\nFeatures\nDownload\nGetting Started\nDocumentation\nClojureScript\nClojureCLR\n"]
+```
+
+You can then turn the result into a map with `zipmap`, pairing each extracted text with a key of your choice:
+
+```clojure
+;; Result of (zipmap [:description :links] text-extract)
+
+{:description "Clojure is a dynamic programming language\n", :links "Rationale\nFeatures\nDownload\nGetting Started\nDocumentation\nClojureScript\nClojureCLR\n"}
+```
+
 ### Merge multiple PDFs
 
 ```clojure

diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj
@@ -1,10 +1,27 @@
 (ns pdfboxing.text
   (:require [pdfboxing.common :as common])
-  (:import org.apache.pdfbox.text.PDFTextStripper))
+  (:import (org.apache.pdfbox.text PDFTextStripper
+                                   PDFTextStripperByArea)
+           (java.awt Rectangle)))
 
 (defn extract
   "get text from a PDF document"
   [pdfdoc]
   (with-open [doc (common/obtain-document pdfdoc)]
     (-> (PDFTextStripper.)
         (.getText doc))))
+
+(defn- area-text
+  "Text found inside one rectangular area of a page of `doc`; any missing key of the area map defaults to 0."
+  [doc {:keys [x y w h page-number] :or {x 0 y 0 w 0 h 0 page-number 0}}]
+  (let [region-name "region"
+        stripper    (PDFTextStripperByArea.)]
+    (.addRegion stripper region-name (Rectangle. x y w h))
+    (.extractRegions stripper (.getPage doc page-number))
+    (.getTextForRegion stripper region-name)))
+
+(defn extract-by-areas
+  "Get text from the given areas of a PDF document. `areas` is a sequence of maps with optional keys :x :y :w :h :page-number (each defaulting to 0); returns a vector with one string per area, in the same order."
+  [pdfdoc areas]
+  (with-open [doc (common/obtain-document pdfdoc)]
+    (mapv (fn [area] (area-text doc area)) areas))) ; mapv is eager: every area is read before with-open closes doc
diff --git a/test/pdfboxing/text_test.clj b/test/pdfboxing/text_test.clj
@@ -1,9 +1,47 @@
 (ns pdfboxing.text-test
-  (:require [clojure.test :refer [deftest is]]
-            [pdfboxing.text :refer [extract]]))
+  (:require [clojure.test :refer [deftest is testing]]
+            [pdfboxing.text :refer [extract extract-by-areas]]))
 
 (def line-separator (System/getProperty "line.separator"))
 
 (deftest text-extraction
   (is (= (str "Hello, this is pdfboxing.text" line-separator)
          (extract "test/pdfs/hello.pdf"))))
+
+(deftest text-extract-by-areas
+  (let [areas [{:x           150
+                :y           100
+                :w           260
+                :h           40
+                :page-number 0}
+               {:x           380
+                :y           500
+                :w           27
+                :h           23
+                :page-number 4}]]
+    (is (= [(str "Clojure 1.6 Cheat Sheet (v21)" line-separator) ; platform separator, as in text-extraction
+            (str "*ns*" line-separator)]
+           (extract-by-areas "test/pdfs/multi-page.pdf" areas))))
+
+  (testing "default coordinate value is 0"
+    (let [areas [{:x           150
+                  :y           100
+                  :w           260
+                  :h           40}           ; :page-number omitted
+                 {:x           150
+                  :y           100
+                  :w           260
+                  :h           40
+                  :page-number 0}
+                 {:x           0
+                  :y           0
+                  :w           280
+                  :h           100
+                  :page-number 0}
+                 {:w           280
+                  :h           100}]]        ; :x, :y and :page-number omitted
+      (is (= [(str "Clojure 1.6 Cheat Sheet (v21)" line-separator)
+              (str "Clojure 1.6 Cheat Sheet (v21)" line-separator)
+              (str "5/23/2015" line-separator "Clojure" line-separator)
+              (str "5/23/2015" line-separator "Clojure" line-separator)]
+             (extract-by-areas "test/pdfs/multi-page.pdf" areas))))))