diff --git a/CHANGELOG.md b/CHANGELOG.md
index 85243bd..d3f7d62 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Added the ability to merge multiple images into a single PDF
 - Added the ability to load PDFs from byte arrays
 - Added the ability to run tests automatically using GitHub actions [#64](https://github.com/dotemacs/pdfboxing/pull/64)
+- Added the ability to partially parse PDF content based on a vector of regions
 
 ### Changed
 - Using lists for :imports
diff --git a/README.md b/README.md
index edf9f32..215236d 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,36 @@ Clojure PDF manipulation library & wrapper for [PDFBox](http://pdfbox.apache.org
 (text/extract "test/pdfs/hello.pdf")
 ```
 
+### Extract text from specific regions
+
+```clojure
+(require '[pdfboxing.text :as text])
+(let [areas [{:x 0
+              :y 100
+              :w 350
+              :h 50
+              :page-number 0}
+             {:x 0
+              :y 580
+              :w 540
+              :h 100
+              :page-number 0}]]
+  (text/extract-by-areas "test/pdfs/clojure-1.pdf" areas))
+```
+
+results in
+```clojure
+=> ["Clojure is a dynamic programming language\n" "Rationale\nFeatures\nDownload\nGetting Started\nDocumentation\nClojureScript\nClojureCLR\n"]
+```
+
+Then you can easily turn the result into a map using zipmap to get the following:
+
+```clojure
+;; Result of (zipmap [:description :links] text-extract)
+
+{:description "Clojure is a dynamic programming language\n", :links "Rationale\nFeatures\nDownload\nGetting Started\nDocumentation\nClojureScript\nClojureCLR\n"}
+```
+
 ### Merge multiple PDFs
 
 ```clojure
diff --git a/src/pdfboxing/text.clj b/src/pdfboxing/text.clj
index bc837c2..54dc99c 100644
--- a/src/pdfboxing/text.clj
+++ b/src/pdfboxing/text.clj
@@ -1,6 +1,8 @@
 (ns pdfboxing.text
   (:require [pdfboxing.common :as common])
-  (:import org.apache.pdfbox.text.PDFTextStripper))
+  (:import (org.apache.pdfbox.text PDFTextStripper
+                                   PDFTextStripperByArea)
+           (java.awt Rectangle)))
 
 (defn extract
   "get text from a PDF document"
@@ -8,3 +10,22 @@
   (with-open [doc (common/obtain-document pdfdoc)]
     (-> (PDFTextStripper.)
         (.getText doc))))
+
+(defn- area-text
+  "get text inside the rectangle :x :y :w :h on the page of doc found
+  with (.getPage doc page-number); every key that is missing defaults to 0"
+  [doc {:keys [x y w h page-number]
+        :or {x 0 y 0 w 0 h 0 page-number 0}}]
+  (let [rectangle (Rectangle. x y w h)
+        pdpage (.getPage doc page-number)
+        textstripper (doto (PDFTextStripperByArea.)
+                       (.addRegion "region" rectangle)
+                       (.extractRegions pdpage))]
+    (.getTextForRegion textstripper "region")))
+
+(defn extract-by-areas
+  "get text from specified areas of a PDF document; returns a vector
+  holding one string per area, in the same order as areas"
+  [pdfdoc areas]
+  (with-open [doc (common/obtain-document pdfdoc)]
+    (mapv #(area-text doc %) areas)))
diff --git a/test/pdfboxing/text_test.clj b/test/pdfboxing/text_test.clj
index d34b5b7..50c18f0 100644
--- a/test/pdfboxing/text_test.clj
+++ b/test/pdfboxing/text_test.clj
@@ -1,9 +1,47 @@
 (ns pdfboxing.text-test
-  (:require [clojure.test :refer [deftest is]]
-            [pdfboxing.text :refer [extract]]))
+  (:require [clojure.test :refer [deftest is testing]]
+            [pdfboxing.text :refer [extract extract-by-areas]]))
 
 (def line-separator (System/getProperty "line.separator"))
 
 (deftest text-extraction
   (is (= (str "Hello, this is pdfboxing.text" line-separator)
          (extract "test/pdfs/hello.pdf"))))
+
+(deftest text-extract-by-areas
+  (let [areas [{:x 150
+                :y 100
+                :w 260
+                :h 40
+                :page-number 0}
+               {:x 380
+                :y 500
+                :w 27
+                :h 23
+                :page-number 4}]]
+    (is (= [(str "Clojure 1.6 Cheat Sheet (v21)" line-separator)
+            (str "*ns*" line-separator)]
+           (extract-by-areas "test/pdfs/multi-page.pdf" areas))))
+
+  (testing "default coordinate value is 0"
+    (let [areas [{:x 150
+                  :y 100
+                  :w 260
+                  :h 40}
+                 {:x 150
+                  :y 100
+                  :w 260
+                  :h 40
+                  :page-number 0}
+                 {:x 0
+                  :y 0
+                  :w 280
+                  :h 100
+                  :page-number 0}
+                 {:w 280
+                  :h 100}]]
+      (is (= [(str "Clojure 1.6 Cheat Sheet (v21)" line-separator)
+              (str "Clojure 1.6 Cheat Sheet (v21)" line-separator)
+              (str "5/23/2015" line-separator "Clojure" line-separator)
+              (str "5/23/2015" line-separator "Clojure" line-separator)]
+             (extract-by-areas "test/pdfs/multi-page.pdf" areas))))))