Add support for exporting pdf to image
[Re #55]
This commit also slightly changes the prerequisites for the split function.
Previously it only allowed strings as inputs. IMHO it should also
accept files.
Damian Hryniewicz committed Nov 12, 2020
1 parent 4e9d388 commit 25eff3f
Splits into two PDFs, the first having 5 pages and the second having the rest
(pdf/split-pdf-at :input "test/pdfs/multi-page.pdf" :split 5)

### Export a PDF to an image
(require '[pdfboxing.image :as image])
Export a thumbnail of a PDF
(image/export-to-image :input "test/pdfs/multi-page.pdf")

Export a thumbnail with custom DPI (default is 300)
(image/export-to-image :input "test/pdfs/multi-page.pdf" :dpi 72)

Export a thumbnail of a custom page index (defaults to the first page)
(image/export-to-image :input "test/pdfs/multi-page.pdf" :page-idx 1)

### List form fields of a PDF

To list fields and values:
4 changes: 4 additions & 0 deletions src/pdfboxing/common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
( RandomAccessFile)
(org.apache.pdfbox.pdfparser PDFParser)))

(defn throw-exception
  "Throw an IllegalArgumentException whose message is `message`.
  Never returns normally."
  [message]
  (throw (IllegalArgumentException. message)))

(defn try-get-as-pdf
"Try and get the pdf-file-or-path as a PDF.
Returns nil if pdf-file-or-path could not be loaded as a PDF."
36 changes: 36 additions & 0 deletions src/pdfboxing/image.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
(ns pdfboxing.image
  "Export a single page of a PDF as a java.awt.image.BufferedImage."
  (:require [clojure.spec.alpha :as s]
            [pdfboxing.common :as common]
            ;; NOTE(review): the namespace name was missing in the source;
            ;; `pdfboxing.info` is inferred from the `info/page-number` call
            ;; below — confirm.
            [pdfboxing.info :as info])
  (:import [org.apache.pdfbox.rendering PDFRenderer ImageType]
           [org.apache.pdfbox.pdmodel PDDocument]
           [java.awt.image BufferedImage]))

;; Specs describing the keyword arguments and the result of `export-to-image`.

;; :input is valid when it is a PDDocument instance or when
;; `common/is-pdf?` returns a truthy value for it.
(s/def ::input
  (fn [candidate]
    (or (instance? PDDocument candidate)
        (common/is-pdf? candidate))))

;; Both optional settings only need to satisfy `int?`.
(s/def ::dpi int?)
(s/def ::page-idx int?)

;; :input is required; :dpi and :page-idx are optional.
(s/def ::export-to-image-config
  (s/keys :req-un [::input]
          :opt-un [::dpi ::page-idx]))

;; The exported value must be a BufferedImage.
(s/def ::export-to-image-ret
  (fn [result]
    (instance? BufferedImage result)))

(defn- page-idx-in-bounds
  "Precondition helper for `export-to-image`.

  Returns true when `page-idx` lies between 0 and one less than the value
  returned by `info/page-number` for `input` (inclusive).
  Throws IllegalArgumentException (via `common/throw-exception`) otherwise."
  [page-idx input]
  (if (<= 0 page-idx (dec (info/page-number input)))
    ;; Must be truthy: this runs inside a :pre vector, where a nil result
    ;; would surface as an AssertionError instead of the intended exception.
    true
    (common/throw-exception "Page index out of bounds")))

(defn export-to-image
  "Render a single page of `input` to a BufferedImage.

  Keyword arguments:
    :input    - required; must satisfy the ::input spec
    :dpi      - rendering resolution, 300 when omitted
    :page-idx - index of the page to render, 0 when omitted

  Exactly one page is rendered per call; split the document beforehand
  if an image of every page is needed.

  Preconditions check the argument map against ::export-to-image-config
  and call `page-idx-in-bounds` for the requested page. The postcondition
  checks the result against ::export-to-image-ret."
  [& {:keys [input dpi page-idx]
      :or {dpi 300 page-idx 0}
      :as config}]
  {:pre [(s/valid? ::export-to-image-config config)
         (page-idx-in-bounds page-idx input)]
   :post [(s/valid? ::export-to-image-ret %)]}
  ;; The document is closed by `with-open` once rendering is finished.
  ;; NOTE(review): if `common/obtain-document` hands back a caller-supplied
  ;; PDDocument as-is, that document gets closed here too — confirm.
  (with-open [document (common/obtain-document input)]
    (let [renderer (PDFRenderer. document)]
      (.renderImageWithDPI renderer page-idx dpi ImageType/RGB))))
17 changes: 8 additions & 9 deletions src/pdfboxing/merge.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,35 @@
(org.apache.pdfbox.pdmodel.common PDRectangle)
( PDImageXObject)))

(defn throw-exception
(throw (IllegalArgumentException. message)))

(defn check-if-present
"Check if the input & output file names where supplied"
[input output]
(when (some true? (map empty? [input output]))
(throw-exception "argument can't be empty")))
(common/throw-exception "argument can't be empty")))

(defn check-for-pdfs
"Check if all the files supplied are actual PDFs."
(if (some false? (map common/is-pdf? files))
(throw-exception "the files supplied need to be PDFs")
(common/throw-exception "the files supplied need to be PDFs")

(defn arg-check [output input]
(check-if-present input output)
(if (sequential? input)
(check-for-pdfs input)
(throw-exception "input - needs to be sequential")))
(common/throw-exception "input - needs to be sequential")))

(defn merge-pdfs
"merge multiple PDFs into output file"
[& {:keys [output input]}]
{:pre [(arg-check output input)]}
(let [merger (PDFMergerUtility.)]
(doseq [f input]
(.addSource merger (FileInputStream. (File. f))))
(doseq [f input
:let [file (if (string? f)
(File. f)
(.addSource merger (FileInputStream. file)))
(.setDestinationFileName merger output)
(.mergeDocuments merger)))

12 changes: 7 additions & 5 deletions src/pdfboxing/split.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
(:require [clojure.string :as s]
[pdfboxing.common :as common]
[pdfboxing.merge :as merge])
(:import (org.apache.pdfbox.multipdf PDFMergerUtility Splitter)))
(:import [org.apache.pdfbox.multipdf PDFMergerUtility Splitter]
[ File]))

(defn check-if-integer
(if (every? integer? coll)
(merge/throw-exception ":start and :end may only be integers")))
(common/throw-exception ":start and :end may only be integers")))

(defn arg-check [input start end split]
(let [int-args [start end split]]
(if (string? input)
(if (or (string? input) (instance? File input))
(merge/check-for-pdfs [input])
(merge/throw-exception "input must be a string"))
(common/throw-exception "input must be a string"))
(check-if-integer (filter (complement nil?) int-args))))

(defn pddocument->byte-array
(defn pddocument->byte-array
(into [] (.split splitter doc)))))

(defn split-pdf-at
"Splits a pdf into two documents and writes them to disk"
"Splits a pdf into two documents and writes them to disk
If the split key is not provided then it will split the document approx. in half."
[& {:keys [input split]}]
(let [base-name (first (s/split input #".pdf"))
f-names (for [x (range 1 3)] (str base-name "-" x ".pdf"))
16 changes: 16 additions & 0 deletions test/pdfboxing/image_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
(ns pdfboxing.image-test
(:require [clojure.test :refer [deftest is]]
[pdfboxing.image :as image])
(:import [java.awt.image BufferedImage]))

;; Exercises image/export-to-image with default settings, a non-default
;; page index, a reduced DPI, and an out-of-range page index.
(deftest export-to-image
  (let [pdf-path "test/pdfs/multi-page.pdf"
        default-render (image/export-to-image :input pdf-path)
        second-page-render (image/export-to-image :input pdf-path :page-idx 1)
        low-dpi-render (image/export-to-image :input pdf-path :dpi 72)]
    ;; Each successful export yields a BufferedImage.
    (is (instance? BufferedImage default-render))
    (is (instance? BufferedImage second-page-render))
    ;; A page index far beyond the end of the document is rejected.
    (is (thrown? IllegalArgumentException
                 (image/export-to-image :input pdf-path :page-idx 100)))
    (is (instance? BufferedImage low-dpi-render))
    ;; NOTE(review): this likely compares by object identity rather than by
    ;; pixel content, so it may pass for any two distinct images — confirm.
    (is (not= default-render second-page-render))
    ;; The default-DPI render is wider than the 72 DPI render.
    (is (> (.getWidth default-render) (.getWidth low-dpi-render)))))
14 changes: 11 additions & 3 deletions test/pdfboxing/merge_test.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
(ns pdfboxing.merge-test
(:require [ :as io]
[clojure.test :refer [deftest is]]
[clojure.test :refer [deftest is testing]]
[pdfboxing.common :as common]
[pdfboxing.merge :refer [arg-check merge-pdfs]]))
[pdfboxing.merge :refer [arg-check merge-pdfs]])
(:import [ File]))

(deftest input-output-argument-check
(is (thrown? IllegalArgumentException (arg-check)))
Expand All @@ -21,7 +22,14 @@
:input ["test/pdfs/clojure-1.pdf" "test/pdfs/clojure-2.pdf"])
merged-pdf-file (.exists (io/as-file file))]
(is (true? merged-pdf-file))
(is (true? (common/is-pdf? file)))))
(is (true? (common/is-pdf? file)))

(testing "Accepts both file paths and File instances as an input"
(let [merging-outcome (merge-pdfs :output file
:input [(File. "test/pdfs/clojure-1.pdf") "test/pdfs/clojure-2.pdf"])
merged-pdf-file (.exists (io/as-file file))]
(is (true? merged-pdf-file))
(is (true? (common/is-pdf? file)))))))

;; clean up
(defn clean-up [file]
Expand Down

