Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bump pdfbox to 3.0.1 #79

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions deps.edn
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{:deps {org.clojure/clojure {:mvn/version "1.11.1"},
org.apache.pdfbox/pdfbox {:mvn/version "2.0.29"}
org.apache.pdfbox/pdfbox-tools {:mvn/version "2.0.29"}}
org.apache.pdfbox/pdfbox {:mvn/version "3.0.1"}
org.apache.pdfbox/pdfbox-io {:mvn/version "3.0.1"}
dotemacs marked this conversation as resolved.
Show resolved Hide resolved
org.apache.pdfbox/pdfbox-tools {:mvn/version "3.0.1"}}
:aliases {:test {:extra-paths ["test"]
:extra-deps {org.clojure/test.check {:mvn/version "RELEASE"}}}
:runner {:extra-deps {com.cognitect/test-runner
Expand Down
5 changes: 3 additions & 2 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
:license {:name "BSD"
:url "https://opensource.org/license/bsd-3-clause/"}
:dependencies [[org.clojure/clojure "1.11.1"]
[org.apache.pdfbox/pdfbox "2.0.29"]
[org.apache.pdfbox/pdfbox-tools "2.0.29"]])
[org.apache.pdfbox/pdfbox "3.0.1"]
[org.apache.pdfbox/pdfbox-io "3.0.1"]
[org.apache.pdfbox/pdfbox-tools "3.0.1"]])
39 changes: 14 additions & 25 deletions src/pdfboxing/common.clj
Original file line number Diff line number Diff line change
@@ -1,55 +1,44 @@
(ns pdfboxing.common
(:require [clojure.java.io :as io])
(:import (java.io File)
(org.apache.pdfbox.pdmodel PDDocument)
(org.apache.pdfbox.io RandomAccessFile)
(org.apache.pdfbox.pdfparser PDFParser)))
(org.apache.pdfbox Loader)
(org.apache.pdfbox.pdmodel PDDocument)))

(defn try-get-as-pdf
  "Try to open `pdf-file-or-path` as a PDF.

  `pdf-file-or-path` is coerced with `clojure.java.io/as-file`.
  Returns the document obtained from the parser, or nil when building
  the parser or parsing throws an `Exception`.

  The file handle is opened outside the guarded region, so an exception
  thrown while opening it (presumably e.g. a missing file - confirm
  against the PDFBox `RandomAccessFile` constructor) still propagates
  to the caller.

  When no document is produced the handle is closed here; previously it
  was left open on every failed parse."
  [pdf-file-or-path]
  (let [^File pdf-file (io/as-file pdf-file-or-path)
        random-access-file (RandomAccessFile. pdf-file "r")]
    (try
      (let [parser (PDFParser. random-access-file)]
        (.parse parser)
        ;; On success the handle stays open for the returned document.
        ;; NOTE(review): assumes the document releases it on close - confirm.
        (.getPDDocument parser))
      (catch Exception _
        ;; No document was produced, so nothing else will release the
        ;; handle opened above. Close it quietly to avoid leaking it.
        (try
          (.close random-access-file)
          (catch Exception _))
        nil))))

(defn load-pdf-from-media
  "Coerce `pdf-file-or-path` to a file with `io/as-file` and pass it to
  `Loader/loadPDF`.

  Returns whatever `Loader/loadPDF` returns, or nil when any `Exception`
  is thrown during coercion or loading."
  [pdf-file-or-path]
  (try
    (let [^File pdf-file (io/as-file pdf-file-or-path)]
      (Loader/loadPDF pdf-file))
    (catch Exception _
      nil)))

(defn is-pdf?
"Confirm that the PDF supplied is really a PDF"
[pdf-file-or-path]
(if-let [pdf (try-get-as-pdf pdf-file-or-path)]
(if-let [pdf (load-pdf-from-media pdf-file-or-path)]
(try
(not (nil? pdf))
(finally
(.close pdf)))
false))

(defn load-pdf
  "Return the document that `try-get-as-pdf` yields for `pdf-file-or-path`.

  Throws `IllegalArgumentException` with the message
  \"<pdf-file-or-path> is not a PDF file\" when `try-get-as-pdf` yields nil."
  [pdf-file-or-path]
  (or (try-get-as-pdf pdf-file-or-path)
      (throw (IllegalArgumentException.
              (format "%s is not a PDF file" pdf-file-or-path)))))

;; Protocol with a single method, `obtain-document`, that takes a source
;; value and returns a document object for it. The per-type
;; implementations are attached separately with `extend-protocol`.
(defprotocol PDFDocument
"return an object from which text can be extracted"
(obtain-document [source]))

(extend-protocol PDFDocument
(Class/forName "[B") ;; byte-array
#_{:clj-kondo/ignore [:function-name]}
(obtain-document [source]
(PDDocument/load source))
(Loader/loadPDF source))

String
(obtain-document [source]
(load-pdf source))
(load-pdf-from-media source))

File
(obtain-document [source]
(load-pdf source))
(load-pdf-from-media source))

PDDocument
(obtain-document [source]
Expand Down
4 changes: 2 additions & 2 deletions src/pdfboxing/html.clj
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
"Convert a PDF document to a simple HTML file.
`pdf-doc` - string, name of a PDF document that is in the root folder
this is command line tool from pdfbox more info at:
https://pdfbox.apache.org/2.0/commandline.html#extracttext"
https://pdfbox.apache.org/3.0/commandline.html#extracttext"
[pdf-doc]
(ExtractText/main (into-array String ["-html" pdf-doc])))
(ExtractText/main (into-array String ["-html" "-i" pdf-doc])))
22 changes: 15 additions & 7 deletions src/pdfboxing/merge.clj
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
(ns pdfboxing.merge
(:require [pdfboxing.common :as common])
(:import (java.io InputStream OutputStream)
(:import (java.io File InputStream OutputStream)
(org.apache.pdfbox.io IOUtils RandomAccessRead RandomAccessReadBuffer)
(org.apache.pdfbox.multipdf PDFMergerUtility)
(org.apache.pdfbox.pdmodel PDDocument
PDPage
PDPageContentStream)
PDPage PDPageContentStream)
(org.apache.pdfbox.pdmodel.common PDRectangle)
(org.apache.pdfbox.pdmodel.graphics.image PDImageXObject)))

(defn throw-exception
[message]
[^String message]
(throw (IllegalArgumentException. message)))

(defn check-if-present
Expand Down Expand Up @@ -43,18 +43,26 @@
{:pre [(arg-check output input)]}
(let [merger (PDFMergerUtility.)]
(doseq [source input]
(.addSource merger source))
(condp instance? source
File (.addSource merger ^File source)
String (.addSource merger ^String source)
InputStream (.addSource merger ^RandomAccessRead
(RandomAccessReadBuffer. ^InputStream source))))
(cond
(instance? OutputStream output)
(.setDestinationStream merger output)

:else
(.setDestinationFileName merger output))
(.mergeDocuments merger)))
(.mergeDocuments merger (IOUtils/createMemoryOnlyStreamCache))
(condp instance? output
File (.close output)
OutputStream (.close output)
nil)))

(defn- add-image-to-page
"Adds image as a page to the document object"
[doc ^org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject image]
[^PDDocument doc ^PDImageXObject image]
(let [page-size PDRectangle/A4
original-width (.getWidth image)
original-height (.getHeight image)
Expand Down
Loading