Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.dspace.content.Bitstream;
Expand Down Expand Up @@ -153,8 +153,8 @@ public File getImageFile(File f, boolean verbose)
// the CropBox is missing or empty because pdfbox will set it to the
// same size as the MediaBox if it doesn't exist. Also note that we
// only need to check the first page, since that's what we use for
// generating the thumbnail (PDDocument uses a zero-based index).
PDPage pdfPage = PDDocument.load(f).getPage(0);
// generating the thumbnail (PDDocument.getPage() takes a zero-based index).
PDPage pdfPage = Loader.loadPDF(f).getPage(0);
PDRectangle pdfPageMediaBox = pdfPage.getMediaBox();
PDRectangle pdfPageCropBox = pdfPage.getCropBox();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import java.io.InputStream;

import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.rendering.PDFRenderer;
Expand Down Expand Up @@ -71,7 +73,7 @@ public InputStream getDestinationStream(Item currentItem, InputStream source, bo
BufferedImage buf;

// Render the page image.
try ( PDDocument doc = PDDocument.load(source); ) {
try ( PDDocument doc = Loader.loadPDF(new RandomAccessReadBuffer(source)); ) {
PDFRenderer renderer = new PDFRenderer(doc);
buf = renderer.renderImage(0);
} catch (InvalidPasswordException ex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@

import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.io.ScratchFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.dspace.authorize.AuthorizeException;
Expand Down Expand Up @@ -330,19 +330,24 @@ private void crosswalkPDF(Context context, Item item, InputStream metadata)
COSDocument cos = null;

try {
ScratchFile scratchFile = null;
PDDocument document = null;

try {
long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100; // use up to 80% of JVM free memory
scratchFile = new ScratchFile(
MemoryUsageSetting.setupMixed(useRAM)); // then fallback to temp file (unlimited size)
// Use up to 80% of JVM free memory and fall back to a temp file (unlimited size)
long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100;
document = Loader.loadPDF(
new RandomAccessReadBuffer(metadata),
() -> new ScratchFile(MemoryUsageSetting.setupMixed(useRAM)));
} catch (IOException ioe) {
log.warn("Error initializing scratch file: " + ioe.getMessage());
}

PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(metadata), scratchFile);
parser.parse();
cos = parser.getDocument();
// sanity check: loaded PDF document must not be null.
if (document == null) {
throw new MetadataValidationException("The provided stream could not be parsed into a PDF document.");
}

cos = document.getDocument();
// sanity check: PDFBox breaks on encrypted documents, so give up.
if (cos.getEncryptionDictionary() != null) {
throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.dspace.authorize.AuthorizeException;
import org.dspace.authorize.service.AuthorizeService;
import org.dspace.content.Bitstream;
Expand Down Expand Up @@ -304,7 +307,7 @@ public Pair<byte[], Long> makeCitedDocument(Context context, Bitstream bitstream
Item item = (Item) bitstreamService.getParentObject(context, bitstream);
final InputStream inputStream = bitstreamService.retrieve(context, bitstream);
try {
sourceDocument = sourceDocument.load(inputStream);
sourceDocument = Loader.loadPDF(new RandomAccessReadBuffer(inputStream));
} finally {
inputStream.close();
}
Expand Down Expand Up @@ -335,9 +338,10 @@ protected void generateCoverPage(Context context, PDDocument document, PDPage co
int xwidth = 550;
int ygap = 20;

PDFont fontHelvetica = PDType1Font.HELVETICA;
PDFont fontHelveticaBold = PDType1Font.HELVETICA_BOLD;
PDFont fontHelveticaOblique = PDType1Font.HELVETICA_OBLIQUE;
PDFont fontHelvetica = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
PDFont fontHelveticaBold = new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD);
PDFont fontHelveticaOblique = new PDType1Font(Standard14Fonts.FontName.HELVETICA_OBLIQUE);

contentStream.setNonStrokingColor(Color.BLACK);

String[][] content = {header1};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import org.apache.http.client.utils.URIBuilder;
import org.apache.jena.ext.xerces.impl.dv.util.Base64;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.xerces.impl.dv.util.Base64;
import org.dspace.app.util.XMLUtils;
import org.dspace.content.Item;
import org.dspace.importer.external.datamodel.ImportRecord;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.CharEncoding;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.solr.client.solrj.SolrServerException;
Expand Down Expand Up @@ -989,7 +991,7 @@ private String extractPDFText(byte[] content) throws IOException {

try (ByteArrayInputStream source = new ByteArrayInputStream(content);
Writer writer = new StringWriter();
PDDocument pdfDoc = PDDocument.load(source)) {
PDDocument pdfDoc = Loader.loadPDF(new RandomAccessReadBuffer(source))) {

pts.writeText(pdfDoc, writer);
return writer.toString();
Expand All @@ -998,7 +1000,7 @@ private String extractPDFText(byte[] content) throws IOException {

/**
 * Counts the pages of a PDF document that is held in memory as a byte array.
 *
 * @param content the raw bytes of the PDF document
 * @return the page count reported by {@code getNumberOfPages()} on the loaded document
 * @throws IOException not caught here; propagates to the caller
 */
private int getNumberOfPdfPages(byte[] content) throws IOException {
// The byte stream and the loaded document are both declared as try-with-resources
// entries, so they are closed automatically when the block exits.
try (ByteArrayInputStream source = new ByteArrayInputStream(content);
// NOTE(review): the next two lines look like the removed and added sides of one diff
// hunk (PDDocument.load vs. Loader.loadPDF over a RandomAccessReadBuffer) — confirm
// that only the Loader.loadPDF form remains in the actual file.
PDDocument pdfDoc = PDDocument.load(source)) {
PDDocument pdfDoc = Loader.loadPDF(new RandomAccessReadBuffer(source))) {
return pdfDoc.getNumberOfPages();
}
}
Expand Down
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@
<jaxb-runtime.version>4.0.5</jaxb-runtime.version>
<jcache-version>1.1.1</jcache-version>
<!-- NOTE: Jetty needed for Solr, Handle Server & tests -->
<jetty.version>9.4.57.v20241219</jetty.version>
<log4j.version>2.24.3</log4j.version>
<pdfbox-version>2.0.34</pdfbox-version>
<jetty.version>9.4.58.v20250814</jetty.version>
<log4j.version>2.25.2</log4j.version>
<pdfbox-version>3.0.5</pdfbox-version>
<rome.version>1.19.0</rome.version>
<slf4j.version>2.0.17</slf4j.version>
<tika.version>2.9.4</tika.version>
<tika.version>3.2.3</tika.version>
<!-- Sync BouncyCastle & ASM with whatever version Tika uses -->
<bouncycastle.version>1.81</bouncycastle.version>
<asm.version>8.0.1</asm.version>
Expand Down