diff --git a/pom.xml b/pom.xml index f9e1ed0..e05db67 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ 4.0.0 - fr.bnf.digitools - scannedPdf + au.gov.nla + scanned-pdf-detector jar 1.2 scannedPdf @@ -18,11 +18,16 @@ repo - - https://github.com/tledoux/scannedPdf - scm:git:git://github.com/tledoux/scannedPdf.git - scm:git:git@github.com:tledoux/scannedPdf.git - + + + nla-public-releases + https://dev.nla.gov.au/nexus/content/repositories/nla-public/ + + + nla-public-snapshots + https://dev.nla.gov.au/nexus/content/repositories/nla-public-snapshots/ + + UTF-8 @@ -33,13 +38,6 @@ pdfbox 2.0.29 - - - javax.media - jai_imageio - 1.1.1 - - com.github.spotbugs @@ -188,13 +186,5 @@ - - - - Geotoolkit - Geotoolkit - http://maven.geotoolkit.org/ - - \ No newline at end of file diff --git a/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java b/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java index da8bb06..15ecb5f 100644 --- a/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java +++ b/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java @@ -1,5 +1,6 @@ package fr.bnf.toolslab; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; @@ -28,6 +29,15 @@ public class StrictPdfBoxScanDetector extends AbstractScanDetector { List imageDimensions; private Map processedInlineImages = new HashMap<>(); private AtomicInteger inlineImageCounter = new AtomicInteger(0); + private final int numSamples; // Number of pages to sample + + public StrictPdfBoxScanDetector(int numSamples) { + this.numSamples = numSamples; + } + + public StrictPdfBoxScanDetector() { + this(MAX_SAMPLES); // Default to MAX_SAMPLES samples + } @Override public void init(FileDescriptor fd) { @@ -67,15 +77,14 @@ public void parse() throws IOException { return; } - // Second heuristic: pick some pages and look if the image covers + // Second heuristic: pick the first numSamples pages (at most nbPages) and check whether the image covers // all the page - int nbSamples =
Math.min(nbPages, MAX_SAMPLES); - List pagesToTest = pickSamples(nbSamples, nbPages); + int nbSamples = Math.min(nbPages, numSamples); // Classify all the dpiFound (could be 0) DpiCounter counter = new DpiCounter(); - for (int pageNum : pagesToTest) { + for (int pageNum = 0; pageNum < nbSamples; pageNum++) { DimensionInfo dimPage = pageDimensions.get(pageNum); DimensionInfo dimImage = imageDimensions.get(pageNum); LOGGER.fine("Page [" + pageNum + "] dimension " + dimImage); @@ -151,5 +160,14 @@ protected int parsePage(PDPage page, int numPage) throws IOException { } return nbImagesInPage; } + + public static boolean isScan(File file, int samples) throws IOException { + FileDescriptor fileDescriptor = new FileDescriptor(file); + + StrictPdfBoxScanDetector strictPdfBoxScanDetector = new StrictPdfBoxScanDetector(samples); + strictPdfBoxScanDetector.init(fileDescriptor); + strictPdfBoxScanDetector.parse(); + return fileDescriptor.isScan(); + } }