diff --git a/pom.xml b/pom.xml
index f9e1ed0..e05db67 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,7 +1,7 @@
4.0.0
- fr.bnf.digitools
- scannedPdf
+ au.gov.nla
+ scanned-pdf-detector
jar
1.2
scannedPdf
@@ -18,11 +18,16 @@
repo
-
- https://github.com/tledoux/scannedPdf
- scm:git:git://github.com/tledoux/scannedPdf.git
- scm:git:git@github.com:tledoux/scannedPdf.git
-
+
+
+ nla-public-releases
+ https://dev.nla.gov.au/nexus/content/repositories/nla-public/
+
+
+ nla-public-snapshots
+ https://dev.nla.gov.au/nexus/content/repositories/nla-public-snapshots/
+
+
UTF-8
@@ -33,13 +38,6 @@
pdfbox
2.0.29
-
-
- javax.media
- jai_imageio
- 1.1.1
-
-
com.github.spotbugs
@@ -188,13 +186,5 @@
-
-
-
- Geotoolkit
- Geotoolkit
- http://maven.geotoolkit.org/
-
-
\ No newline at end of file
diff --git a/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java b/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java
index da8bb06..15ecb5f 100644
--- a/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java
+++ b/src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java
@@ -1,5 +1,6 @@
package fr.bnf.toolslab;
+import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
@@ -28,6 +29,15 @@ public class StrictPdfBoxScanDetector extends AbstractScanDetector {
List imageDimensions;
private Map processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
+ private final int numSamples; // Maximum number of leading pages to inspect
+
+ public StrictPdfBoxScanDetector(int numSamples) {
+ this.numSamples = numSamples;
+ }
+
+ public StrictPdfBoxScanDetector() {
+ this(MAX_SAMPLES); // Default to MAX_SAMPLES pages
+ }
@Override
public void init(FileDescriptor fd) {
@@ -67,15 +77,14 @@ public void parse() throws IOException {
return;
}
- // Second heuristic: pick some pages and look if the image covers
+ // Second heuristic: inspect up to the first numSamples pages and check whether the image covers
// all the page
- int nbSamples = Math.min(nbPages, MAX_SAMPLES);
- List pagesToTest = pickSamples(nbSamples, nbPages);
+ int nbSamples = Math.min(nbPages, numSamples);
// Classify all the dpiFound (could be 0)
DpiCounter counter = new DpiCounter();
- for (int pageNum : pagesToTest) {
+ for (int pageNum = 0; pageNum < nbSamples; pageNum++) {
DimensionInfo dimPage = pageDimensions.get(pageNum);
DimensionInfo dimImage = imageDimensions.get(pageNum);
LOGGER.fine("Page [" + pageNum + "] dimension " + dimImage);
@@ -151,5 +160,14 @@ protected int parsePage(PDPage page, int numPage) throws IOException {
}
return nbImagesInPage;
}
+
+ public static boolean isScan(File file, int samples) throws IOException {
+ FileDescriptor fileDescriptor = new FileDescriptor(file);
+
+ StrictPdfBoxScanDetector strictPdfBoxScanDetector = new StrictPdfBoxScanDetector(samples);
+ strictPdfBoxScanDetector.init(fileDescriptor);
+ strictPdfBoxScanDetector.parse();
+ return fileDescriptor.isScan();
+ }
}