Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 12 additions & 22 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>fr.bnf.digitools</groupId>
<artifactId>scannedPdf</artifactId>
<groupId>au.gov.nla</groupId>
<artifactId>scanned-pdf-detector</artifactId>
<packaging>jar</packaging>
<version>1.2</version>
<name>scannedPdf</name>
Expand All @@ -18,11 +18,16 @@
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<url>https://github.com/tledoux/scannedPdf</url>
<connection>scm:git:git://github.com/tledoux/scannedPdf.git</connection>
<developerConnection>scm:git:git@github.com:tledoux/scannedPdf.git</developerConnection>
</scm>
<distributionManagement>
<repository>
<id>nla-public-releases</id>
<url>https://dev.nla.gov.au/nexus/content/repositories/nla-public/</url>
</repository>
<snapshotRepository>
<id>nla-public-snapshots</id>
<url>https://dev.nla.gov.au/nexus/content/repositories/nla-public-snapshots/</url>
</snapshotRepository>
</distributionManagement>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
Expand All @@ -33,13 +38,6 @@
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
<!-- Required by PDFBox to read JPEG2000 images -->
<dependency>
<groupId>javax.media</groupId>
<artifactId>jai_imageio</artifactId>
<version>1.1.1</version>
</dependency>

<!-- In order to annotate the spotbugs report -->
<dependency>
<groupId>com.github.spotbugs</groupId>
Expand Down Expand Up @@ -188,13 +186,5 @@
</plugin>
</plugins>
</reporting>

<repositories>
<repository>
<id>Geotoolkit</id>
<name>Geotoolkit</name>
<url>http://maven.geotoolkit.org/</url>
</repository>
</repositories>

</project>
26 changes: 22 additions & 4 deletions src/main/java/fr/bnf/toolslab/StrictPdfBoxScanDetector.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package fr.bnf.toolslab;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -28,6 +29,15 @@ public class StrictPdfBoxScanDetector extends AbstractScanDetector {
List<DimensionInfo> imageDimensions;
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
private final int numSamples; // Number of pages to sample

/**
 * Creates a detector with an explicit sample count.
 *
 * <p>The value is stored as-is; no range validation is performed here.
 * {@code parse()} later bounds it by the document's page count via
 * {@code Math.min(nbPages, numSamples)}.
 *
 * @param numSamples number of pages to sample
 */
public StrictPdfBoxScanDetector(int numSamples) {
this.numSamples = numSamples;
}

/**
 * Creates a detector that uses {@code MAX_SAMPLES} as its sample count,
 * by delegating to the {@code int} constructor.
 */
public StrictPdfBoxScanDetector() {
this(MAX_SAMPLES); // Default sample count; NOTE(review): earlier comment said 5 - confirm against the MAX_SAMPLES declaration
}

@Override
public void init(FileDescriptor fd) {
Expand Down Expand Up @@ -67,15 +77,14 @@ public void parse() throws IOException {
return;
}

// Second heuristic: pick some pages and look if the image covers
// Second heuristic: pick the first x pages and look if the image covers
// all the page
int nbSamples = Math.min(nbPages, MAX_SAMPLES);
List<Integer> pagesToTest = pickSamples(nbSamples, nbPages);
int nbSamples = Math.min(nbPages, numSamples);

// Classify all the dpiFound (could be 0)
DpiCounter counter = new DpiCounter();

for (int pageNum : pagesToTest) {
for (int pageNum = 0; pageNum < nbSamples; pageNum++) {
DimensionInfo dimPage = pageDimensions.get(pageNum);
DimensionInfo dimImage = imageDimensions.get(pageNum);
LOGGER.fine("Page [" + pageNum + "] dimension " + dimImage);
Expand Down Expand Up @@ -151,5 +160,14 @@ protected int parsePage(PDPage page, int numPage) throws IOException {
}
return nbImagesInPage;
}

/**
 * One-call helper: wraps {@code file} in a {@code FileDescriptor}, runs a
 * freshly built {@code StrictPdfBoxScanDetector} over it, and reports the
 * descriptor's scan flag.
 *
 * @param file the file handed to the {@code FileDescriptor} constructor
 * @param samples sample count passed to the detector's constructor
 * @return the value of {@code isScan()} on the descriptor once
 *         {@code parse()} has returned
 * @throws IOException propagated from the detector's {@code parse()} call
 */
public static boolean isScan(File file, int samples) throws IOException {
    final FileDescriptor descriptor = new FileDescriptor(file);
    final StrictPdfBoxScanDetector detector = new StrictPdfBoxScanDetector(samples);

    detector.init(descriptor);
    detector.parse();

    // NOTE(review): presumably parse() records its verdict on the descriptor - confirm.
    return descriptor.isScan();
}
}

Loading