OCRmyPDF.sh

#!/bin/sh
##############################################################################
# Copyright (c) 2013-14: fritz-hh from Github (https://github.com/fritz-hh)
##############################################################################

# Determine real path of this script, following symlinks if present
! command -v python2 > /dev/null && echo "Please install python v2.x. Exiting..." && exit 1
BASEPATH="$(dirname $(python2 -c "import os; print os.path.realpath(\"$0\")"))"

# Import required scripts
. "$BASEPATH/src/config.sh"

# Set variables corresponding to the input parameters
ARGUMENTS="$@"

START=`date +%s`

usage() {
	cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)

Copyright: fritz-hh  from Github (https://github.com/fritz-hh)
Version: $VERSION

Usage: OCRmyPDF.sh  [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f|-s] [-l lan1[+lan2...]] [-C filename] inputfile outputfile

-h : Display this help message
-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
-k : Do not delete the temporary files
-g : Activate debug mode:
     - Generates a PDF file containing each page twice (once with the image, once without the image
       but with the OCRed text as well as the detected bounding boxes)
     - Set the verbosity to the highest possible
     - Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original image, or the deskewed image if the -d option is set)
-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with 
     an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
     (default: no oversampling performed)
-f : Force to OCR the whole document, even if some page already contain font data.  
     (which should not be the case for PDF files built from scnanned images) 
     Any text data will be rendered to raster format and then fed through OCR.
-s : If pages contain font data, do not OCR that page, but include the page (as is) in the final output.
-l : Language(s) of the PDF file. The language should be set correctly in order to get good OCR results.
     Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
     Multiple languages may be specified, separated by '+' characters.
     (The default language is defined in the config file)
-C : Pass an additional configuration file to the tesseract OCR engine.
     (this option can be used more than once)
     Note 1: The configuration file must be available in the "tessdata/configs" folder of your tesseract installation
inputfile  : PDF file to be OCRed
outputfile : The PDF/A file that will be generated 
--------------------------------------------------------------------------------------
EOF
}


#################################################
# Get an absolute path from a relative path to a file
#
# Param1 : Relative path
# Returns: 1 if the folder in which the file is located does not exist
#          0 otherwise
################################################# 
absolutePath() {
	local wdsave absolutepath 
	wdsave="$(pwd)"
	! cd "$(dirname "$1")" 1> /dev/null 2> /dev/null && return 1
	absolutepath="$(pwd)/$(basename "$1")"
	cd "$wdsave"
	echo "$absolutepath"
	return 0
}


# Initialization the configuration parameters with default values
VERBOSITY="$LOG_ERR"		# default verbosity level
LAN="$DEFAULT_LANGUAGES"	# default language(s) of the PDF file (required to get good OCR results)
KEEP_TMP="0"			# 0=no, 1=yes (keep the temporary files)
PREPROCESS_DESKEW="0"		# 0=no, 1=yes (deskew image)
PREPROCESS_CLEAN="0"		# 0=no, 1=yes (clean image to improve OCR)
PREPROCESS_CLEANTOPDF="0"	# 0=no, 1=yes (put cleaned image in final PDF)
OVERSAMPLING_DPI="0"		# 0=do not perform oversampling (dpi value under which oversampling should be performed)
PDF_NOIMG="0"			# 0=no, 1=yes (generates each PDF page twice, with and without image)
FORCE_OCR="0"			# 0=do not force, 1=force (force to OCR the whole document, even if some page already contain font data)
SKIP_TEXT="0"			# 0=do not skip text pages, 1=skip text pages
TESS_CFG_FILES=""		# list of additional configuration files to be used by tesseract

# Parse optional command line arguments
while getopts ":hvgkdcio:fsl:C:" opt; do
	case $opt in
		h) usage ; exit 0 ;;
		v) VERBOSITY=$(($VERBOSITY+1)) ;;
		k) KEEP_TMP="1" ;;
		g) PDF_NOIMG="1"; VERBOSITY="$LOG_DEBUG"; KEEP_TMP="1" ;;
		d) PREPROCESS_DESKEW="1" ;;
		c) PREPROCESS_CLEAN="1" ;;
		i) PREPROCESS_CLEANTOPDF="1" ;;
		o) OVERSAMPLING_DPI="$OPTARG" ;;
		f) FORCE_OCR="1" ;;
		s) SKIP_TEXT="1" ;;
		l) LAN="$OPTARG" ;;
		C) TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES" ;;
		\?)
			echo "Invalid option: -$OPTARG"
			usage
			exit $EXIT_BAD_ARGS ;;
		:)
			echo "Option -$OPTARG requires an argument"
			usage
			exit $EXIT_BAD_ARGS ;;
	esac
done

# Remove the optional arguments parsed above.
shift $((OPTIND-1))

# Check if the number of mandatory parameters provided is as expected
if [ "$#" -ne "2" ]; then
	echo "Exactly two mandatory argument shall be provided ($# arguments provided)"
	usage
	exit $EXIT_BAD_ARGS
fi

# Ensure that -f and -s are not both set
if [ "$SKIP_TEXT" -eq "1" -a "$FORCE_OCR" -eq "1" ]; then
	echo "Options -f and -s are mutually exclusive; choose one or the other"
	usage
	exit $EXIT_BAD_ARGS
fi


[ ! -f "$1" ] && echo "The input file does not exist. Exiting..." && exit $EXIT_BAD_ARGS
FILE_INPUT_PDF="`absolutePath "$1"`"

! absolutePath "$2" >/dev/null \
	&& echo "The folder in which the output file should be generated does not exist. Exiting..." && exit $EXIT_BAD_ARGS
[ -d "$2" ] && echo "Please enter the path of the file to be generated (and not a path to a folder). Exitíng..." && exit $EXIT_BAD_ARGS
[ -f "$2" ] && echo "The output file already exists. Exiting..." && exit $EXIT_BAD_ARGS
FILE_OUTPUT_PDFA="`absolutePath "$2"`"


# set script path as working directory
cd "$BASEPATH"

[ $VERBOSITY -ge $LOG_DEBUG ] && echo "$TOOLNAME version: $VERSION"
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Arguments: $ARGUMENTS"

# check if the required utilities are installed
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Checking if all dependencies are installed"
! command -v identify > /dev/null && echo "Please install ImageMagick. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v parallel > /dev/null && echo "Please install GNU Parallel. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfimages > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdftoppm > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdffonts > /dev/null && echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v pdfseparate > /dev/null && echo "Please install or update poppler-utils to at least 0.24.5. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
[ $PREPROCESS_CLEAN -eq 1 ] && ! command -v unpaper > /dev/null && echo "Please install unpaper. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v tesseract > /dev/null && echo "Please install tesseract and tesseract-data. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python2 -c 'import lxml' 2>/dev/null && echo "Please install the python library lxml. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! python2 -c 'import sys, reportlab; (getattr(reportlab, "Version", "0.0") >= "3.0") or sys.exit(1)' 2>/dev/null \
	&& echo "Please install the python library reportlab. Exiting..." && exit $EXIT_MISSING_DEPENDENCY

! command -v gs > /dev/null && echo "Please install ghostscript. Exiting..." && exit $EXIT_MISSING_DEPENDENCY
! command -v java > /dev/null && echo "Please install java. Exiting..." && exit $EXIT_MISSING_DEPENDENCY


# ensure the right tesseract version is installed
# older versions are known to produce malformed hocr output and should not be used
# Even 3.02.01 fails in few cases (see issue #28). I decided to allow this version anyway because
# 3.02.02 is not yet available for some widespread linux distributions
reqtessversion="3.02.01"
tessversion=`tesseract -v 2>&1 | grep "tesseract" | sed s/[^0-9.]//g`
tesstooold=$(echo "`echo $tessversion | sed s/[.]//2`-`echo $reqtessversion | sed s/[.]//2` < 0" | bc)
[ "$tesstooold" -eq "1" ] \
	&& echo "Please install tesseract ${reqtessversion} or newer (currently installed version is ${tessversion})" && exit $EXIT_MISSING_DEPENDENCY

# ensure the right GNU parallel version is installed
# older version do not support -q flag (required to escape special characters)
reqparallelversion="20121122"
parallelversion=`parallel --minversion 0`
! parallel --minversion "$reqparallelversion" > /dev/null \
	&& echo "Please install GNU parallel ${reqparallelversion} or newer (currently installed version is ${parallelversion})" && exit $EXIT_MISSING_DEPENDENCY

# ensure pdftoppm is provided by poppler-utils, not the older xpdf version
! pdftoppm -v 2>&1 | grep -q 'Poppler' && echo "Please remove xpdf and install poppler-utils. Exiting..." && $EXIT_MISSING_DEPENDENCY


# Display the version of the tools if log level is LOG_DEBUG
if [ $VERBOSITY -ge $LOG_DEBUG ]; then
	echo "--------------------------------"
	echo "ImageMagick version:"
	identify --version
	echo "--------------------------------"
	echo "GNU Parallel version:"
	parallel --version
	echo "--------------------------------"
	echo "Poppler-utils version:"
	pdfimages -v
	pdftoppm -v
	pdffonts -v
	pdfseparate -v
	echo "--------------------------------"
	echo "unpaper version:"
	unpaper --version
	echo "--------------------------------"
	echo "tesseract version:"
	tesseract --version
	echo "--------------------------------"
	echo "python2 version:"
	python2 --version
	echo "--------------------------------"
	echo "Ghostscript version:"
	gs --version
	echo "--------------------------------"
	echo "Java version:"
	java -version
	echo "--------------------------------"
fi


# check if the language(s) passed to tesseract are all supported
for currentlan in `echo "$LAN" | sed 's/+/ /g'`; do
	if ! tesseract --list-langs 2>&1 | grep "^$currentlan\$" > /dev/null; then
		echo "The language \"$currentlan\" is not supported by tesseract."
		tesseract --list-langs 2>&1 | tr '\n' ' '; echo
		echo "Exiting..."
		exit $EXIT_BAD_ARGS
	fi
done


# Initialize path to temporary files using mktemp
# Goal: save tmp file in a sub-folder of the $TMPDIR environment variable (or in "/tmp" if unset)
# Unfortunately, Linux mktemp is not compatible with FreeBSD/OSX mktemp
# Linux version requires no arg
# FreeBSD requires '-t prefix' to be used so that $TMPDIR is taken into account
# But in Linux '-t template' is handled differently than in FreeBSD
# Therefore different calls must be used for Linux and for FreeBSD
prefix="$(date +"%Y%m%d_%H%M").filename.$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')"	# prefix made of date, time and pdf file name without extension
TMP_FLD=`mktemp -d 2>/dev/null || mktemp -d -t "${prefix}" 2>/dev/null`				# try Linux syntax first, if it fails try FreeBSD/OSX			
if [ $? -ne 0 ]; then
	if [ -z "$TMPDIR" ]; then
		echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"/tmp\" exists"
	else
		echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"$TMPDIR\" exists"
	fi
	exit $EXIT_FILE_ACCESS_ERROR
fi
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Created temporary folder: \"$TMP_FLD\""

FILE_TMP="${TMP_FLD}/tmp.txt"						# temporary file with a very short lifetime (may be used for several things)
FILE_PAGES_INFO="${TMP_FLD}/pages-info.txt"				# for each page: page #; width in pt; height in pt
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"			# log file containing the results of the validation of the PDF/A file


# get the size of each pdf page (width / height) in pt (i.e. inch/72)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Input file: Extracting size of each page (in pt)"
! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP" \
	&& echo "Could not get size of PDF pages. Exiting..." && exit $EXIT_BAD_INPUT_FILE
# removing empty lines (last one should be) and add page # before each line
sed '/^$/d' "$FILE_TMP" | awk '{printf "%04d %s\n", NR, $0}' > "$FILE_PAGES_INFO"
numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -f1 -d" "`

# process each page of the input pdf file
parallel --gnu -q -k --halt-on-error 1 "$OCR_PAGE" "$FILE_INPUT_PDF" "{}" "$numpages" "$TMP_FLD" \
	"$VERBOSITY" "$LAN" "$KEEP_TMP" "$PREPROCESS_DESKEW" "$PREPROCESS_CLEAN" "$PREPROCESS_CLEANTOPDF" "$OVERSAMPLING_DPI" \
	"$PDF_NOIMG" "$FORCE_OCR" "$SKIP_TEXT" "$TESS_CFG_FILES" < "$FILE_PAGES_INFO"
ret_code="$?"
[ $ret_code -ne 0 ] && exit $ret_code 

# concatenate all pages and convert the pdf file to match PDF/A format
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Concatenating all pages to the final PDF/A file" 
! gs -dQUIET -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor \
	-sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 \
	-sOutputFile="$FILE_OUTPUT_PDFA" "${TMP_FLD}/"*ocred*.pdf 1> /dev/null 2> /dev/null \
	&& echo "Could not concatenate all pages to the final PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR

# validate generated pdf file (compliance to PDF/A)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Output file: Checking compliance to PDF/A standard" 
! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG" \
	&& echo "Unexpected error while checking compliance to PDF/A file. Exiting..." && exit $EXIT_OTHER_ERROR
grep -i "Status|Message" "$FILE_VALIDATION_LOG" # summary of the validation
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""
# check the validation results
pdf_valid=1
grep -i 'ErrorMessage' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i 'Status.*not valid' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -i 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" && pdf_valid=0
! grep -i 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG" > /dev/null && echo "PDF file profile is not PDF/A-1" && pdf_valid=0
[ $pdf_valid -ne 1 ] && echo "Output file: The generated PDF/A file is INVALID"
[ $pdf_valid -eq 1 ] && [ $VERBOSITY -ge $LOG_INFO ] && echo "Output file: The generated PDF/A file is VALID"


# delete temporary files
if [ $KEEP_TMP -eq 0 ]; then
	[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Deleting temporary files"
	rm -r -f "${TMP_FLD}"
fi


END=`date +%s`
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Script took $(($END-$START)) seconds"


[ $pdf_valid -ne 1 ] && exit $EXIT_INVALID_OUTPUT_PDFA || exit 0