forked from fritz-hh/OCRmyPDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
OCRmyPDF.sh
316 lines (263 loc) · 14.7 KB
/
OCRmyPDF.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/bin/sh
##############################################################################
# Copyright (c) 2013-14: fritz-hh from Github (https://github.com/fritz-hh)
##############################################################################
# Determine the real path of this script, following symlinks if present.
# python2's os.path.realpath() does the resolution, so python2 must exist.
if ! command -v python2 > /dev/null; then
  echo "Please install python v2.x. Exiting..."
  exit 1
fi

# $0 is handed to python as an argument (sys.argv[1]) instead of being pasted
# into the python source: a script path containing quotes or backslashes can
# therefore neither break nor inject into the python code.
# Every expansion is quoted so that a path containing whitespace is not split
# into several words before reaching dirname.
BASEPATH="$(dirname -- "$(python2 -c 'import os, sys; print os.path.realpath(sys.argv[1])' "$0")")"

# Import required scripts
. "$BASEPATH/src/config.sh"

# Set variables corresponding to the input parameters
ARGUMENTS="$*"      # all arguments joined into one string (echoed in debug mode)
START=$(date +%s)   # start time, used to report the run time at the end
#################################################
# Print the help text of the tool to stdout: purpose, copyright, version,
# command line synopsis and a description of every option and of the two
# mandatory arguments.
#
# The here-document delimiter (EOF) is not quoted, so $VERSION is expanded
# at the time the text is printed.
# NOTE(review): VERSION is not set in this file; presumably it comes from
# the sourced src/config.sh -- confirm.
#
# Params : none
# Returns: the exit status of cat
#################################################
usage() {
cat << EOF
--------------------------------------------------------------------------------------
Script aimed at generating a searchable PDF file from a PDF file containing only images.
(The script performs optical character recognition of each respective page using the
tesseract engine)
Copyright: fritz-hh from Github (https://github.com/fritz-hh)
Version: $VERSION
Usage: OCRmyPDF.sh [-h] [-v] [-g] [-k] [-d] [-c] [-i] [-o dpi] [-f|-s] [-l lan1[+lan2...]] [-C filename] inputfile outputfile
-h : Display this help message
-v : Increase the verbosity (this option can be used more than once) (e.g. -vvv)
-k : Do not delete the temporary files
-g : Activate debug mode:
- Generates a PDF file containing each page twice (once with the image, once without the image
but with the OCRed text as well as the detected bounding boxes)
- Set the verbosity to the highest possible
- Do not delete the temporary files
-d : Deskew each page before performing OCR
-c : Clean each page before performing OCR
-i : Incorporate the cleaned image in the final PDF file (by default the original image, or the deskewed image if the -d option is set)
-o : If the resolution of an image is lower than dpi value provided as argument, provide the OCR engine with
an oversampled image having the latter dpi value. This can improve the OCR results but can lead to a larger output PDF file.
(default: no oversampling performed)
-f : Force to OCR the whole document, even if some page already contain font data.
(which should not be the case for PDF files built from scnanned images)
Any text data will be rendered to raster format and then fed through OCR.
-s : If pages contain font data, do not OCR that page, but include the page (as is) in the final output.
-l : Language(s) of the PDF file. The language should be set correctly in order to get good OCR results.
Any language supported by tesseract is supported (Tesseract uses 3-character ISO 639-2 language codes)
Multiple languages may be specified, separated by '+' characters.
(The default language is defined in the config file)
-C : Pass an additional configuration file to the tesseract OCR engine.
(this option can be used more than once)
Note 1: The configuration file must be available in the "tessdata/configs" folder of your tesseract installation
inputfile : PDF file to be OCRed
outputfile : The PDF/A file that will be generated
--------------------------------------------------------------------------------------
EOF
}
#################################################
# Get an absolute path from a relative path to a file
#
# The file itself does not need to exist, only the folder containing it.
# The resulting path is written to stdout.
#
# Param1 : Relative (or absolute) path to a file
# Returns: 1 if the folder in which the file is located does not exist
#            (or cannot be entered); nothing is printed in that case
#          0 otherwise
#################################################
absolutePath() {
  local dirpart filepart absdir

  # "--" protects against paths starting with a dash
  dirpart="$(dirname -- "$1")" || return 1
  filepart="$(basename -- "$1")" || return 1

  # The cd happens inside the command substitution (a subshell), so the
  # working directory of the caller is never modified and there is no need
  # to cd back afterwards. CDPATH is emptied so that a relative folder name
  # cannot be resolved against an unrelated directory.
  absdir="$(CDPATH= cd -- "$dirpart" 2> /dev/null && pwd)" || return 1

  # printf is used instead of echo: echo may interpret backslash sequences
  # contained in the path, depending on the shell running the script.
  case "$absdir" in
    /) printf '%s\n' "/$filepart" ;;          # avoid "//file" for files located in "/"
    *) printf '%s\n' "$absdir/$filepart" ;;
  esac
  return 0
}
# Default values of the configuration parameters.
# Each of them can be overridden by a command line option (see getopts loop).
# Unless stated otherwise, flags use 0 for "no" and 1 for "yes".

# -- logging / language ------------------------------------------------------
# default verbosity level
VERBOSITY=$LOG_ERR
# default language(s) of the PDF file (required to get good OCR results)
LAN=$DEFAULT_LANGUAGES

# -- temporary files / debug output -------------------------------------------
# keep the temporary files
KEEP_TMP=0
# generate each PDF page twice, with and without image
PDF_NOIMG=0

# -- image preprocessing -------------------------------------------------------
# deskew image
PREPROCESS_DESKEW=0
# clean image to improve OCR
PREPROCESS_CLEAN=0
# put cleaned image in final PDF
PREPROCESS_CLEANTOPDF=0
# dpi value under which oversampling should be performed (0 = no oversampling)
OVERSAMPLING_DPI=0

# -- handling of pages that already contain font data -------------------------
# 1 = OCR the whole document, even if some page already contain font data
FORCE_OCR=0
# 1 = skip text pages
SKIP_TEXT=0

# -- tesseract -----------------------------------------------------------------
# list of additional configuration files to be used by tesseract
TESS_CFG_FILES=
# Parse the optional command line arguments.
# The leading ':' of the option string puts getopts in silent mode: an unknown
# option is reported as '?' and a missing option argument as ':' (in both
# cases OPTARG holds the offending option character), which lets the script
# print its own error messages.
while getopts ":hvgkdcio:fsl:C:" opt; do
  case "$opt" in
    h)
      usage
      exit 0
      ;;
    v)
      # may be given several times; every occurrence raises the level by one
      VERBOSITY=$(( $VERBOSITY + 1 ))
      ;;
    g)
      # debug mode: text-only page copies, highest verbosity, keep tmp files
      PDF_NOIMG="1"
      VERBOSITY="$LOG_DEBUG"
      KEEP_TMP="1"
      ;;
    k)
      KEEP_TMP="1"
      ;;
    d)
      PREPROCESS_DESKEW="1"
      ;;
    c)
      PREPROCESS_CLEAN="1"
      ;;
    i)
      PREPROCESS_CLEANTOPDF="1"
      ;;
    o)
      OVERSAMPLING_DPI="$OPTARG"
      ;;
    f)
      FORCE_OCR="1"
      ;;
    s)
      SKIP_TEXT="1"
      ;;
    l)
      LAN="$OPTARG"
      ;;
    C)
      # may be given several times; the newest file is put in front of the list
      TESS_CFG_FILES="$OPTARG $TESS_CFG_FILES"
      ;;
    :)
      echo "Option -$OPTARG requires an argument"
      usage
      exit $EXIT_BAD_ARGS
      ;;
    \?)
      echo "Invalid option: -$OPTARG"
      usage
      exit $EXIT_BAD_ARGS
      ;;
  esac
done

# Drop the options parsed above so that only the mandatory arguments remain.
shift "$(( OPTIND - 1 ))"
# Check if the number of mandatory parameters provided is as expected
if [ "$#" -ne 2 ]; then
  echo "Exactly two mandatory argument shall be provided ($# arguments provided)"
  usage
  exit $EXIT_BAD_ARGS
fi

# Ensure that -f and -s are not both set
# (two separate tests joined by && replace the obsolescent "-a" operator of [ )
if [ "$SKIP_TEXT" -eq 1 ] && [ "$FORCE_OCR" -eq 1 ]; then
  echo "Options -f and -s are mutually exclusive; choose one or the other"
  usage
  exit $EXIT_BAD_ARGS
fi

# Input file: must be an existing regular file
if [ ! -f "$1" ]; then
  echo "The input file does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi
FILE_INPUT_PDF="$(absolutePath "$1")"

# Output file: its folder must exist, and the path must neither denote a
# folder nor an already existing file.
# absolutePath fails if the folder of the output file does not exist, so a
# single call both validates the folder and yields the absolute path.
if ! FILE_OUTPUT_PDFA="$(absolutePath "$2")"; then
  echo "The folder in which the output file should be generated does not exist. Exiting..."
  exit $EXIT_BAD_ARGS
fi
if [ -d "$2" ]; then
  # (message previously contained a stray non-ASCII character: "Exitíng")
  echo "Please enter the path of the file to be generated (and not a path to a folder). Exiting..."
  exit $EXIT_BAD_ARGS
fi
if [ -f "$2" ]; then
  echo "The output file already exists. Exiting..."
  exit $EXIT_BAD_ARGS
fi
# Work from the folder containing the script
cd "$BASEPATH"

if [ $VERBOSITY -ge $LOG_DEBUG ]; then
  echo "$TOOLNAME version: $VERSION"
  echo "Arguments: $ARGUMENTS"
  echo "Checking if all dependencies are installed"
fi

# Verify that every required utility can be found; stop at the first one missing.

# ImageMagick
command -v identify > /dev/null \
  || { echo "Please install ImageMagick. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# GNU Parallel
command -v parallel > /dev/null \
  || { echo "Please install GNU Parallel. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# poppler-utils
command -v pdfimages > /dev/null \
  || { echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
command -v pdftoppm > /dev/null \
  || { echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
command -v pdffonts > /dev/null \
  || { echo "Please install poppler-utils. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
command -v pdfseparate > /dev/null \
  || { echo "Please install or update poppler-utils to at least 0.24.5. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# unpaper is only needed when page cleaning (-c) was requested
if [ $PREPROCESS_CLEAN -eq 1 ]; then
  command -v unpaper > /dev/null \
    || { echo "Please install unpaper. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
fi

# tesseract
command -v tesseract > /dev/null \
  || { echo "Please install tesseract and tesseract-data. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# python libraries: lxml must be importable, and the reportlab check exits
# non-zero unless reportlab.Version compares >= "3.0"
python2 -c 'import lxml' 2>/dev/null \
  || { echo "Please install the python library lxml. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
python2 -c 'import sys, reportlab; (getattr(reportlab, "Version", "0.0") >= "3.0") or sys.exit(1)' 2>/dev/null \
  || { echo "Please install the python library reportlab. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# ghostscript
command -v gs > /dev/null \
  || { echo "Please install ghostscript. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }

# java
command -v java > /dev/null \
  || { echo "Please install java. Exiting..." && exit $EXIT_MISSING_DEPENDENCY; }
# ensure the right tesseract version is installed
# older versions are known to produce malformed hocr output and should not be used
# Even 3.02.01 fails in few cases (see issue #28). I decided to allow this version anyway because
# 3.02.02 is not yet available for some widespread linux distributions
reqtessversion="3.02.01"
# keep only digits and dots of the line(s) mentioning "tesseract"
# (the sed scripts are quoted so that the shell cannot glob-expand them)
tessversion="$(tesseract -v 2>&1 | grep "tesseract" | sed 's/[^0-9.]//g')"
# Dropping the second dot turns "x.yy.zz" into the decimal number "x.yyzz",
# which bc can compare. bc prints 1 if the installed version is lower than
# the required one, 0 otherwise.
tesstooold="$(echo "$(echo "$tessversion" | sed 's/[.]//2')-$(echo "$reqtessversion" | sed 's/[.]//2') < 0" | bc)"
# String comparison: if bc printed nothing (e.g. version string that could not
# be parsed) the test is simply false instead of raising a "[" error.
if [ "$tesstooold" = "1" ]; then
  echo "Please install tesseract ${reqtessversion} or newer (currently installed version is ${tessversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi

# ensure the right GNU parallel version is installed
# older version do not support -q flag (required to escape special characters)
reqparallelversion="20121122"
parallelversion="$(parallel --minversion 0)"
if ! parallel --minversion "$reqparallelversion" > /dev/null; then
  echo "Please install GNU parallel ${reqparallelversion} or newer (currently installed version is ${parallelversion})"
  exit $EXIT_MISSING_DEPENDENCY
fi

# ensure pdftoppm is provided by poppler-utils, not the older xpdf version
# (the "exit" keyword was missing here: the exit code was run as a command and
# the script carried on with the wrong pdftoppm)
if ! pdftoppm -v 2>&1 | grep -q 'Poppler'; then
  echo "Please remove xpdf and install poppler-utils. Exiting..."
  exit $EXIT_MISSING_DEPENDENCY
fi
# In debug mode (verbosity of at least LOG_DEBUG), print the version reported
# by every external tool, each section being framed by a dashed line.
[ $VERBOSITY -ge $LOG_DEBUG ] && {
  printf '%s\n' "--------------------------------" "ImageMagick version:"
  identify --version

  printf '%s\n' "--------------------------------" "GNU Parallel version:"
  parallel --version

  printf '%s\n' "--------------------------------" "Poppler-utils version:"
  pdfimages -v
  pdftoppm -v
  pdffonts -v
  pdfseparate -v

  printf '%s\n' "--------------------------------" "unpaper version:"
  unpaper --version

  printf '%s\n' "--------------------------------" "tesseract version:"
  tesseract --version

  printf '%s\n' "--------------------------------" "python2 version:"
  python2 --version

  printf '%s\n' "--------------------------------" "Ghostscript version:"
  gs --version

  printf '%s\n' "--------------------------------" "Java version:"
  java -version

  printf '%s\n' "--------------------------------"
}
# check if the language(s) passed to tesseract are all supported
# The list of languages is queried once and reused for every requested language.
supported_langs="$(tesseract --list-langs 2>&1)"

# $LAN holds one or more language codes separated by '+'. It is taken apart
# with parameter expansion (no unquoted expansion), so that unusual characters
# in the value can never be glob-expanded by the shell.
remaining_langs="$LAN"
while [ -n "$remaining_langs" ]; do
  currentlan="${remaining_langs%%+*}"     # text before the first '+'
  case "$remaining_langs" in
    *+*) remaining_langs="${remaining_langs#*+}" ;;   # text after the first '+'
    *) remaining_langs="" ;;
  esac
  [ -z "$currentlan" ] && continue        # ignore empty items ("eng++deu", "eng+")

  # -x -F: the whole line must equal the language code, compared as a fixed
  # string. The code is therefore never interpreted as a regular expression
  # (with a regex, "e.g" would have matched "eng").
  if ! printf '%s\n' "$supported_langs" | grep -q -x -F -e "$currentlan"; then
    echo "The language \"$currentlan\" is not supported by tesseract."
    printf '%s\n' "$supported_langs" | tr '\n' ' '; echo
    echo "Exiting..."
    exit $EXIT_BAD_ARGS
  fi
done
# Create the folder holding all temporary files with mktemp.
# Goal: have it created below $TMPDIR (or below "/tmp" if TMPDIR is unset).
# Linux and FreeBSD/OSX mktemp are not compatible with each other:
#   - the Linux version needs no argument
#   - the FreeBSD version needs '-t prefix' for $TMPDIR to be taken into account
#   - '-t template' is handled differently by the Linux version
# Hence two different calls, the Linux flavour being tried first.

# prefix made of date, time and pdf file name without extension
prefix="$(date +"%Y%m%d_%H%M").filename.$(basename "$FILE_INPUT_PDF" | sed 's/[.][^.]*$//')"

if ! TMP_FLD=`mktemp -d 2>/dev/null || mktemp -d -t "${prefix}" 2>/dev/null`; then
  # both flavours failed: name the folder that was expected to exist
  if [ -n "$TMPDIR" ]; then
    echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"$TMPDIR\" exists"
  else
    echo "Could not create folder for temporary files. Please ensure you have sufficient right and \"/tmp\" exists"
  fi
  exit $EXIT_FILE_ACCESS_ERROR
fi
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Created temporary folder: \"$TMP_FLD\""

# Files located in the temporary folder:
# temporary file with a very short lifetime (may be used for several things)
FILE_TMP="${TMP_FLD}/tmp.txt"
# for each page: page #; width in pt; height in pt
FILE_PAGES_INFO="${TMP_FLD}/pages-info.txt"
# log file containing the results of the validation of the PDF/A file
FILE_VALIDATION_LOG="${TMP_FLD}/pdf_validation.log"
# Collect the size (width / height) of every page of the input file in pt (i.e. inch/72)
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Input file: Extracting size of each page (in pt)"
if ! identify -format "%w %h\n" "$FILE_INPUT_PDF" > "$FILE_TMP"; then
  echo "Could not get size of PDF pages. Exiting..." && exit $EXIT_BAD_INPUT_FILE
fi

# Skip the empty lines (the last one should be empty) and put a zero padded
# page number in front of every remaining line.
awk '$0 != "" { printf "%04d %s\n", ++page, $0 }' "$FILE_TMP" > "$FILE_PAGES_INFO"

# The page number found on the last line is the number of pages
numpages=`tail -n 1 "$FILE_PAGES_INFO" | cut -d " " -f 1`

# Process every page of the input pdf file: GNU parallel starts "$OCR_PAGE"
# once per line of the page info file; the line itself replaces "{}".
parallel --gnu -q -k --halt-on-error 1 \
  "$OCR_PAGE" \
  "$FILE_INPUT_PDF" \
  "{}" \
  "$numpages" \
  "$TMP_FLD" \
  "$VERBOSITY" \
  "$LAN" \
  "$KEEP_TMP" \
  "$PREPROCESS_DESKEW" \
  "$PREPROCESS_CLEAN" \
  "$PREPROCESS_CLEANTOPDF" \
  "$OVERSAMPLING_DPI" \
  "$PDF_NOIMG" \
  "$FORCE_OCR" \
  "$SKIP_TEXT" \
  "$TESS_CFG_FILES" \
  < "$FILE_PAGES_INFO"
ret_code="$?"

# Give up with the exit code of parallel if the processing failed
if [ $ret_code -ne 0 ]; then
  exit $ret_code
fi
# concatenate all pages and convert the pdf file to match PDF/A format
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Concatenating all pages to the final PDF/A file"
# the glob picks up every per-page PDF of the temporary folder whose name contains "ocred"
if ! gs -dQUIET -dPDFA -dBATCH -dNOPAUSE -dUseCIEColor \
  -sProcessColorModel=DeviceCMYK -sDEVICE=pdfwrite -sPDFACompatibilityPolicy=2 \
  -sOutputFile="$FILE_OUTPUT_PDFA" "${TMP_FLD}/"*ocred*.pdf 1> /dev/null 2> /dev/null; then
  echo "Could not concatenate all pages to the final PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi

# validate generated pdf file (compliance to PDF/A)
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "Output file: Checking compliance to PDF/A standard"
if ! java -jar "$JHOVE" -c "$JHOVE_CFG" -m PDF-hul "$FILE_OUTPUT_PDFA" 2> /dev/null 1> "$FILE_VALIDATION_LOG"; then
  echo "Unexpected error while checking compliance to PDF/A file. Exiting..."
  exit $EXIT_OTHER_ERROR
fi

# summary of the validation
# -E is required: in a basic regular expression '|' is an ordinary character,
# so the former pattern "Status|Message" never matched and nothing was printed.
grep -E -i 'Status|Message' "$FILE_VALIDATION_LOG"
[ "$VERBOSITY" -ge "$LOG_DEBUG" ] && echo "The full validation log is available here: \"$FILE_VALIDATION_LOG\""

# check the validation results
# The checks below are quiet (-q): every line they can match contains "Status"
# or "Message" and has therefore already been printed by the summary above.
pdf_valid=1
grep -q -i 'ErrorMessage' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -q -i 'Status.*not valid' "$FILE_VALIDATION_LOG" && pdf_valid=0
grep -q -i 'Status.*Not well-formed' "$FILE_VALIDATION_LOG" && pdf_valid=0
if ! grep -q -i 'Profile:.*PDF/A-1' "$FILE_VALIDATION_LOG"; then
  echo "PDF file profile is not PDF/A-1"
  pdf_valid=0
fi

[ "$pdf_valid" -ne 1 ] && echo "Output file: The generated PDF/A file is INVALID"
[ "$pdf_valid" -eq 1 ] && [ "$VERBOSITY" -ge "$LOG_INFO" ] && echo "Output file: The generated PDF/A file is VALID"
# Remove the temporary folder, unless the user asked to keep it (-k or -g)
if [ $KEEP_TMP -eq 0 ]; then
  [ $VERBOSITY -ge $LOG_DEBUG ] && echo "Deleting temporary files"
  rm -r -f "${TMP_FLD}"
fi

# Report the run time in debug mode
END=`date +%s`
[ $VERBOSITY -ge $LOG_DEBUG ] && echo "Script took $(($END-$START)) seconds"

# The exit code tells whether the generated file passed the PDF/A validation
if [ $pdf_valid -ne 1 ]; then
  exit $EXIT_INVALID_OUTPUT_PDFA
fi
exit 0