-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathscan_perpage
executable file
·144 lines (128 loc) · 4.74 KB
/
scan_perpage
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
# Usage: scan_perpage <imagefile>
# where imagefile is the data just scanned
# (specify this script to scanadf via -S)
# Print the invocation synopsis followed by the list of environment
# variables that configure this script. Output goes to stdout.
usage() {
  printf '%s\n' \
    "Usage: $0 <imagefile>" \
    "Set the following environment variables:" \
    " UNPAPER" \
    " SEARCHABLE" \
    " LANGUAGE" \
    " RESOLUTION" \
    " PGWIDTHIN" \
    " PGHEIGHTIN" \
    " SKIP_EMPTY_PAGES" \
    " BRIGHTNESS_CONTRAST" \
    " PS2PDF_OPTS (optional)" \
    " VERBOSE (optional)" \
    " LOCKFILE (required if VERBOSE=1)"
}
# Write a diagnostic line, prefixed with the script name, to stdout.
# Silent unless VERBOSE is exactly 1.
#   $1 - message text
log() {
  [[ $VERBOSE == 1 ]] || return 0
  printf 'scan_perpage: %s\n' "$1"
}
# Pipeline sink for tool output: pass stdin through to stdout when
# VERBOSE is exactly 1, otherwise consume and discard it.
logstdout() {
  case "$VERBOSE" in
    1) cat ;;
    *) cat > /dev/null ;;
  esac
}
# Run the given command line. When an executable `sem` is found on PATH the
# command is run through it in the foreground under the shared
# "scan_perpage" semaphore id; otherwise the command is executed directly.
#   $@ - command and its arguments
# Returns the exit status of whichever invocation was used.
runconstrained() {
  local sem_path
  sem_path=$(command -v sem)

  if [[ ! -x "$sem_path" ]]; then
    # No semaphore tool available: run unconstrained.
    "$@"
    return
  fi

  # use up to 75% of the cores available
  sem --jobs 75% --id scan_perpage --fg "$@"
}
# --- Argument and environment validation -----------------------------------
# Exactly one positional argument (the scanned image) is needed. Compare
# numerically: inside [[ ]] the < operator is a string comparison.
if (( $# < 1 )); then
  usage
  exit 1
fi
# These settings have no defaults and must be provided by the caller.
if [[ -z "$UNPAPER" || -z "$SEARCHABLE" || -z "$RESOLUTION" || -z "$SKIP_EMPTY_PAGES" ]]; then
  usage
  exit 1
fi

# Split the image path once; quoted so paths containing whitespace survive.
IMAGE_PATH=$1
IMAGE_DIR=$(dirname -- "$1")
IMAGE_FILE=$(basename -- "$1")

# In verbose mode every external tool is prefixed with `time`.
TIMEVERBOSE=
if [[ $VERBOSE == 1 ]]; then
  TIMEVERBOSE=time
fi
# Record a failed processing step: log it and remember the first non-zero
# exit code in the global `status`.
#   $1 - short step name used in the log message
#   $2 - exit code of the failed step
_pp_fail() {
  log "...$1 failed with exit code $2"
  if (( status == 0 )); then
    status=$2
  fi
}

# Run one tool through runconstrained (prefixed with $TIMEVERBOSE when set),
# feed its stdout to logstdout, and record a failure of the tool itself.
# The pipeline's own exit status is that of logstdout, so the tool's status
# is taken from PIPESTATUS instead.
#   $1   - short step name used in the log message
#   $2.. - command and arguments
# Returns the tool's exit code.
_pp_step() {
  local step=$1 rc
  shift
  runconstrained ${TIMEVERBOSE:+"$TIMEVERBOSE"} "$@" | logstdout
  rc=${PIPESTATUS[0]}
  if (( rc != 0 )); then
    _pp_fail "$step" "$rc"
  fi
  return "$rc"
}

# Post-process the scanned page identified by IMAGE_PATH / IMAGE_DIR /
# IMAGE_FILE and turn it into a PDF next to the image.
#
# Reads: UNPAPER, SEARCHABLE, LANGUAGE, RESOLUTION, PGWIDTHIN, PGHEIGHTIN,
#        SKIP_EMPTY_PAGES, WHITE_THRESHOLD, BRIGHTNESS_CONTRAST, PS2PDF_OPTS,
#        VERBOSE, TIMEVERBOSE.
# Writes: the global `status` (0, or the exit code of the first failed step).
# Returns: the same value as `status`.
#
# As before, every step is attempted even if an earlier one failed, and the
# source image plus all intermediate files are removed at the end.
process_page() {
  local percentage_white=0 skip_page=0 pp_prefix="" pp_file tiff_file ps_file
  local -a unpaper_verbose=() pnm_verbose=() page_opts=() lang_opts=()

  # Deliberately global: the code invoking process_page exits with $status.
  status=0

  log ""
  log "-------------------------------------------------------------------------------"
  log "Post-processing scanned page ${IMAGE_PATH}, deskew=$UNPAPER, searchable=$SEARCHABLE, skip-empty-pages=$SKIP_EMPTY_PAGES, white-threshold=$WHITE_THRESHOLD, brightness-contrast-sw=$BRIGHTNESS_CONTRAST..."
  log "-------------------------------------------------------------------------------"

  if [[ -n "$BRIGHTNESS_CONTRAST" ]]; then
    log "Adjust brightness and contrast in ImageMagick by $BRIGHTNESS_CONTRAST"
    # The image is adjusted in place.
    convert "$IMAGE_PATH" -brightness-contrast "$BRIGHTNESS_CONTRAST" "$IMAGE_PATH" \
      || _pp_fail "brightness/contrast adjustment" $?
  fi

  if [[ $SKIP_EMPTY_PAGES == 1 ]]; then
    # ImageMagick prints 100*mean of the thresholded image; the value is
    # treated as the percentage of white. A failed measurement counts as 0,
    # i.e. the page is kept.
    percentage_white=$(convert "$IMAGE_PATH" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:) || percentage_white=0
    log "$IMAGE_PATH has $percentage_white % white"
    # Without a threshold there is nothing to compare against; skip the bc
    # call instead of feeding it an incomplete expression.
    if [[ -n "$WHITE_THRESHOLD" && $(echo "${percentage_white:-0} > $WHITE_THRESHOLD" | bc -l) == 1 ]]; then
      skip_page=1
    fi
  fi

  if (( skip_page )); then
    log "Skipping empty page $IMAGE_FILE with white percentage $percentage_white"
  else
    if [[ $UNPAPER == 1 ]]; then
      log "Applying unpaper post-processing to image data..."
      pp_prefix="unpaper-"
      if [[ $VERBOSE == 1 ]]; then
        unpaper_verbose=(-v)
      fi
      # A variant adding --no-mask-scan and --no-blackfilter was used here
      # previously and is intentionally not active.
      _pp_step "unpaper" unpaper "${unpaper_verbose[@]}" --overwrite --deskew-scan-range=35 \
        --dpi "$RESOLUTION" "$IMAGE_PATH" "$IMAGE_DIR/$pp_prefix$IMAGE_FILE"
    fi

    # Input for the PDF conversion: the unpaper output if one was requested,
    # otherwise the scanned image itself.
    pp_file="$IMAGE_DIR/$pp_prefix$IMAGE_FILE"

    if [[ $SEARCHABLE == 1 ]]; then
      log "Converting image data to searchable pdf..."
      # tesseract uses the input's DPI header, we need to convert to a format that supports this (like tiff)
      tiff_file="$pp_file.tiff"
      log "...Running convert"
      _pp_step "convert" convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch \
        "$pp_file" "$tiff_file"
      log "...Running tesseract"
      # Only pass -l when a language is configured; an empty value would
      # otherwise make "pdf" the argument of -l.
      if [[ -n "$LANGUAGE" ]]; then
        lang_opts=(-l "$LANGUAGE")
      fi
      _pp_step "tesseract" tesseract "$tiff_file" "$IMAGE_DIR/${IMAGE_FILE%.*}" "${lang_opts[@]}" pdf
      if [[ -f "$tiff_file" ]]; then
        rm -- "$tiff_file"
      fi
    else
      log "Converting image data to pdf..."
      if [[ -z "$PGWIDTHIN" || -z "$PGHEIGHTIN" ]]; then
        page_opts=(-equalpixels "-dpi=$RESOLUTION" -noturn)
      else
        page_opts=(-imagewidth "$PGWIDTHIN" -imageheight "$PGHEIGHTIN")
      fi
      # older versions of pnmtops (Ubuntu, Debian) don't have the -verbose option, test for it
      if [[ $VERBOSE == 1 && ! "$(pnmtops -verbose 2>&1 < /dev/null)" =~ "unrecognized option" ]]; then
        pnm_verbose=(-verbose)
      fi
      ps_file="$IMAGE_DIR/${IMAGE_FILE}.ps"
      log "...Running pnmtops on $pp_file using page options: ${page_opts[*]}"
      # stdout is the PostScript itself, so it goes to the file rather than
      # through logstdout.
      runconstrained ${TIMEVERBOSE:+"$TIMEVERBOSE"} pnmtops "${pnm_verbose[@]}" "${page_opts[@]}" \
        "$pp_file" > "$ps_file" || _pp_fail "pnmtops" $?
      log "...Running ps2pdf on $ps_file"
      # PS2PDF_OPTS may hold several options, so it is split on purpose.
      # shellcheck disable=SC2086
      _pp_step "ps2pdf" ps2pdf $PS2PDF_OPTS "$ps_file" "$IMAGE_DIR/${IMAGE_FILE}.pdf"
      if [[ -f "$ps_file" ]]; then
        rm -- "$ps_file"
      fi
    fi
  fi

  # Cleanup: drop the raw scan and, if present, the unpaper output.
  pp_file="$IMAGE_DIR/$pp_prefix$IMAGE_FILE"
  rm -- "$IMAGE_PATH"
  if [[ -f "$pp_file" ]]; then
    rm -- "$pp_file"
  fi

  log ""
  log "Scan page processing done, status = $status"
  return "$status"
}
# --- Entry point -----------------------------------------------------------
# In verbose mode the page is processed while holding an exclusive lock on
# LOCKFILE (file descriptor 200). process_page then runs in a subshell, so
# its status has to be handed back explicitly through the subshell's exit
# code; a variable set inside the subshell is not visible out here.
if [[ $VERBOSE == 1 ]]; then
  if [[ -z "$LOCKFILE" ]]; then
    echo "scan_perpage: LOCKFILE must be set when VERBOSE=1" >&2
    exit 1
  fi
  (
    flock 200
    process_page
    exit "${status:-0}"
  ) 200>"$LOCKFILE"
  status=$?
else
  process_page
fi
exit "${status:-0}"