-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_corpus.sh
executable file
·132 lines (100 loc) · 2.56 KB
/
convert_corpus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env sh
#
# Converts Kiel Corpus files named in a list file by running a conversion
# script on each of them. See usage() for the supported options.

# Absolute location of this script; every default below is relative to it.
# `readlink -f` is not specified by POSIX and is unavailable on some systems,
# so fall back to resolving the containing directory with cd/pwd when it
# yields nothing.
SCRIPT=$( readlink -f "$0" 2>/dev/null )
if [ -z "$SCRIPT" ]; then
    # CDPATH is cleared so `cd` can neither jump elsewhere nor print a path.
    SCRIPT="$( CDPATH= cd -- "$( dirname -- "$0" )" && pwd -P )/$( basename -- "$0" )"
fi
SCRIPT_PATH=$( dirname "$SCRIPT" )

# Defaults used when the corresponding option (-c, -l, -r) is not given.
CONVERSION_SCRIPT_DEFAULT="${SCRIPT_PATH}/run_conversion_chain.sh"
CORPUS_FILES_LIST_DEFAULT="${SCRIPT_PATH}/corpus_files.txt"
CORPUS_ROOT_PATH_DEFAULT="${SCRIPT_PATH}/KielCorpus/"
# Print the command line synopsis and the option descriptions, including the
# current default value of each option, to stdout.
usage () {
    # One argument per output line; printf repeats the format for each.
    printf '%s\n' \
        "$0 [-h|--help] [-c CONVERSION_SCRIPT] [-l CORPUS_FILES_LIST] [-r CORPUS_ROOT_PATH]" \
        "Script that converts Kiel Corpus files based on a list of corpus files" \
        "using a conversion script." \
        "[-h|--help] this output" \
        "[-c CONVERSION_SCRIPT] specify conversion script" \
        "default: $CONVERSION_SCRIPT_DEFAULT" \
        "[-l CORPUS_FILES_LIST] specify list of corpus files" \
        "default: $CORPUS_FILES_LIST_DEFAULT" \
        "[-r CORPUS_ROOT_PATH] specify root path of Kiel Corpus" \
        "default: $CORPUS_ROOT_PATH_DEFAULT"
}
# Shut the script down: wait for any child processes, print a blank line,
# a "Stopping ..." notice and the current date, then exit.
clean_exit () {
    wait
    printf '\n%s\n' "Stopping ..."
    date
    # No explicit status: the exit status is left to the shell's default.
    exit
}
# Parse the command line.
#   -c VALUE   sets CONVERSION_SCRIPT
#   -l VALUE   sets CORPUS_FILES_LIST
#   -r VALUE   sets CORPUS_ROOT_PATH
#   -h|--help  prints the usage text and exits
# Any other argument is ignored. An option given without its value is
# reported on stderr and ends the script with status 1.
parse_args () {
    while [ $# -gt 0 ]; do
        case $1 in
            -c|-l|-r)
                # Without this check the two shifts below would run with only
                # one argument left, which fails in an unhelpful way.
                if [ $# -lt 2 ]; then
                    echo "ERROR: Option $1 requires an argument!" >&2
                    echo "Try '$0 --help' for more information." >&2
                    exit 1
                fi
                case $1 in
                    -c) CONVERSION_SCRIPT="$2" ;;
                    -l) CORPUS_FILES_LIST="$2" ;;
                    -r) CORPUS_ROOT_PATH="$2" ;;
                esac
                # Drop the option here; its value is dropped by the shift below.
                shift
                ;;
            -h|--help)
                usage
                exit
                ;;
            *)
                ;;
        esac
        shift
    done
}
parse_args "$@"
# Apply the default for every setting that was not given on the command line
# (unset or empty), then make sure the resulting path exists. The first
# missing path is reported and ends the script with status 1.

# Conversion script: any existing path is accepted.
: "${CONVERSION_SCRIPT:=$CONVERSION_SCRIPT_DEFAULT}"
[ -e "$CONVERSION_SCRIPT" ] || {
    echo "ERROR: Unable to find conversion script ${CONVERSION_SCRIPT}!"
    echo "Exitting ..."
    exit 1
}

# List of corpus files: any existing path is accepted.
: "${CORPUS_FILES_LIST:=$CORPUS_FILES_LIST_DEFAULT}"
[ -e "$CORPUS_FILES_LIST" ] || {
    echo "ERROR: Unable to find list of corpus files ${CORPUS_FILES_LIST}!"
    echo "Exitting ..."
    exit 1
}

# Corpus root: must be a directory.
: "${CORPUS_ROOT_PATH:=$CORPUS_ROOT_PATH_DEFAULT}"
[ -d "$CORPUS_ROOT_PATH" ] || {
    echo "ERROR: Unable to find corpus root path ${CORPUS_ROOT_PATH}!"
    echo "Exitting ..."
    exit 1
}
# Run $CONVERSION_SCRIPT on every file named in $CORPUS_FILES_LIST.
#
# The list is read line by line: one file name per line, relative to
# $CORPUS_ROOT_PATH. Leading/trailing whitespace is trimmed and blank lines
# are skipped; whitespace and glob characters inside a name are kept as-is.
# A failed conversion is reported together with the converter's combined
# stdout/stderr, and processing continues with the next file.
#
# Reads:  CONVERSION_SCRIPT, CORPUS_FILES_LIST, CORPUS_ROOT_PATH
# Writes: progress and error messages to stdout
convert_files () {
    # Count only lines that actually carry a file name.
    FILE_AMOUNT=$( grep -c '[^[:space:]]' "$CORPUS_FILES_LIST" )
    # we want to see some progress during processing files
    MSG_ALL_NR_FILES=50
    COUNTER=0

    date
    echo "Processing $FILE_AMOUNT Kiel Corpus files ..."
    # printf instead of `echo -n`, whose behaviour differs between sh variants.
    printf '... '

    # The list is read through fd 3 so the conversion script keeps this
    # script's stdin; `|| [ -n "$F" ]` handles a last line without newline.
    while read -r F <&3 || [ -n "$F" ]; do
        [ -n "$F" ] || continue

        FILE="${CORPUS_ROOT_PATH}/${F}"
        # Quoted so a script path containing spaces is run as one command;
        # fd 3 is closed for the child so it cannot consume the list.
        if ! OUT=$( "$CONVERSION_SCRIPT" "$FILE" 2>&1 3<&- ); then
            echo "ERROR: unable to convert ${FILE}!"
            printf '%s\n' "$OUT"
        fi

        # progress message after every MSG_ALL_NR_FILES processed files
        COUNTER=$(( COUNTER + 1 ))
        if [ $(( COUNTER % MSG_ALL_NR_FILES )) -eq 0 ]; then
            printf ' %s ...' "$COUNTER"
        fi
    done 3< "$CORPUS_FILES_LIST"
}

# On Ctrl-C (SIGINT) finish via clean_exit instead of dying silently.
trap clean_exit INT
convert_files
clean_exit