@@ -4,15 +4,30 @@ if [[ "$2" == "" ]]; then
4
4
echo " $( basename $0 ) DIR NTHREADS [OUTPUT]" 1>&2
5
5
echo " Reads files in DIR and processes them using NTHREADS parallel sorts." 1>&2
6
6
echo " Files are processed as input files unless OUTPUT is specified." 1>&2
7
+ echo " FILES MUST END WITH A NEWLINE. Fix them with \" sed -i -e '\$ a\\ ' *\" ." 1>&2
7
8
exit 1
8
9
fi
9
10
10
11
# Positional arguments, in the order shown by the usage text:
# DIR NTHREADS [OUTPUT]. OUTPUT stays empty when it is not given.
DIR="${1}"
NTHREADS="${2}"
OUTPUT="${3}"
13
14
15
# Succeeds (status 0) when the last byte of the file named by $1 is a newline.
# Arguments: $1 - path of the file to inspect.
# Returns:   0 if the file ends with "\n"; non-zero otherwise, which includes
#            an empty file and a file that tail cannot read.
file_ends_with_newline() {
  # tail -c1 emits only the final byte and wc -l counts newline characters,
  # so the count is 1 exactly when that byte is a newline. The path is quoted
  # verbatim (no padding) and guarded with -- so names containing spaces or
  # starting with a dash are handled.
  [[ $(tail -c1 -- "$1" | wc -l) -gt 0 ]]
}
18
+
14
19
# Write the path of every regular file under DIR into a temporary list file,
# one path per line.
FILES=$(mktemp)
find "$DIR" -type f > "$FILES"

# Check that all files end with a newline; abort on the first one that does not.
# IFS= and -r keep each path exactly as find printed it (no trimming of
# leading/trailing whitespace, no backslash processing).
while IFS= read -r FILE; do
  if ! file_ends_with_newline "$FILE"; then
    echo " File $FILE does not end with a newline" 1>&2
    exit 1
  fi
done < "$FILES"

# Number of listed files (one line per path in the list).
NFILES=$(wc -l < "$FILES")
17
32
18
33
# To avoid empty splits, there must be at least as many files as threads
@@ -26,19 +41,12 @@ SPLITBASE=$(mktemp)
26
41
# Divide the list of input files into NTHREADS line-based chunks whose names
# start with SPLITBASE, then collect the chunk names into SPLITS.
split -n l/"$NTHREADS" "$FILES" "$SPLITBASE"
SPLITS=$(for file in "${SPLITBASE}"?*; do printf '%s\n' "$file"; done)

# For every chunk, create a FIFO named <chunk>.pipe and start a background
# pipeline that writes its sorted output into that FIFO.
# SPLITS is expanded unquoted on purpose: it is a whitespace-separated list.
for SPLIT in $SPLITS; do
  mkfifo "$SPLIT.pipe"
  # In each pipeline: xargs hands the chunk's paths (one per line) to tail, so
  # paths with spaces stay intact and an empty chunk runs nothing (-r).
  # tail -q -n+2 drops the first line (labels) of every file and prints no
  # per-file headers.
  if [[ -n "$OUTPUT" ]]; then
    # OUTPUT given: take fields 2, 7 and 10 and keep the first two of those
    # for rows whose third selected field equals 0.
    (xargs -r -d '\n' tail -q -n+2 -- < "$SPLIT" \
      | cut -f2,7,10 \
      | awk ' { if ($3 == 0) print $1 "\t" $2 }' \
      | LC_ALL=C sort -S2G > "$SPLIT.pipe") &
  else
    # No OUTPUT: take fields 7 and 13 and print them in swapped order.
    (xargs -r -d '\n' tail -q -n+2 -- < "$SPLIT" \
      | cut -f7,13 \
      | awk ' { print $2 "\t" $1 }' \
      | LC_ALL=C sort -S2G > "$SPLIT.pipe") &
  fi
done
44
52
0 commit comments