-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.sh
75 lines (61 loc) · 1.68 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
# credits:
# - https://www.biostars.org/p/78929/
FILE=$1
if [[ "$FILE" == "" ]];then
echo Missing gz file name!
exit 0
fi
if [[ -f $FILE ]];then
echo "$FILE exists"
else
echo "$FILE doesn't exist"
exit 0
fi
echo
echo Preprocessing :
echo
echo Splitting $FILE into individual VCFs using PERL
echo
echo Step 1 : creating common and private txt files -
echo This may take a while...
time zcat $FILE | perl -lane '
if (/^#/) { if (/^##/) { print STDERR } else {
print STDERR join "\t", @F[0..8]; @samples = @F;
} } else {
print STDERR join "\t", @F[0..8];
for ($i = 9; $i <= $#F; $i++) {
if ($F[$i] =~ /^..[1-9]/) {
print STDOUT join "\t", $samples[$i], $lc, $F[$i];
} } } $lc++' 2> vcfs/_vcf.common.txt | sort -k1,1 -k2,2n > vcfs/_vcf.private.txt
echo Step 2 : converting common and private txt files to individual VCF files -
echo This also may take a while...
mkdir -p vcfs/split
time perl -lane 'BEGIN {
open IN, "vcfs/_vcf.common.txt" or die $!;
chomp(@common = <IN>); foreach (@common) {
if (/^##/) { $headers .= "$_\n" } else { $headers .= $_; last }
} close IN }
if ($F[0] ne $previousSample) {
close OUT if $previousSample;
open OUT, ">vcfs/split/$F[0].vcf";
print OUT "$headers\t$F[0]";
} $previousSample = $F[0];
print OUT "$common[$F[1]]\t$F[2]";
END { close OUT }' vcfs/_vcf.private.txt
echo Step 3 : compressing individual VCF files -
echo This also may take a while...
time for file in vcfs/split/*vcf; do
bgzip -f $file;
# tabix -fp vcf $file.gz
done
# for file in split/*vcf.gz; do
# gunzip $file
# done
# rm *vcf.gz
# rm *vcf.gz.tbi
# Clean up
mv vcfs/split/*.vcf.gz vcfs/
rmdir vcfs/split/
rm vcfs/_vcf.private.txt
rm vcfs/_vcf.common.txt