forked from MorrellLAB/sequence_handling
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_sample_list.sh
executable file
·79 lines (69 loc) · 2.95 KB
/
check_sample_list.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
set -e
set -o pipefail

# Check to make sure we have our argument.
# Exactly one positional argument (the sample list) is required; with any
# other argument count, print the usage message to stderr and exit 1.
if [[ "$#" -ne 1 ]]; then
  # $0 already holds the path the script was invoked with, so it is printed
  # as-is (prefixing it with './' yields a wrong path for absolute invocations).
  cat >&2 <<EOF
Usage: $0 sample_info
where: 'sample_info' is a file containing a list of samples

$0 is designed to check a sample list to ensure that it is properly formatted
for use with sequence_handling. There are two checks run in this script:
First, we check to make sure files exist. The best way to ensure that a
sample list passes this check is to use full file paths for each sample
in the list.
Second, we check to make sure all samples have unique names. We use the names given
to each sample to name subsequent files; if any have the same name, outfiles will be
overwritten. We check this to make sure no files are overwritten while using sequence_handling

EOF
  exit 1
fi
# Assign our argument to a variable
SAMPLE_INFO=$1

# Refuse to continue if the sample list itself cannot be read; otherwise a
# mistyped list path would make every later check trivially "pass".
if [[ ! -f "${SAMPLE_INFO}" || ! -r "${SAMPLE_INFO}" ]]; then
  echo "Cannot read sample list: ${SAMPLE_INFO}" >&2
  exit 1
fi

# Check to make sure files exist.
# Every non-blank line of the sample list is treated as one file path. Each
# path that is not a regular file is reported on stdout and collected; if any
# were collected they are also written to missing_samples_<timestamp>.txt in
# the current directory and the script exits 1.
TIME=$(date +%m-%d-%y-%H.%M.%S) # Timestamp so each run writes its own missing_samples file
declare -a MISSING=()           # Paths from the list that could not be found

# Read line by line (not word by word) so paths containing spaces or glob
# characters are checked verbatim. 'read' with the default IFS trims leading
# and trailing whitespace; '|| [[ -n ... ]]' keeps a last line that has no
# trailing newline.
while read -r sample || [[ -n "${sample}" ]]; do
  if [[ -z "${sample}" ]]; then
    continue # Skip blank lines
  fi
  if [[ ! -f "${sample}" ]]; then
    echo "Cannot find ${sample}" # Say which is missing
    MISSING+=("${sample}")
  fi
done < "${SAMPLE_INFO}"

if (( ${#MISSING[@]} > 0 )); then # If we're missing something
  missing_file="missing_samples_${TIME}.txt"
  # Write the header and one missing path per line
  {
    echo "Cannot find:"
    printf '%s\n' "${MISSING[@]}"
  } > "${missing_file}"
  # Say where the file of missing samples is
  echo "A list of samples missing can be found at $(pwd)/${missing_file}"
  exit 1
fi
# Make sure sample names are unique.
# The name of a sample is the basename of its path in the sample list. If two
# or more samples share a name, report how many surplus names there are, dump
# the duplicated names (as the 'Differences' array) and exit 5.

# Guard against an unreadable list so the read loop below never runs on nothing.
if [[ ! -f "${SAMPLE_INFO}" || ! -r "${SAMPLE_INFO}" ]]; then
  echo "Cannot read sample list: ${SAMPLE_INFO}" >&2
  exit 1
fi

declare -a sample_names=() # Basename of every sample in the list
# Single pass over the file; 'read' with the default IFS trims surrounding
# whitespace, and '|| [[ -n ... ]]' keeps a final line without a newline.
while read -r sample_path || [[ -n "${sample_path}" ]]; do
  if [[ -z "${sample_path}" ]]; then
    continue # Blank lines carry no sample
  fi
  sample=$(basename -- "${sample_path}")
  sample_names+=("${sample}")
done < "${SAMPLE_INFO}"

declare -a unique_names=() # Each distinct sample name once
declare -a Differences=()  # Names that occur more than once
if (( ${#sample_names[@]} > 0 )); then
  # One name per line so sort/uniq see individual names; LC_ALL=C makes the
  # comparison byte-wise, so only identical names are merged.
  while IFS= read -r name; do
    unique_names+=("${name}")
  done < <(printf '%s\n' "${sample_names[@]}" | LC_ALL=C sort -u)
  while IFS= read -r name; do
    Differences+=("${name}")
  done < <(printf '%s\n' "${sample_names[@]}" | LC_ALL=C sort | LC_ALL=C uniq -d)
fi

if [[ "${#sample_names[@]}" -ne "${#unique_names[@]}" ]]; then
  echo "$(( ${#sample_names[@]} - ${#unique_names[@]} )) duplicate sample name(s) found!"
  declare -p Differences # Show exactly which names are duplicated
  exit 5
fi