-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfilter_csv.sh
More file actions
executable file
·150 lines (140 loc) · 4.95 KB
/
filter_csv.sh
File metadata and controls
executable file
·150 lines (140 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env bash
SCRIPTROOT=$( cd $(dirname $0) ; pwd)
Usage(){
cat <<USAGE
++++++++++++++++++++++++++++++++++++++++++++++++++++
Part of sclib (https://github.com/xfgavin/sclib)
Remove matched rows in a given list from a csv
by xfgavin@gmail.com 01/24/2018 @UCSD
+++++++++++++++++++++++++++++++++++++++++++++++++++++
`basename $0` <-i csv to work on> <-l list to remove> [-c target column in csv -ih rows of input file header -lh rows of list header ]
-i </path/to/csv>
The csv we are going to work on,
Or CSVs, in this case, please use double quotes, esp, when you are using wildcards.
-l </path/to/list>
The list to remove from csv.
Optional:
-c <column number in target csv>
Define which column in the csv has the data that in the list file.
-ih <rows of input file header>
How many rows of the input file header, by default it equals 1
-lh <rows of list header>
How many rows of the list header, by default it equals 1
-h, --help
Show me.
Examples:
`basename $0` -i target.csv -l list
`basename $0` -i target.csv -l list -c 3
`basename $0` -i "*.csv" -l list -c 3
USAGE
rm -f $lck
[ ${#ERROR} -gt 0 ] && echo -e "\e[1;101;93mError: $ERROR\e[0m" && exit -1
exit 0
}
filtercsv(){
((index_list=csv_list_header_len+1))
tail -n +$index_list $csv_list|sed -e 's/"//g' > dup_list
for csv in $csv_input
do
column_count=`head -1 $csv | sed 's/[^,]//g' | wc -c`
filename=`echo $csv|rev|cut -d. -f 2-|rev`
head -n$csv_input_header_len $csv > ${filename}_header.csv
((index_data=csv_input_header_len+1))
tail -n +$index_data $csv|sort -k$csv_input_column -t, > ${filename}_data.csv
cut -d, -f$csv_input_column ${filename}_data.csv >data_list
sort dup_list data_list |uniq -u > uniq_list
case $csv_input_column in
1)
join -t, ${filename}_data.csv uniq_list >${filename}_data_new.csv
;;
$column_count)
((index_data_1=csv_input_column-1))
#cut -d, -f1-$index_data_1 ${filename}_data.csv >data_list_1
#paste data_list data_list_1 -d, > ${filename}_data.csv
join -t, -1 $csv_input_column ${filename}_data.csv uniq_list >${filename}_data_new.csv
cut -d, -f1 ${filename}_data_new.csv >data_list
cut -d, -f2- ${filename}_data_new.csv >data_list_1
paste data_list_1 data_list -d, > ${filename}_data_new.csv
;;
*)
[ $csv_input_column -gt $column_count ] && echo -e '\E[7;31;40m'"\033[1mError: column number in input csv cannot be greater than total column count\033[0m" && exit -1
((index_data_1=csv_input_column-1))
((index_data_2=csv_input_column+1))
#cut -d, -f1-$index_data_1 ${filename}_data.csv >data_list_1
#cut -d, -f${index_data_2}- ${filename}_data.csv >data_list_2
#paste data_list data_list_1 data_list_2 -d, > ${filename}_data.csv
join -t, -1 $csv_input_column ${filename}_data.csv uniq_list >${filename}_data_new.csv
cut -d, -f1 ${filename}_data_new.csv >data_list
cut -d, -f2-$csv_input_column ${filename}_data_new.csv >data_list_1
cut -d, -f${index_data_2}- ${filename}_data_new.csv >data_list_2
paste data_list_1 data_list data_list_2 -d, > ${filename}_data_new.csv
;;
esac
mv ${filename}_header.csv $csv
cat ${filename}_data_new.csv >> $csv
rm -f ${filename}_data*.csv data_list* uniq_list
done
rm -f dup_list
}
lck=.filter_csv.lck
[ -f $lck ] && echo "filter_csv process is running, check lock file: `pwd`/$lck" && echo -1
touch $lck
re_number='^[0-9]+$'
if [ $# -eq 1 ]
then
[ $1 = "-f" -o $1 = "--help" ] && Usage
else
while [ x$1 != x ] ; do
case $1 in
-i)
#input file
#[ -f $2 ] && csv_input=$2 || (ERROR="Input file: $2 doesn't exist" ; Usage)
csv_input="$2"
shift 2
;;
-l)
#list
[ -f $2 ] && csv_list=$2 || (ERROR="list: $2 doesn't exist" ; Usage)
shift 2
;;
-v)
#Debugging
DEBUG=1
shift 1
;;
-c)
#column number in input csv that has info of the list
[[ $2 =~ $re_number ]] && csv_input_column=$2 || (ERROR="column number should be non-negative integer" ; Usage)
shift 2
;;
-ih)
#row count of input csv header
[[ $2 =~ $re_number ]] && csv_input_header_len=$2 || (ERROR="row count of input header should be non-negative integer" ; Usage)
shift 2
;;
-lh)
#row count of list csv header
[[ $2 =~ $re_number ]] && csv_list_header_len=$2 || (ERROR="row count of input header should be non-negative integer" ; Usage)
shift 2
;;
*)
shift 1
;;
esac
done
fi
if [ ${#csv_input} -eq 0 ]
then
ERROR="Please supply Input file"
Usage
elif [ ${#csv_list} -eq 0 ]
then
ERROR="Please supply list file"
Usage
else
[ ${#csv_input_column} -eq 0 ] && csv_input_column=1
[ ${#csv_input_header_len} -eq 0 ] && csv_input_header_len=1
[ ${#csv_list_header_len} -eq 0 ] && csv_list_header_len=1
filtercsv
fi
rm -f $lck