merge_bed.py
#!/usr/bin/env python3
import os
import sys
from collections import Counter, defaultdict
from itertools import zip_longest
from optparse import OptionParser

import numpy as np

usage = """
%prog 1.bed 2.bed [3.bed] [4.bed] [...] > merged.bed

Merge BED or BED-like files on their genomic coordinates. Deals properly
with find_circ.py output and adds a few extra columns.
"""

parser = OptionParser(usage=usage)
parser.add_option("-f", "--flank", dest="flank", type=int, default=0, help="add flanking nucleotides to define more fuzzy overlap (default=0)")
parser.add_option("-s", "--stats", dest="stats", default="", help="write statistics to this file (instead of stderr)")
parser.add_option("-6", "--bed6", dest="bed6", default=False, action="store_true", help="ignore all columns except the first six standard BED columns (default=False)")
parser.add_option("--score", dest="score", default=False, action="store_true", help="ignore all columns except the first six standard BED columns and only retain the score column from the 2nd, 3rd, ... BED (default=False)")
parser.add_option("-F", "--format", dest="format", default="2", choices=["1", "1.2", "2"], help="select the find_circ.py output version (choices=['1','1.2','2'])")
parser.add_option("-V", "--verbatim", dest="verbatim", default=False, action="store_true", help="do not attempt to merge columns; simply join on coordinates and report the other columns verbatim")

options, args = parser.parse_args()
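# Example invocations (file names are hypothetical, for illustration only):
#
#   ./merge_bed.py sampleA.circ_candidates.bed sampleB.circ_candidates.bed > merged.bed
#   ./merge_bed.py -f 2 -s merge_stats.txt sampleA.bed sampleB.bed sampleC.bed > merged.bed
#
# The first call merges two find_circ candidate lists on exact junction
# coordinates; the second also treats junctions whose start or end differ by
# up to 2 nt as overlapping and writes the overlap statistics to a file.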
def read_to_hash(fname, ds=0, de=0, flank=0, cover=False):
    """
    Read a BED(-like) file into a dict keyed by (chrom, start, end, sense).
    ds/de shift the start/end coordinates; flank adds fuzzy-match keys.
    (cover is accepted but currently unused.)
    """
    pos = {}

    def src(fname):
        if fname == '-':
            return sys.stdin
        else:
            return open(fname)

    for line in src(fname):
        if line.startswith("#"):
            continue
        parts = line.strip().split('\t')
        if options.bed6:
            parts = parts[:6]
        chrom, start, end, name, score, sense = parts[:6]
        start, end = int(start) + ds, int(end) + de
        pos[(chrom, start, end, sense)] = parts
        if flank:
            # also register the record under slightly shifted coordinates so
            # that junctions off by up to <flank> nt on either side still match
            for x in range(1, flank + 1):
                pos[(chrom, start - x, end, sense)] = parts
                pos[(chrom, start + x, end, sense)] = parts
                pos[(chrom, start, end - x, sense)] = parts
                pos[(chrom, start, end + x, sense)] = parts
    return pos
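# Sketch of the resulting lookup (values are illustrative, not real data):
#
#   pos[("chr1", 1000, 2000, "+")] == ["chr1", "1000", "2000", "circ_001", "5", "+", ...]
#
# With flank=2 the same record is additionally reachable under start/end
# coordinates shifted by 1 or 2 nt, which is what makes the later join fuzzy.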
N = defaultdict(int)
# honor -f/--flank when indexing each input
inputs = [read_to_hash(a, flank=options.flank) for a in args]
names = [os.path.basename(a) for a in args]
shorts = ["in%d" % i for i in range(len(names))]
by_name = dict(zip(shorts, inputs))

merge = {}
support = defaultdict(list)
for name, data in zip(shorts, inputs):
    N[name] = len(data)
    merge.update(data)
    for pos in data:
        support[pos].append(name)

# count how many positions are supported by each combination of inputs
comb = Counter(tuple(v) for v in support.values())

if options.stats:
    sfile = open(options.stats, "w")
else:
    sfile = sys.stderr

for c in sorted(comb.keys()):
    sfile.write("%s\t%d\n" % ("_AND_".join(c), comb[c]))
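# The stats output lists one input combination per line, e.g. (hypothetical
# values):
#
#   in0             120
#   in0_AND_in1     45
#   in1             98
#
# i.e. 45 junctions were found in both inputs, 120 only in the first and
# 98 only in the second.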
def consensus_cols(lines):
    samples = []
    counts = defaultdict(int)

    def setup_samples(values):
        # collect the per-input sample name lists (column 11)
        allsamples = []
        for v in values:
            toadd = v.split(",")
            samples.append(toadd)
            allsamples.extend(toadd)
        return ",".join(sorted(allsamples))

    def assign_counts(values):
        # sum the per-sample read counts (column 12), keyed by sample name
        for cs, ss in zip(values, samples):
            for samp, count in zip(ss, cs.split(',')):
                counts[samp] += int(count)
        return ",".join(str(counts[k]) for k in sorted(counts.keys()))

    def append_uniq(values):
        # default merge rule: union of the comma-separated values
        v = set()
        for row in values:
            v |= set(row.split(","))
        return ",".join(str(x) for x in sorted(v) if x)

    col_map = {
        3: lambda values: ",".join(sorted(values)),             # combine identifiers
        4: lambda values: np.array(values, dtype=float).sum(),  # sum n_reads
        6: lambda values: np.array(values, dtype=float).sum(),  # sum n_uniq
        7: lambda values: max(int(x) for x in values),          # max of best_uniq_A
        8: lambda values: max(int(x) for x in values),          # max of best_uniq_B
        9: lambda values: np.array(values, dtype=int).sum(),    # sum ov_linear_A
        10: lambda values: np.array(values, dtype=int).sum(),   # sum ov_linear_B
        11: setup_samples,
        12: assign_counts,
        13: lambda values: min(int(x) for x in values),         # min of edits
        14: lambda values: min(int(x) for x in values),         # min of anchor_overlap
        15: lambda values: min(int(x) for x in values),         # min of breakpoints
    }

    parts = []
    for i, column in enumerate(zip_longest(*lines, fillvalue="")):
        if i in col_map:
            parts.append(str(col_map[i](column)))
        else:
            parts.append(append_uniq(column))
    return parts
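# Illustration of the merge rules (hypothetical values): merging two rows with
# names "circ_A"/"circ_B", n_reads 5/3 and edits 1/0 yields the merged name
# "circ_A,circ_B", n_reads 8.0 (float sum) and edits 0 (minimum); columns
# without an entry in col_map fall back to append_uniq (union of values).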
if options.verbatim:
    for pos in merge:
        com = support[pos]
        comstr = "(%s)" % ",".join(com)
        cols = [comstr]
        for name in com:
            cols.append("%s : " % name)
            cols.append("\t".join(by_name[name][pos]))
        print("\t".join(cols))
elif options.score:
    for pos in merge:
        com = support[pos]
        comstr = "(%s)" % ",".join(com)
        # sort for deterministic output when inputs disagree on the name
        rec_name = ",".join(sorted(set(by_name[name][pos][3] for name in com)))
        cols = [rec_name]
        for name in shorts:
            if name not in com:
                cols.append("0")
            else:
                cols.append(by_name[name][pos][4])
        cols.append(comstr)
        print("\t".join(cols))
else:
    for pos in merge:
        com = support[pos]
        comstr = "(%s)" % ",".join(com)
        lines = [by_name[name][pos] for name in com]
        cols = [comstr] + consensus_cols(lines)
        print("\t".join(cols))