-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathmake_tfams.py
executable file
·146 lines (130 loc) · 7.33 KB
/
make_tfams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#! /usr/bin/env python
import argparse
from plastid.readers.bed import BED_Reader
from plastid.genomics.roitools import SegmentChain, positionlist_to_segments
from collections import defaultdict
import os
import sys
from time import strftime
parser = argparse.ArgumentParser(description='Identify overlapping transcripts from a bed file, termed transcript families ("tfams"). Each tfam will '
'be assigned a name, either based on the transcript IDs or from an optional gene name file. It is '
'recommended that this script be run in a newly created folder containing only the input BED file, '
'generated by prune_transcripts.py, and possibly the GENENAMES file. It is also recommended that the '
'default value ("tfams") be used for parameter TFAMSTEM, for consistency with later scripts.')
parser.add_argument('-g', '--genenames', help='Path to file with gene name lookup table. Formatted as a simple two-column tab-delimited file, with '
'one transcript ID followed by the corresponding gene name on each line. Gene names may be repeated, '
'though assigning the same name to non-overlapping transcripts will trigger a warning. Not every '
'transcript must be assigned a gene name. If no file is provided, or if no gene names are available '
'for any of the transcripts in a family, transcript IDs will be used as names.')
parser.add_argument('--inbed', default='transcripts.bed', help='Transcriptome BED-file (Default: transcripts.bed)')
parser.add_argument('--tfamstem', default='tfams', help='Output filestem. OUTSTEM.txt will be a tab-delimited file indicating which transcripts are '
'in which tfam. OUTSTEM.bed will be a bed file showing the genomic positions of each tfam. '
'(Default: tfams)')
parser.add_argument('-v', '--verbose', action='store_true', help='Output a log of progress and timing (to stdout)')
parser.add_argument('-f', '--force', action='store_true', help='Force file overwrite')
opts = parser.parse_args()
outbedname = opts.tfamstem + '.bed'
outtxtname = opts.tfamstem + '.txt'
if not opts.force:
if os.path.exists(outbedname):
raise IOError('%s exists; use --force to overwrite' % outbedname)
if os.path.exists(outtxtname):
raise IOError('%s exists; use --force to overwrite' % outtxtname)
if opts.verbose:
sys.stdout.write(' '.join(sys.argv) + '\n')
def logprint(nextstr):
sys.stdout.write('[%s] %s\n' % (strftime('%Y-%m-%d %H:%M:%S'), nextstr))
sys.stdout.flush()
logprint('Identifying transcript overlaps')
tfams = {} # will contain integer keys to a tuple: ([list of tid],(chrom,strand),{set of gcoord})
genlookup = defaultdict(dict) # indexed by (chrom,strand) keys to an integer key to tfam
next_tfam = 0
processed = 0
with open(opts.inbed, 'rU') as inbed:
for trans in BED_Reader(inbed):
pos_set = trans.get_position_set()
currfams = {genlookup[(trans.chrom, trans.strand)][pos] for pos in pos_set if pos in genlookup[(trans.chrom, trans.strand)]}
if currfams:
newfam = min(currfams)
currfams.discard(newfam)
for fam in currfams:
oldfam_item = tfams.pop(fam)
tfams[newfam][0].extend(oldfam_item[0]) # extend the list of tids
assert (tfams[newfam][1] == oldfam_item[1]) # chrom,strand should match
tfams[newfam][2].update(oldfam_item[2]) # union the gcoords
tfams[newfam][0].append(trans.attr['ID'])
tfams[newfam][2].update(pos_set)
else:
newfam = next_tfam
next_tfam += 1
tfams[newfam] = ([trans.attr['ID']], (trans.chrom, trans.strand), pos_set)
for pos in tfams[newfam][2]:
genlookup[(trans.chrom, trans.strand)][pos] = newfam # override old families and expand as needed
processed += 1
def _choose_name(names):
"""Somewhat silly function that chooses a gene name if more than one are given for this gene. Chooses the shortest if that's unique, then one
consisting only of letters and numbers if that's unique, then the one with fewest numbers in it, and finally the first by alphabetical order"""
remaining = list(set(names))
if len(remaining) > 1:
minlen = len(remaining[0])
nextset = [remaining[0]]
for name in remaining[1:]:
if len(name) == minlen:
nextset.append(name)
elif len(name) < minlen:
nextset = [name]
minlen = len(name)
remaining = nextset # keep the shortest
if len(remaining) > 1:
nextset = [name for name in remaining if name.isalnum()]
if nextset:
remaining = nextset # remove non-alphanumerics if possible
if len(remaining) > 1:
min_nums = sum([d.isdigit() for d in remaining[0]])
nextset = [remaining[0]]
for name in remaining[1:]:
numdig = sum([d.isdigit() for d in name])
if numdig == min_nums:
nextset.append(name)
elif numdig < min_nums:
nextset = [name]
min_nums = numdig
remaining = nextset # keep the fewest digits
if len(remaining) > 1:
remaining.sort() # if all else fails, go alphabetical
chosen = remaining[0]
if '/' in chosen:
sys.stderr.write('WARNING: Gene name %s contains illegal character "/" which was replaced with "_"\n' % chosen)
chosen = chosen.replace('/', '_') # avoid sub-keying in h5py! Yes, this has happened!
return chosen
if opts.verbose:
logprint('Assigning names to transcript families')
if opts.genenames:
with open(opts.genenames, 'rU') as infile:
gene_name_lookup = {x[0]: x[1] for x in [line.strip().split() for line in infile]}
# gene_name_lookup = pd.read_csv(opts.genenames,sep='\t',header=None,names=['tid','tfam']).set_index('tid')['tfam'].to_dict()
else:
gene_name_lookup = {}
new_tfams = {}
multi_names = defaultdict(lambda: int(1))
for tfam_val in tfams.itervalues():
geneset = {gene_name_lookup[tid] for tid in tfam_val[0] if tid in gene_name_lookup}
if not geneset:
geneset = set(tfam_val[0]) # if no gene names available, just use the tids themselves
genename = _choose_name(geneset)
if genename in new_tfams:
multi_names[genename] += 1
genename = '%s_%d' % (genename, multi_names[genename])
new_tfams[genename] = tfam_val
for (genename, num_appearances) in multi_names.iteritems():
sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' % (genename, num_appearances))
if opts.verbose:
logprint('Saving results')
with open(outbedname, 'w') as outbed:
with open(outtxtname, 'w') as outtxt:
for tfam, (tids, (chrom, strand), genpos) in new_tfams.iteritems():
outbed.write(SegmentChain(*positionlist_to_segments(chrom, strand, list(genpos)), ID=tfam).as_bed())
for tid in tids:
outtxt.write('%s\t%s\n' % (tid, tfam))
if opts.verbose:
logprint('Tasks complete')