Automation_scripts/gene_annotation.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# This module is designed to annotate gene coordinates with the gene names
# in order to easily identify what genes are impacted by a certain mutation.

# First the imports are done.
import pandas as pd
import numpy as np
import pysam
import sys
import os
import seaborn as sns
from statannot import add_stat_annotation
from scipy.stats import bartlett
import scipy.stats as stats
from statannotations.Annotator import Annotator
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scikit_posthocs import posthoc_tukey
from io import StringIO
import matplotlib.pyplot as plt

# Load the GENCODE annotation (gff3) as a pandas table. Lines starting with
# '#' are skipped and the nine column names are supplied explicitly.
gencode = pd.read_table(
    "gencode.vM25.annotation.gff3",
    comment="#",
    sep="\t",
    names=['seqname', 'source', 'feature', 'start', 'end',
           'score', 'strand', 'frame', 'attribute'],
)

# Keep only the mitochondrial records (seqname == "chrM") together with the
# columns used further down, and renumber the rows from zero. The result is
# stored in 'mito_genes'.
mito_genes = gencode.loc[
    gencode['seqname'] == "chrM",
    ['start', 'end', 'attribute', 'frame', 'strand', 'score'],
].reset_index(drop=True)

# Extract gene names and type.
def gene_info(x):
    """Return the gene name and gene type stored in an attribute string.

    Parameters
    ----------
    x : str
        Attribute text of one annotation record: ';'-separated ``key=value``
        fields, e.g. ``"ID=g1;gene_type=Mt_tRNA;gene_name=mt-Tf"``.

    Returns
    -------
    tuple
        ``(gene_name, gene_type)`` as strings.

    Raises
    ------
    IndexError
        If the ``gene_name`` or the ``gene_type`` field is missing.
    """
    fields = {}
    for field in x.split(";"):
        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole. Fields without '=' carry no value and are skipped.
        key, sep, value = field.partition("=")
        if sep:
            # Match on the exact key (not a substring of the whole field);
            # when a key is repeated the first occurrence wins.
            fields.setdefault(key.strip(), value)
    try:
        return (fields["gene_name"], fields["gene_type"])
    except KeyError as err:
        # IndexError is kept as the failure type for a missing field.
        raise IndexError(
            f"attribute string has no {err.args[0]!r} field: {x!r}"
        ) from err
# Parse every attribute string once and store the two results as new columns.
parsed = mito_genes['attribute'].apply(gene_info)
mito_genes["gene_name"], mito_genes["gene_type"] = zip(*parsed)

# Order the records by start position, keep the first record of each gene
# name, renumber the rows, and write the table to Excel.
mito_genes = (
    mito_genes.sort_values(['start'], ascending=True)
    .drop_duplicates('gene_name', keep='first')
    .reset_index(drop=True)
)
mito_genes.to_excel('mito_genes.xlsx', index=False)

# Load the CRM results file into a dataframe (df); the gene annotations are
# added to it below.
df = pd.read_excel('CRMs.xlsx')

# Index mito_genes by the closed interval [start, end] of each record so that
# a single position can be looked up with .loc.
mito_genes.index = pd.IntervalIndex.from_arrays(
    left=mito_genes['start'],
    right=mito_genes['end'],
    closed='both',
)
def get_name(d, genes=None):
    """Return the gene name and gene type annotated at position ``d``.

    Parameters
    ----------
    d : number
        Position to look up in the interval index of the gene table.
    genes : pandas.DataFrame, optional
        Interval-indexed table with 'gene_name' and 'gene_type' columns.
        Defaults to the module-level ``mito_genes`` table.

    Returns
    -------
    tuple
        ``(gene_name, gene_type)``. When several intervals contain ``d``
        (overlapping records) the values are joined with ';'. When no
        interval contains ``d`` the result is ``('', '')``.
    """
    table = mito_genes if genes is None else genes
    try:
        # One lookup serves both columns.
        hits = table.loc[d]
    except KeyError:
        return ('', '')
    if isinstance(hits, pd.DataFrame):
        # Overlapping intervals: .loc returns every matching row, so the
        # values are collapsed to one string per column instead of handing
        # back whole Series objects.
        return (';'.join(hits['gene_name'].astype(str)),
                ';'.join(hits['gene_type'].astype(str)))
    return (hits['gene_name'], hits['gene_type'])
# Show the 'final.start' column that is about to be annotated.
print(df['final.start'])

# Look up the gene name and gene type for every 'final.start' value, store
# them as two new columns, and write the table to 'test.xlsx'.
start_annotations = df['final.start'].apply(get_name)
df['Gene_name_start'], df['Gene_type_start'] = zip(*start_annotations)
df.to_excel('test.xlsx', index=False)
#df['Gene_name_end'] = df['final.end'].apply(get_name)
#def get_type(d):
 #   try:
  #      g_type = mito_genes.loc[d]['gene_type']
   #     return(g_type)
    #except KeyError:
     #   pass
#df['Gene_type_start'] = df['final.start'].apply(get_type)
#df['Gene_type_start'] = df['final.start'].apply(get_name.g_type)
#df['Gene_name_end'] = df['final.end'].apply(get_name)
#def get_type(d):
 #   try:
  #      return mito_genes.loc[d]['gene_type']
   # except KeyError:
    #    pass
#df['Gene_type_start'] = df['final.start'].apply(get_type)
#df['Gene_type_end'] = df['final.end'].apply(get_type)
#See if the name and type fxns can be combined into one. Kept getting errors
# when tried.

# To annotate the non-coding regions, a bigBed (bb) file was downloaded from:
# https://hgdownload.soe.ucsc.edu/gbdb/mm10/ncbiRefSeq/refSeqFuncElems.bb
# It was converted to a .bed file using bigBedToBed with the chromosome option
# set to chrom=chrM, and saved as 'mito_func.bed'. That tab-separated file,
# which has no header row, is read here as a dataframe - df10.
df10 = pd.read_table('mito_func.bed', header=None)

# Column names as used in the refseq schema
# (https://genome.ucsc.edu/cgi-bin/hgTables?db=mm10&hgta_group=regulation&hgta_track=refSeqFuncElems&hgta_table=refSeqFuncElems&hgta_doSchema=describe+table+schema).
# Only as many names as the bed file has columns are applied.
header = [
    'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
    'thickStart', 'thickEnd', 'reserved', 'soTerm', 'note',
    'geneIds', 'pubMedIds', 'experiment', 'function', '_mouseOver',
]
df10.columns = header[:df10.shape[1]]

# Keep only the feature name: the text before the first ';' in the 'name'
# column. The vectorised .str accessor writes straight back into the column
# (no intermediate dataframe) and leaves a missing name as NaN instead of
# raising.
df10['name'] = df10['name'].str.split(';', n=1).str[0]

# Index df10 by the closed interval [chromStart, chromEnd] of each element so
# that positions in the CRM dataframe (df) can be matched to elements.
# NOTE(review): BED coordinates are conventionally 0-based and half-open;
# confirm that closed='both' matches the coordinate system used in df.
df10.index = pd.IntervalIndex.from_arrays(
    left=df10['chromStart'],
    right=df10['chromEnd'],
    closed='both',
)

# The function below looks up the element in df10 whose chromStart-chromEnd
# interval contains a given position (it is applied to final.start and final.end).
def get_element(d, elements=None):
    """Return the name of the functional element annotated at position ``d``.

    Parameters
    ----------
    d : number
        Position to look up in the interval index of the element table.
    elements : pandas.DataFrame, optional
        Interval-indexed table with a 'name' column. Defaults to the
        module-level ``df10`` table.

    Returns
    -------
    str or None
        The element name. When several intervals contain ``d`` the names
        are joined with ';'. ``None`` when no interval contains ``d``.
    """
    table = df10 if elements is None else elements
    try:
        hits = table.loc[d]['name']
    except KeyError:
        # No element at this position.
        return None
    if isinstance(hits, pd.Series):
        # Overlapping intervals give one name per matching row; collapse
        # them to a single string so the cell stays a plain, hashable value.
        return ';'.join(hits.astype(str))
    return hits
# Annotate both positions of every CRM with the element found there, then
# save the annotated table as 'df.xlsx'.
for element_col, position_col in (('Gene_element_start', 'final.start'),
                                  ('Gene_element_end', 'final.end')):
    df[element_col] = df[position_col].apply(get_element)
df.to_excel('df.xlsx', index=False)


# Add a helper column 'count' (all zeros) to df. After groupby(...).count()
# this column holds the number of rows in each group, because count() tallies
# the non-missing values of every remaining column.
df['count'] = 0

# df2: rows per 'Strain' and 'sample'.
# df21: rows per 'Strain', 'sample', and 'Gene_element_start'.
df2 = df.groupby(['Strain', 'sample']).count().reset_index()
df21 = df.groupby(['Strain', 'sample', 'Gene_element_start']).count().reset_index()

# Collect the 'count' values of every unique strain in df2 (one Series per
# strain, kept in the list 'df3') and run f_oneway across those groups.
Strain = df2.Strain.unique()
df3 = [df2.loc[df2['Strain'] == s, 'count'] for s in Strain]
F = f_oneway(*df3)

# Turn the f_oneway result into a dataframe: its string form is wrapped in a
# file-like StringIO object and parsed as CSV.
F = pd.read_csv(StringIO(str(F)), sep=";")

# Perform groupwise comparisons using Tukey HSD.
tukey = pairwise_tukeyhsd(endog=df2['count'], groups=df2['Strain'], alpha=0.05)

# Save the Tukey results table as a dataframe (its first row holds the column
# names), attach the F stats dataframe to it, and export the result to Excel
# as 'Syn_stats.xlsx'. pd.concat is used because DataFrame.append was removed
# in pandas 2.0; the combined table is the same.
tukey_rows = tukey._results_table.data
tukey = pd.DataFrame(data=tukey_rows[1:], columns=tukey_rows[0])
tukey = pd.concat([tukey, F])
tukey.to_excel('Syn_stats.xlsx', index=False)

# To annotate the graphs with stars for statistical significance only the
# p-values are needed (not meandiff, lower, upper, etc.), so the Tukey results
# are computed again as a simple matrix of p-values.
tukey_df = posthoc_tukey(df2, val_col="count", group_col="Strain")

# Reduce the matrix to a non-redundant list of comparisons: blank out the
# lower half and the diagonal, reshape the matrix into a long dataframe with
# melt(), and drop the blanked entries.
remove = np.tril(np.ones(tukey_df.shape, dtype=bool))
tukey_df = tukey_df.mask(remove)
molten_df = tukey_df.melt(ignore_index=False).reset_index().dropna()

# Settings shared by the graphs below: the column plotted on each axis and
# the order in which the strains are drawn.
x = "Strain"
y = 'count'
order = ['Polg', 'PKO', 'W402A']

# One figure with two side-by-side panels and no grid lines.
sns.set_style("whitegrid", {'axes.grid' : False})
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(17, 4.5))
fig.suptitle('CRMs', weight='bold')

# Left panel: bars with a transparent face and a dark outline.
ax = sns.barplot(data=df2, x=x, y=y, order=order,
                 facecolor=(1, 1, 1, 0), edgecolor='.2', ax=axes[0])
ax.set_title('All CRMs')

# Overlay a swarmplot in the same dark colour so the individual datapoints
# are easy to spot on top of the bars.
ax = sns.swarmplot(data=df2, x=x, y=y, order=order, color='.2', ax=axes[0])
ax.set_ylabel('Number of CRMs')

# Leave headroom above the largest count.
ax.set_ylim(top=df2['count'].max() + 10)

# Only significant differences are annotated: 'molten_df', which holds the
# p-values, is filtered to p <= 0.05. Note, if all annotations are desirable,
# skip this filtering step.
significant = molten_df.loc[molten_df['value'] <= 0.05]

# The p-value table covers every strain in the data, but only the strains in
# 'order' are drawn. A comparison naming a strain that is not drawn has no bar
# to attach to, so such rows are dropped as well.
drawn = significant['index'].isin(order) & significant['variable'].isin(order)
molten_df = significant.loc[drawn]

# The pairs for multiple comparisons and their p-values, in matching order.
pairs = list(zip(molten_df['index'], molten_df['variable']))
p_values = molten_df['value'].tolist()

# Configure the annotator to mark the graph with stars using the p-values
# from the list. It is skipped when there is nothing to annotate.
if pairs:
    annotator = Annotator(ax, pairs, data=df2, x=x, y=y, order=order)
    annotator.configure(text_format="star", loc="inside")
    annotator.set_pvalues_and_annotate(p_values)


#annotator = Annotator(ax, pairs, data=df2, x=x, y=y, order=order)
#annotator.configure(text_format="star", loc="inside")
#annotator.set_pvalues_and_annotate(p_values)


# reshape the d dataframe suitable for statsmodels package
#df2 = pd.melt(df2.reset_index(), id_vars=['Strain'],value_vars=['count'])
         #fvalue, pvalue = stats.f_oneway(df2['Strain'], df2['count'])
         # Make a 1 * 2 plot
         #f_val, p_val = ss.f_oneway(df2['count'],df2['Strain'])
# Right panel: the counts in df21 per strain, with one bar colour per
# 'Gene_element_start' value.
ax2 = sns.barplot(data=df21, x=x, y=y, hue='Gene_element_start',
                  order=order, ax=axes[1])
ax2.set(title='CRMs by Start of Breakpoint', ylabel='Number of CRMs')

# Title the legend and anchor its upper-left corner at the given position.
ax2.legend(title='Gene Element')
sns.move_legend(ax2, loc='upper left', bbox_to_anchor=(1, 1.025))
         #annotator = Annotator(ax, pairs, data=df, x=x, y=y, order=order)
         #annotator.configure(test='Kruskal', text_format='star',loc='outside')
         #annotator.apply_and_annotate()
         #add_stat_annotation(ax, data=df, x=x, y=y, order=order,box_pairs=[("WT",
             #"Polg"), ("WT", "PKO"), ("WT","W402A")],test='t-test_ind',
             #text_format='star', loc='outside',verbose=2)
             #stats.bartlett('WT', 'Polg', 'PKO', 'W402A')
             #stat, p = bartlett('WT', 'Polg', 'PKO', 'W402A')
# Write the current figure to 'anot_CRM.png'.
plt.savefig('anot_CRM.png')