# Source: Automation_scripts/Concaten_clc_data.py (elichter/Automation_scripts, branch main)
# This program concatenates all the data exported from CLC Genomics Workbench
# and annotates it with gender, strain, gene name, gene type, and functional
# element. It then performs statistical analysis and graphs the results.
# The data is obtained by doing the following in CLC: Tools > Resequencing
# Analysis > Variant Detection > Low Frequency Variant Detection. Keep all
# defaults unless you have something specific you would like to change.
# IMPORTANT: in the 'Output Options' section, check off "Create annotated
# table". Ideally all three output options should be checked off, but the
# annotated table is the most important for this script to work because the
# logic is based on the columns within it.

# Do all the imports
import os
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import numpy as np
import seaborn as sns
from statannotations.Annotator import Annotator
import matplotlib.patches as patches
from scipy.stats import bartlett
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scikit_posthocs import posthoc_tukey
import scipy.stats as stats
from scipy.stats import f_oneway
from io import StringIO
from scipy.stats import gmean

# Locate the CLC exports: every entry of the current working directory whose
# file name ends in 'variants.xlsx'.
path = os.getcwd()
files = os.listdir(path)
xl = [f for f in files if f.endswith('variants.xlsx')]

# Tag every workbook with the sample it came from. A workbook that already
# carries a 'Name' column (e.g. from an earlier run) is left untouched.
for f in xl:
    df = pd.read_excel(f)
    if 'Name' not in df.columns:
        # Put the file's basename in a new first column, then keep only the
        # part before the first '_' as the sample name.
        df.insert(0, "Name", os.path.basename(f))
        df['Name'] = df['Name'].str.split('_').str[0]

        # Write the tagged table back over the original workbook.
        df.to_excel(f, index=False)

# Columns of the CLC annotated table that are carried into the analysis.
clc_columns = ['Name', 'Reference Position', 'Type',
               'Length', 'Reference', 'Allele', 'Coverage', 'Frequency',
               'Forward/reverse balance', 'Average quality']

# Read those columns from every tagged workbook, stack the tables into a
# single dataframe with a fresh index, and save it as a csv in the current
# directory.
df = [pd.read_excel(workbook, usecols=clc_columns) for workbook in xl]
frame = pd.concat(df, axis=0, ignore_index=True)
frame.to_csv('concat_data.csv', index=False)

# Collect the per-sample tsv files (names ending in '.bam.tsv') that were
# produced from the bam files with mpileup.
ts = [f for f in files if f.endswith('.bam.tsv')]

# Column names given to every tsv table once the 'Name' column has been
# inserted in front. Only as many names as the table has columns are used.
header = ['Name', 'Ref Genome', 'Reference Position', 'Ref Allele',
          'Depth', 'Reads', 'Qual']

# Tables are gathered here and concatenated after the loop.
tsf = []

for f in ts:
    # NOTE(review): read_csv treats the first line of the file as a header.
    # If the mpileup output has no header line, the first position of every
    # file is consumed as column names and lost - confirm against the files.
    data = pd.read_csv(f, sep='\t')

    # A table that already has a 'Name' column is skipped entirely (it is not
    # added to the list).
    if 'Name' not in data.columns:
        # Sample name = file basename up to the first '_', stored in column 0.
        data.insert(0, "Name", os.path.basename(f))
        data['Name'] = data['Name'].str.split('_').str[0]
        data.columns = header[:len(data.columns)]
        tsf.append(data)

# Concatenate the list created above into a dataframe.
tsf = pd.concat(tsf, axis=0, ignore_index=True)

# Separate the synaptosome samples ('Syn' in the name) from the homogenate
# samples ('WBH' in the name) into tsf31 and tsf41 respectively, and strip the
# '-Syn' / '-WBH' tag so that the sample names match between fractions.
# Note that replace(..., regex=True) acts on every string column, not only
# on 'Name'.
tsf31 = tsf[tsf['Name'].str.contains('Syn')].replace({'-Syn': ''}, regex=True)
tsf41 = tsf[tsf['Name'].str.contains('WBH')].replace({'-WBH': ''}, regex=True)

# Pivot both tables so that the index is the genome position, the columns are
# the sample names, and the values are the depth.
pivot31 = tsf31.pivot(index='Reference Position', columns='Name',
                      values='Depth')
pivot41 = tsf41.pivot(index='Reference Position', columns='Name',
                      values='Depth')

# The pseudo reference is the geometric mean of the depth across samples at
# each genome position.
# NOTE(review): a depth of 0 at a position makes the geometric mean 0 and the
# ratios below infinite/NaN - confirm zero-depth positions cannot occur.
pivot31['Pseudo Ref'] = gmean(pivot31, axis=1)
pivot41['Pseudo Ref'] = gmean(pivot41, axis=1)

# Ratio of every sample to the pseudo reference. The division also turns the
# 'Pseudo Ref' column into 1s, so the real pseudo reference is copied back in.
# (The original code called reset_index() here without keeping the result,
# which did nothing; the index therefore stays the reference position.)
pivot311 = pivot31.div(pivot31['Pseudo Ref'], axis=0)
pivot411 = pivot41.div(pivot41['Pseudo Ref'], axis=0)
pivot311['Pseudo Ref'] = pivot31['Pseudo Ref']
pivot411['Pseudo Ref'] = pivot41['Pseudo Ref']

# The normalization factor of a sample is the median of its ratio to the
# pseudo reference across all positions. The medians are stored as two-column
# dataframes ('Name', 'Norm_factor'). Because 'Pseudo Ref' is still a column,
# each table also contains a 'Pseudo Ref' row holding the median pseudo
# reference depth.
header = ['Name', 'Norm_factor']
Syn_med = pivot311.median(axis=0).reset_index()
Syn_med.columns = header[:len(Syn_med.columns)]
WBH_med = pivot411.median(axis=0).reset_index()
WBH_med.columns = header[:len(WBH_med.columns)]

# Connect to the MySQL database 'CLC'. Two handles are used: a
# mysql.connector connection for the hand-written statements and a SQLAlchemy
# engine that pandas uses to (re)create the table 'CLC_Concat' from 'frame'.
conn = mysql.connector.connect(
        host="localhost",
        user="elichter",
        database="CLC")

# Create SQLAlchemy engine to connect to MySQL Database
engine = create_engine("mysql+pymysql://elichter@localhost/CLC")
c = conn.cursor()

# Add the columns 'Strain' and 'Gender' and classify the samples from their
# names. In a LIKE pattern '%' matches any run of characters and '_' matches
# exactly one character, so e.g. 'F___%' means "F followed by at least three
# characters".
# NOTE(review): the '_' in '%_m%' / '%_F%' is therefore a wildcard, not a
# literal underscore - confirm this is the intended gender rule.
annotation_statements = (
    'ALTER TABLE CLC_Concat ADD Strain varchar(255)',
    "UPDATE CLC_Concat SET Strain = 'Polg' WHERE Name like 'F___%'",
    "UPDATE CLC_Concat SET Strain = 'Polg-PKO' WHERE Name like 'D___%'",
    "UPDATE CLC_Concat SET Strain = 'PKO\n1Mo' WHERE Name like 'RA___%'",
    "UPDATE CLC_Concat SET Strain = 'Polg-W402A' WHERE Name like 'C___%' AND Name NOT like '%c57%'",
    "UPDATE CLC_Concat SET Strain = 'WT' WHERE Name like '%C57%' AND Name like '%12.%'",
    "UPDATE CLC_Concat SET Strain = 'WT\n1Mo' WHERE Name like '%C57%' AND Name like '%1.%'",
    "ALTER TABLE CLC_Concat ADD Gender VARCHAR (255)",
    "UPDATE CLC_Concat SET Gender = 'Male' WHERE Name like '%_m%' OR Name like '%-m%'",
    "UPDATE CLC_Concat SET Gender = 'Female' WHERE Name like '%_F%' OR Name like '%-F%'",
)

try:
    # Replace any previous 'CLC_Concat' table with the concatenated data.
    frame.to_sql('CLC_Concat', engine, if_exists='replace', index=False)

    # Later statements overwrite earlier ones for names matching several rules.
    for statement in annotation_statements:
        c.execute(statement)

    # Read the annotated table back into df2 (on the same connection, so the
    # updates above are visible), then make them permanent.
    df2 = pd.read_sql("SELECT * From CLC_Concat", conn)
    conn.commit()
finally:
    # Release the database handles even when one of the statements fails.
    conn.close()
    engine.dispose()

# Mutations are mapped to genes through a GENCODE annotation file (gff3).
# Release vM28 is used because it corresponds to the GRCm39 assembly that was
# used for alignment in CLC. (The MitoSAlt program uses vM25 / GRCm38 instead,
# since that is what it downloads automatically; the discrepancies are
# probably negligible.)

# Read the gff3 file as a tab-separated table, ignoring '#' comment text and
# naming the nine standard columns.
gff_columns = ['seqname', 'source', 'feature', 'start', 'end',
               'score', 'strand', 'frame', 'attribute']
gencode = pd.read_table("gencode.vM28.annotation.gff3", comment="#",
                        sep="\t", names=gff_columns)

# Keep only the mitochondrial records (seqname 'chrM') and the columns needed
# downstream, renumbering the rows from 0.
mito_columns = ['start', 'end', 'attribute', 'frame', 'strand', 'score']
on_chrM = gencode.seqname == "chrM"
mito_genes = gencode.loc[on_chrM, mito_columns].reset_index(drop=True)

# Extract gene names and type.
def gene_info(x):
    """Return ``(gene_name, gene_type)`` from a gff3 attribute string.

    The attribute string is a ';'-separated list of ``key=value`` fields.
    Keys are matched exactly (a field that merely contains the text
    'gene_name' is not mistaken for the gene name) and the value is everything
    after the first '=', so values that themselves contain '=' are kept whole.
    When a key occurs more than once, the first occurrence wins.

    Parameters:
        x: the attribute column of one gff3 record, as a string.

    Returns:
        A tuple ``(gene_name, gene_type)`` of strings.

    Raises:
        KeyError: if the record has no 'gene_name' or no 'gene_type' field.
    """
    fields = {}
    for field in x.split(";"):
        key, sep, value = field.partition("=")
        if sep:
            # setdefault keeps the first occurrence of a repeated key.
            fields.setdefault(key.strip(), value)
    return (fields['gene_name'], fields['gene_type'])
# Parse every attribute string and store the two results in the new columns
# 'gene_name' and 'gene_type'.
gene_annotations = mito_genes.attribute.apply(gene_info)
mito_genes["gene_name"], mito_genes["gene_type"] = zip(*gene_annotations)

# Sort by start position, keep the first record of each gene name, renumber
# the rows, and write the table to csv.
mito_genes = (mito_genes
              .sort_values(['start'], ascending=True)
              .drop_duplicates('gene_name', keep='first')
              .reset_index(drop=True))
mito_genes.to_csv('mito_genes.csv', index=False)

# Index the table by the closed interval [start, end] of each gene so that a
# position in df2 can be looked up directly with .loc.
mito_genes.index = pd.IntervalIndex.from_arrays(mito_genes['start'],
                                                mito_genes['end'],
                                                closed='both')

# The function below extracts the gene name and type from mito_genes for the
# gene whose [start, end] interval contains a given 'Reference Position'.
def get_name(d):
    """Return ``(gene_name, gene_type)`` of the gene that contains position d.

    mito_genes is indexed by an IntervalIndex, so ``.loc[d]`` selects the
    row(s) whose interval contains ``d``.

    Parameters:
        d: a reference position.

    Returns:
        A tuple of two strings. ``('', '')`` when no gene interval contains
        ``d``. When several gene intervals overlap at ``d`` the lookup yields
        several rows; the first one in table order is used so that the result
        is always a pair of scalars rather than a pair of Series.
    """
    try:
        hit = mito_genes.loc[d]
    except KeyError:
        # Position lies outside every annotated gene.
        return ('', '')
    if isinstance(hit, pd.DataFrame):
        # Overlapping genes: keep the first matching row.
        hit = hit.iloc[0]
    return (hit['gene_name'], hit['gene_type'])
# Annotate every variant with the name and type of the gene it falls in.
df2['Gene_name'], df2['Gene_type'] = zip(*df2['Reference Position'].apply(get_name))

# The 'Gene_name' column is split on 'mt-' and only the part after it is
# kept (names without 'mt-', including the empty string, become NaN).
df2['Gene_name'] = df2['Gene_name'].str.split('mt-').str[1]

# In order to get the non-coding regions annotated, a bigBed (bb) file was
# downloaded from:
# https://hgdownload.soe.ucsc.edu/gbdb/mm10/ncbiRefSeq/refSeqFuncElems.bb
# This file was then converted to a .bed file using bigBedToBed with the option
# for chromosome set as chrom=chrM, and the file was saved as 'mito_func.bed'.
# This file is now being read in as a dataframe - df10.
df10 = pd.read_csv('mito_func.bed', sep='\t', header=None)

# The headers used in the refseq schema
# (https://genome.ucsc.edu/cgi-bin/hgTables?db=mm10&hgta_group=regulation&hgta_track=refSeqFuncElems&hgta_table=refSeqFuncElems&hgta_doSchema=describe+table+schema)
# are added to the bed file. Only as many names as the file has columns are
# used.
header = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
          'thickStart', 'thickEnd', 'reserved', 'soTerm', 'note',
          'geneIds', 'pubMedIds', 'experiment',
          'function', '_mouseOver']
df10.columns = header[:len(df10.columns)]

# The name of the feature is the part of the 'name' column before the first
# ';'.
df10['name'] = df10['name'].str.split(';', n=1).str[0]

# Index df10 by the interval of each element so that it can be matched with
# the positions in the concatenated dataframe (df2). BED coordinates are
# 0-based and half-open: chromStart is the base *before* the feature and
# chromEnd is its last base in 1-based numbering. A 1-based position p is
# therefore inside the feature when chromStart < p <= chromEnd, which is the
# right-closed interval (chromStart, chromEnd].
# NOTE(review): this assumes 'Reference Position' is 1-based, like the gff3
# coordinates it is also matched against - confirm.
df10.index = pd.IntervalIndex.from_arrays(df10['chromStart'],
                                          df10['chromEnd'], closed='right')

# The function below extracts the element name from df10 for the element
# whose chromStart-chromEnd interval contains a given Reference Position.
def get_element(d):
    """Return the name of the functional element that contains position d.

    df10 is indexed by an IntervalIndex, so ``.loc[d]`` selects the row(s)
    whose interval contains ``d``.

    Parameters:
        d: a reference position.

    Returns:
        The element name, or ``None`` when no element interval contains
        ``d``. When several element intervals overlap at ``d`` the lookup
        yields several rows; the first one in table order is used so that the
        result is always a scalar rather than a Series.
    """
    try:
        hit = df10.loc[d]
    except KeyError:
        # Position lies outside every annotated element.
        return None
    if isinstance(hit, pd.DataFrame):
        # Overlapping elements: keep the first matching row.
        hit = hit.iloc[0]
    return hit['name']
# Annotate every variant with the functional element it falls in.
df2['Gene_element'] = df2['Reference Position'].apply(get_element)

# Drop the strains that are not analysed for now: the 1-month-old WT and PKO
# samples (which may be added back later, based on discussions with Howard)
# and Polg-W402A. The remaining strains are ordered WT, Polg, Polg-PKO by
# making 'Strain' a categorical column.
excluded_strains = ['WT\n1Mo', 'PKO\n1Mo', 'Polg-W402A']
df2 = df2[~df2.Strain.isin(excluded_strains)]
df2['Strain'] = pd.Categorical(df2['Strain'], ['WT', 'Polg', 'Polg-PKO'])

# Keep only the sites that meet the minimum coverage (1000) and minimum
# average quality (30) thresholds.
passes_qc = (df2['Coverage'] >= 1000) & (df2['Average quality'] >= 30)
df2_filt = df2.loc[passes_qc]

# Split into synaptosomes (df31) and homogenates (df41) and remove the
# '-Syn' / '-WBH' tag from the values of each table.
is_syn = df2_filt['Name'].str.contains('Syn')
df31 = df2_filt[is_syn].replace({'-Syn': ''}, regex=True)
is_wbh = df2_filt['Name'].str.contains('WBH')
df41 = df2_filt[is_wbh].replace({'-WBH': ''}, regex=True)

# Normalize the coverage by dividing it by the normalization factor of the
# sample, looked up by sample name.
syn_factor = Syn_med.set_index(['Name'])['Norm_factor']
wbh_factor = WBH_med.set_index(['Name'])['Norm_factor']
df31['Coverage'] = df31['Coverage'] / df31['Name'].map(syn_factor)
df41['Coverage'] = df41['Coverage'] / df41['Name'].map(wbh_factor)

# Scatter plot of heteroplasmy ('Frequency') against mitochondrial position
# for the synaptosome samples (df31), coloured by strain, saved as 'dis.svg'.
# Start a new 15 x 10 inch figure; the scatterplot below draws on it.
plt.figure(figsize=(15,10))
# plot polar axis (disabled)
#ax = plt.subplot(111, polar=True)
g = sns.scatterplot(data=df31, x='Reference Position', y = 'Frequency', hue =
        'Strain')
# NOTE(review): log scaling is disabled here, yet the y label set below still
# says "(log scale)" - confirm which of the two is intended.
#g.set_yscale('log')
plt.legend(loc='upper left')
g.set_ylabel('Heteroplasmy (log scale)')
plt.savefig('dis.svg')

# Four facet-grid figures are drawn: heteroplasmy by mitochondrial position
# and by gene, for the synaptosomes (df31) and for the homogenates (df41).
# Every figure has one panel per strain, stacked in a single column
# (col_wrap=1), and the strains are coloured in the following order.
hue_order = ['WT', 'Polg', 'Polg-PKO']

# One entry per figure:
# (data, figure title, x column, x axis label, panel height, by gene?, file)
facet_jobs = [
    (df31, 'Synaptosomes', 'Reference Position', "Mitocondrial Position",
     2.5, False, 'hist_Syn.svg'),
    (df31, 'Synaptosomes', 'Gene_name', "Gene",
     5, True, 'hist__Gene_Syn.svg'),
    (df41, 'Homogenates', 'Reference Position', "Mitocondrial Position",
     2.5, False, 'hist_WBH.svg'),
    (df41, 'Homogenates', 'Gene_name', "Gene",
     5, True, 'hist__Gene_WBH.svg'),
]

for (facet_data, facet_title, facet_x, facet_xlabel, facet_height,
     facet_by_gene, facet_file) in facet_jobs:
    # aspect=2 makes every panel twice as wide as it is tall.
    X = sns.FacetGrid(data=facet_data, col='Strain', col_wrap=1, despine=True,
                      hue="Strain", hue_order=hue_order, palette="deep",
                      height=facet_height, aspect=2)

    # Draw a histplot of the x column against 'Frequency' in every panel and
    # put the y axis on a log scale; then label the axes in bold.
    X.map(sns.histplot, facet_x, 'Frequency').set(yscale='log')
    X.set_axis_labels(facet_xlabel, "Heteroplasmy\n(log scale)",
                      weight='bold')

    # Gene names are long, so their tick labels are rotated 45 degrees.
    if facet_by_gene:
        [plt.setp(panel.get_xticklabels(), rotation=45)
         for panel in X.axes.flat]

    # Leave room above the panels for a bold figure title.
    X.fig.subplots_adjust(top=0.8)
    X.fig.suptitle(facet_title, fontsize=28, weight='bold')

    # FacetGrid titles default to 'Strain = <name>'; show only the name.
    X.set_titles(col_template='{col_name}', weight='bold')

    # The rotated labels would cut off the x axis title, so the by-gene
    # figures are saved with bbox_inches='tight'.
    if facet_by_gene:
        plt.savefig(facet_file, dpi=600, bbox_inches='tight')
    else:
        plt.savefig(facet_file, dpi=600)

# Export the filtered, coverage-normalized tables.
df31.to_excel('df311_adjust.xlsx', index=False)
df41.to_excel('df411.xlsx', index=False)
# Add a column 'count' to df31 and fill it with 0. Counting the (non-null)
# entries of this column per group gives the number of mutations per group.
df31['count'] = 0

# Number of mutations per sample (df200) and per sample and mutation type
# (df21). observed=True keeps only the strain/sample combinations that really
# occur; without it a categorical 'Strain' makes pandas emit a zero-count row
# for every strain x sample x type combination, which would drag the bar
# heights of the by-type plot towards zero. The index is reset so that the
# grouping variables are available as columns in the graphs.
df200 = df31.groupby(['Strain', 'Name'], observed=True).count().reset_index()
df21 = df31.groupby(['Strain', 'Name', 'Type'],
                    observed=True).count().reset_index()

# Normalize the per-sample counts by the normalization factor.
df200['count'] = df200['count'] / df200['Name'].map(Syn_med.set_index(['Name'])['Norm_factor'])

# Collect the counts of every strain present in df200 and run a one-way
# ANOVA across the strains.
Strain = df200.Strain.unique()
df3 = [df200[df200['Strain'] == s]['count'] for s in Strain]
F = f_oneway(*df3)

# The ANOVA result is turned into a dataframe by reading its string form as a
# one-line csv: the whole text becomes the name of a single, empty column.
F = pd.read_csv(StringIO(str(F)), sep=";")

# Perform groupwise comparisons using Tukey HSD.
tukey = pairwise_tukeyhsd(endog=df200['count'],
                          groups=df200['Strain'],
                          alpha=0.05)

# Save the Tukey table as a dataframe, attach the ANOVA result (as an extra
# column), and export to 'Syn_stats.csv'. pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.
tukey = pd.DataFrame(data=tukey._results_table.data[1:],
                     columns=tukey._results_table.data[0])
tukey = pd.concat([tukey, F])
tukey.to_csv('Syn_stats.csv', index=False)

# To annotate the graph with significance stars, the Tukey p values are
# needed as a plain strain x strain matrix.
tukey_df = posthoc_tukey(df200, val_col="count",
                         group_col="Strain")

# Blank the lower triangle and the diagonal so every comparison appears once,
# then melt the matrix into a long table with the columns 'index', 'variable'
# and 'value' (the p value).
remove = np.tril(np.ones(tukey_df.shape), k=0).astype("bool")
tukey_df[remove] = np.nan
molten_df = tukey_df.melt(ignore_index=False).reset_index().dropna()

# x and y are the columns used in the graphs below.
x = "Strain"
y = 'count'
#order = ['WT', 'Polg', 'Polg-PKO', 'Polg-W402A']
sns.set_style("whitegrid", {'axes.grid' : False})
fig, axes = plt.subplots(1, 2, figsize=(17, 4.5))
fig.suptitle('Synaptosomes', weight='bold')

# Left panel: transparent bars with a dark outline.
ax = sns.barplot(ax=axes[0], data=df200, x=x, y=y,
                 facecolor=(1, 1, 1, 0), edgecolor='.2')
ax.set_title('All Mutations')

# Add a swarmplot on top of the bars to show the individual samples, in a
# dark colour so that the points are easy to spot.
ax = sns.swarmplot(ax=axes[0], data=df200, x=x, y=y, color='.2')
ax.set_ylabel('Number of Mutations')
ax.set_ylim(top=max(df200['count']) + 10)

# Only significant comparisons (p <= 0.05) are annotated. To annotate all
# comparisons, skip this filtering step.
molten_df = molten_df.loc[molten_df['value'] <= 0.05]

# The strain pairs to annotate and their p values, in matching order.
pairs = [(row["index"], row["variable"]) for _, row in molten_df.iterrows()]
p_values = [row["value"] for _, row in molten_df.iterrows()]

# Annotate the graph with stars for the listed pairs. With no significant
# comparison there is nothing to draw, so the annotator is skipped.
if pairs:
    annotator = Annotator(ax, pairs, data=df200, x=x, y=y)
    annotator.configure(text_format="star", loc="inside")
    annotator.set_pvalues_and_annotate(p_values)

# Right panel: mutations separated by type.
ax2 = sns.barplot(ax=axes[1], data=df21, x=x, y=y, hue='Type')
ax2.set_title('Mutations by Type')
ax2.set_ylabel('Number of Mutations')
sns.move_legend(ax2, 'upper left', bbox_to_anchor=(1, 1.025))

plt.savefig('Syn_Mut_Num.png')

# Perform the same operations (from counting the mutations in df41 all the
# way to graphing with annotations) for the homogenates.

# Add a column 'count' to df41 and fill it with 0. Counting the (non-null)
# entries of this column per group gives the number of mutations per group.
df41['count'] = 0

# Number of mutations per sample (df400), per sample and mutation type
# (df211), and per sample and gene (df311). observed=True keeps only the
# strain/sample combinations that really occur; without it a categorical
# 'Strain' makes pandas emit a zero-count row for every combination of the
# grouping values, which would drag the bar heights of the by-type plot
# towards zero. The index is reset so that the grouping variables are
# available as columns in the graphs.
df400 = df41.groupby(['Strain', 'Name'], observed=True).count().reset_index()
df211 = df41.groupby(['Strain', 'Name', 'Type'],
                     observed=True).count().reset_index()
df311 = df41.groupby(['Strain', 'Name', 'Gene_name'],
                     observed=True).count().reset_index()

# Normalize the per-sample counts by the normalization factor.
df400['count'] = df400['count'] / df400['Name'].map(WBH_med.set_index(['Name'])['Norm_factor'])

# Collect the counts of every strain present in df400 and run a one-way
# ANOVA across the strains.
Strain = df400.Strain.unique()
df6 = [df400[df400['Strain'] == s]['count'] for s in Strain]
F = f_oneway(*df6)

# The ANOVA result is turned into a dataframe by reading its string form as a
# one-line csv: the whole text becomes the name of a single, empty column.
F = pd.read_csv(StringIO(str(F)), sep=";")

# Perform groupwise comparisons using Tukey HSD.
tukey = pairwise_tukeyhsd(endog=df400['count'],
                          groups=df400['Strain'],
                          alpha=0.05)

# Save the Tukey table as a dataframe, attach the ANOVA result (as an extra
# column), and export to 'WBH_stats.csv'. pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.
tukey = pd.DataFrame(data=tukey._results_table.data[1:],
                     columns=tukey._results_table.data[0])
tukey = pd.concat([tukey, F])
tukey.to_csv('WBH_stats.csv', index=False)

# To annotate the graph with significance stars, the Tukey p values are
# needed as a plain strain x strain matrix.
tukey_df = posthoc_tukey(df400, val_col="count",
                         group_col="Strain")

# Blank the lower triangle and the diagonal so every comparison appears once,
# then melt the matrix into a long table with the columns 'index', 'variable'
# and 'value' (the p value).
remove = np.tril(np.ones(tukey_df.shape), k=0).astype("bool")
tukey_df[remove] = np.nan
molten_df = tukey_df.melt(ignore_index=False).reset_index().dropna()

# x and y are the columns used in the graphs below.
x = "Strain"
y = 'count'
#order = ['WT', 'Polg', 'Polg-PKO', 'Polg-W402A']
sns.set_style("whitegrid", {'axes.grid' : False})
fig, axes = plt.subplots(1, 2, figsize=(17, 4.5))
fig.suptitle('Homogenates', weight='bold')

# Left panel: transparent bars with a dark outline.
ax = sns.barplot(ax=axes[0], data=df400, x=x, y=y,
                 facecolor=(1, 1, 1, 0), edgecolor='.2')
ax.set_title('All Mutations')

# Add a swarmplot on top of the bars to show the individual samples, in a
# dark colour so that the points are easy to spot.
ax = sns.swarmplot(ax=axes[0], data=df400, x=x, y=y, color='.2')
ax.set_ylabel('Number of Mutations')
ax.set_ylim(top=max(df400['count']) + 10)

# Only significant comparisons (p <= 0.05) are annotated. To annotate all
# comparisons, skip this filtering step.
molten_df = molten_df.loc[molten_df['value'] <= 0.05]

# The strain pairs to annotate and their p values, in matching order.
pairs = [(row["index"], row["variable"]) for _, row in molten_df.iterrows()]
p_values = [row["value"] for _, row in molten_df.iterrows()]

# Annotate the graph with stars for the listed pairs. With no significant
# comparison there is nothing to draw, so the annotator is skipped.
if pairs:
    annotator = Annotator(ax, pairs, data=df400, x=x, y=y)
    annotator.configure(text_format="star", loc="inside")
    annotator.set_pvalues_and_annotate(p_values)

# Right panel: mutations separated by type.
ax2 = sns.barplot(ax=axes[1], data=df211, x=x, y=y, hue='Type')
ax2.set_title('Mutations by Type')
ax2.set_ylabel('Number of Mutations')
sns.move_legend(ax2, 'upper left', bbox_to_anchor=(1, 1.025))

# Save the figure as WBH_Mut_Num.png
plt.savefig('WBH_Mut_Num.png')