Automation_scripts/CRMs.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# This program will classify duplications as CRMs based on criteria outlined
# below. To do so, first the following will be imported.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from matplotlib_venn import venn2
import numpy as np

# The file containing the duplications will be read in as a dataframe 'df'.
df = pd.read_excel('Synaptosome_Duplications.xlsx')

# CRMs will be defined as those where the final start and end are between
# 14,000 and 16,299, and the final size is </= 2300.
# Series.between is inclusive on both ends by default, so the upper position
# bound stated in the criteria is enforced together with the lower one.
CRM_MIN_POS = 14000
CRM_MAX_POS = 16299
CRM_MAX_SIZE = 2300
is_crm = (
    df['final.start'].between(CRM_MIN_POS, CRM_MAX_POS)
    & df['final.end'].between(CRM_MIN_POS, CRM_MAX_POS)
    & (df['final.size'] <= CRM_MAX_SIZE)
)
CRMs = df.loc[is_crm]

# Number of CRM rows per strain (not used further in this script).
group = CRMs.groupby(['Strain']).size().reset_index()

# Count table: one row per distinct duplication (start/end/size/sequences),
# one column group per Strain/sample; combinations with no entries become 0.
CRMs1 = CRMs.pivot_table(index = (['final.start', 'final.end','final.size',
    'seq1', 'seq2','seq']), columns =(['Strain', 'sample']),aggfunc='count',
    fill_value=0).reset_index()

# To calculate avg size (pivot_table aggregates with the mean by default).
CRMs2 = CRMs.pivot_table(index = (['final.start', 'final.end']),
    columns=(['Strain', 'sample']), values = 'final.size')

# Same layout as the size table, but aggregating the 'heteroplasmy' column.
CRMs3 = CRMs.pivot_table(index = (['final.start', 'final.end']),
    columns=(['Strain', 'sample']), values = 'heteroplasmy')

CRMs3.to_excel('heteroplasmy_pivot.xlsx')
CRMs2.to_excel('size_pivot.xlsx')
CRMs1.to_excel('pivot.xlsx')

# Bar chart of the number of CRM rows per strain.
sns.countplot(x='Strain',data=CRMs)
plt.savefig('crms.png')
# Close the figure so that later pyplot calls do not draw on top of this plot.
plt.close()

# Kmeans stuff
# Cluster the CRMs by final size. KMeans expects a 2-D array of shape
# (n_samples, n_features), hence the reshape to a single feature column.
X = np.array(CRMs['final.size'])
X = X.reshape(-1,1)
kmeans = KMeans(n_clusters=5,random_state=0).fit(X)
X_u = np.unique(X)

# Start a fresh figure so the cluster plot is not drawn on top of whatever
# pyplot figure is still open from earlier in the script.
plt.figure()
# Plot the distinct sizes of each cluster as one labelled series, so that the
# colours reflect the fitted clusters and the legend has entries to show.
for cluster_id, center in enumerate(kmeans.cluster_centers_.ravel()):
    members = np.unique(X[kmeans.labels_ == cluster_id])
    plt.scatter(members, np.zeros_like(members),
                label='cluster {} (center {:.0f})'.format(cluster_id, center))
plt.legend()
plt.savefig('clusters.png')
plt.close()
# Kmeans stuff##

# Write the filtered CRM rows themselves, without the dataframe index.
CRMs.to_excel('CRMs.xlsx', index=False)