-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAneuploid_Score.R
225 lines (174 loc) · 10.4 KB
/
Aneuploid_Score.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
### Calculate aneuploidy score(s) per cell line
### 200508
### Klaske Schukken
### Goal: take aneuploidy data per cell line from Uri Ben-David
### and give each cell line an aneuploidy score
### Make a .csv or .txt file of score.
### than later (in other script) compare to sgRNA dropouts. and
### find corrolation between aneuploidy & genes?
### update 210216 - added Gene # Score
library('plot.matrix')# for making colored plots of aneuploid cells
library('ggplot2')
library('tidyverse')
library('xlsx')
### Step 1: get data ####
#Import aneuploid arm call data from Cohen-Sharir et al. 2021 Nature paper
setwd()
aneuploid<- read.csv("arm_calls_data_bendavid.csv", header=TRUE)
# make dataframe of chrm+arm, bp_arm, and gene_arm
# make this in excel, save as .csv, then upload to R
# bp data estimated from NIH genome viewer
# gene data gotten from HGNC, genes with protein products only. https://www.genenames.org/
bp_arm<- read.csv("bp_per_arm.csv", header=TRUE)
# gene data gotten from HGNC, genes with protein products only. https://www.genenames.org/
#get HGNC gene data. info per gene, including chrm loci and type of gene
HGCN_gene_data<- read.csv("HGNC.csv", header=TRUE)
### Step 2: calculate aneuploidy score(s) per chrm arm ####
## add score for aneuploid chrm arms. column 8.
## count number of aneuploid arms per cell. note: only last row has number of aneuploid arms
#There are 39 chromosome arms per cell line, 997 cell lines
# to do: make table of arm score: sum of aneuploid arms per cell line
# then merge by cell line, adding proper names
arm_Score<-data.frame(tapply(abs(aneuploid$arm_call), aneuploid$DepMap_ID, FUN=sum))
arm_Score <- cbind(rownames(arm_Score), data.frame(arm_Score, row.names=NULL))
arm_Score<- arm_Score %>% rename(DepMap_ID='rownames(arm_Score)',
arm_Score='tapply.abs.aneuploid.arm_call...aneuploid.DepMap_ID..FUN...sum.')
aneuploid<-merge(x= aneuploid, y= arm_Score,
by.x="DepMap_ID", by.y="DepMap_ID", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$chrom),]
head(aneuploid)
nrow(aneuploid)/39
### Now I want to make a score based on basepairs of chrmosome.
## so an aneuploid chrm 1 is bigger than aneuploid chrm 23
## this will be bp_Score
## and I want to take into account ploidy
## so that diploid + 1p is bigger than octoploid+1p.
## this will be bp_ploidy_Score
# Step 1: add collumn to aneuploid dataframe: chrom + arm
aneuploid$Chrm_Arm<- paste0(aneuploid$chrom, aneuploid$arm) #new collumn with chrm_arm: "1_p"
# Step 2: make dataframe of chrm+arm, bp_arm, and gene_arm
# make this in excel, save as .csv, then upload to R... probably easier.
# bp data estimated from NIH genome viewer
# gene data gotten from HGNC, genes with protein products only. https://www.genenames.org/
bp_arm$Chrm_Arm<- gsub("_", "", bp_arm$Chrm_Arm)
# Step 3: combine dataframes by chrm_arm. so aneuploid has row of arm bp
aneuploid<-merge(x= aneuploid, y= bp_arm,
by.x="Chrm_Arm", by.y="Chrm_Arm", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$Chrm_Arm),]
###Step 4: bp_aneuploid: arm_call*bp_arm per row
# sum bp_aneuploid per cell line for bp_Score
aneuploid <- aneuploid %>% mutate(bp_aneuploid = abs(wmed_CN - round(ploidy)) * bp_arm)
head(aneuploid)
# to do: make table of bp score: sum of aneuploid bp per cell line
# then merge by cell line, adding proper names
bp_Score<-data.frame(tapply(aneuploid$bp_aneuploid, aneuploid$DepMap_ID, FUN=sum))
bp_Score <- cbind(rownames(bp_Score), data.frame(bp_Score, row.names=NULL))
bp_Score<- bp_Score %>% rename(DepMap_ID='rownames(bp_Score)',
bp_Score='tapply.aneuploid.bp_aneuploid..aneuploid.DepMap_ID..FUN...sum.')
aneuploid<-merge(x= aneuploid, y= bp_Score,
by.x="DepMap_ID", by.y="DepMap_ID", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$Chrm_Arm),]
head(aneuploid)
### Step 5: bp_ploidy_aneuploid:
#( (wmed_CN - round(ploidy)) * bp_arm)/round(ploidy)
# sum bp_ploidy_aneuploid per cell line for bp_ploidy_Score
aneuploid <- aneuploid %>% mutate(bp_ploidy_aneuploid = abs( (wmed_CN - round(ploidy)) * bp_arm)/round(ploidy))
# to do: make table of bp anueploid score: sum of aneuploid bp (divided by ploidy) per cell line
# then merge by cell line, adding proper names
bp_ploidy_Score<-data.frame(tapply(aneuploid$bp_ploidy_aneuploid, aneuploid$DepMap_ID, FUN=sum))
bp_ploidy_Score <- cbind(rownames(bp_ploidy_Score), data.frame(bp_ploidy_Score, row.names=NULL))
bp_ploidy_Score<- bp_ploidy_Score %>% rename(DepMap_ID='rownames(bp_ploidy_Score)',
bp_ploidy_Score='tapply.aneuploid.bp_ploidy_aneuploid..aneuploid.DepMap_ID..FUN...sum.')
aneuploid<-merge(x= aneuploid, y= bp_ploidy_Score,
by.x="DepMap_ID", by.y="DepMap_ID", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$Chrm_Arm),]
head(aneuploid)
###
### Step 6: gene_aneuploid: arm_call*gene_arm per row
#sort data: only genes with protein products, and make chrm arm collumn
ProteinCodingGenes<- HGCN_gene_data %>% filter(Locus.Type== "gene with protein product")
ProteinCodingGenes$arm <- gsub('[0-9]+', '', ProteinCodingGenes$'Chromosome.band')
ProteinCodingGenes$arm <- gsub('[.]', '', ProteinCodingGenes$arm)
ProteinCodingGenes$arm <- str_sub(ProteinCodingGenes$arm, -1, -1) #start & end on character 1. get only p or q
ProteinCodingGenes$Chrm.Num.Arm<- paste0(ProteinCodingGenes$chromosome, ProteinCodingGenes$arm)
chrmList<-c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22") #list all chrm locations I want (get rid of "NA" and "random")
armList<- c("p", "q") # list all chrm arms I want (to get rid of "NA", and "s", and "d")
GenesOnChrm<- subset(ProteinCodingGenes, arm %in% armList) #19012 prot coding genes. filter good arm
GenesOnChrm<- subset(GenesOnChrm, chromosome %in% chrmList) #18048 prot coding genes, filter good chrm
GenesOnChrm<- GenesOnChrm[GenesOnChrm$Chrm.Num.Arm != "14p", ]
GenesOnChrm<- GenesOnChrm[GenesOnChrm$Chrm.Num.Arm != "21p", ] #18043 prot coding genes, filter out 14p and 21p
gene_arm<- data.frame(table(GenesOnChrm$Chrm.Num.Arm)) #make data frame with number of genes per chrm arm
gene_arm<- gene_arm %>% rename(Chrm.arm=Var1, Gene.Num=Freq)
### Good, now I have a list of protein coding genes per chromosome arm
## can now make aneuploidy score based on number of genes.
#first: add number of genes per chrm arm to data
aneuploid<-merge(x= aneuploid, y= gene_arm,
by.x="Chrm_Arm", by.y="Chrm.arm", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$Chrm_Arm),]
#make new category: aneuploid genes per aneuploid chrm arm. (0 if not aneuploid)
# I am basing this on arm call, so 2 chrm gain is same as 1. but 2 chrm gain usually only happens in
# polyploid background, so this takes ploidy into account more?
aneuploid <- aneuploid %>% mutate(gene_aneuploid = abs(arm_call) * Gene.Num)
# make list of aneuploid score per cell, sum of aneuploid genes per Cell ID
Gene_Score_perCell<-data.frame(tapply(aneuploid$gene_aneuploid, aneuploid$DepMap_ID, FUN=sum))
Gene_Score_perCell <- cbind(rownames(Gene_Score_perCell), data.frame(Gene_Score_perCell, row.names=NULL))
Gene_Score_perCell<- Gene_Score_perCell %>% rename(DepMap_ID='rownames(Gene_Score_perCell)',
Gene_Score='tapply.aneuploid.gene_aneuploid..aneuploid.DepMap_ID..FUN...sum.')
aneuploid<-merge(x= aneuploid, y= Gene_Score_perCell,
by.x="DepMap_ID", by.y="DepMap_ID", sort = FALSE)
aneuploid <- aneuploid[order(aneuploid$DepMap_ID, aneuploid$Chrm_Arm),]
aneuploid$Gene_ploidy_Score<- aneuploid$Gene_Score / aneuploid$ploidy
### Step 3: Make new dataframe with aneuploidy scores/cell #####
###Make new dataframe from the last row of each cell line,
## with the total aneuploidy score
## then cut out un-needed collumns
list<- c()
for (i in 1:997){
list<- c(list, i*39)
}# Make a list of values for row numbers with true aneuploid score
#the last row for each cell line (39th row), (for chrmosome 9q)
Score<- aneuploid[c(list),] #make a list of aneuploid scores/cell (only one row per cell)
head(Score)
#ggplot(Score,aes(ploidy)) +
# geom_bar() # checked ploidy distribution. mostly diploid & triploid.
#make new data frame with them
#then, delete unneccessary rows: chrm, arm, wmed_CN, arm_call
#If you want to try categorical data, you can. but then it's harder
#to do statistical analysis
#Score$Category<- "Aneuploid"
#Score$Category[Score$bp_ploidy_Score<250000000]= "near euploid"
#Score$Category#235 near euploid, 762 aneuploid. ~24%
#Score$Strict<- "Aneuploid"
#Score$Strict[Score$bp_ploidy_Score==0]= "euploid"
#Strict : 34 euploid, 963 aneuploid, ~3% euploid
Score<- Score[ , -which(names(Score) %in% c("wmed_CN", "Chrm_Arm",
"arm_call", "chrom", "arm",
"bp_arm", "bp_ploidy_aneuploid",
"bp_aneuploid", "X", "Chrm_Arm",
"Gene.Num", "gene_aneuploid"))]
head(Score)
write.csv(Score,
"Aneuploidy.Score.perCell.csv", row.names = TRUE)
#or upload xlsx file Aneuploidy.Score.perCell.xlsx
Score<- read_xlsx("Aneuploidy.Score.perCell.xlsx")
# 6.6% of cell lines tested are euploid. 93% are aneuploid to varying degrees.
# of which, 32 (3.2% total) have partial chrm arm gain
# all euploid cells are diploid.
# Score[Score$Gene_ploidy_Score==0,]
### Score data info:
# ploidy is mean chromosome count
# arm_Score is sum of aneuploid chromosome arms.
# bp_Score is aneuploid abs(CN of chrm - round(ploidy)) * length of arm (in bp)
# This will give you a score if you have only part of chrom gained
# bp_ploidy_Score is aneuploid bp_Score/ ploidy
# so that it accounts for gene dosage imbalance:
# ex. tetraploid needs 2x aneuploid chrm for same imbalance.
# Gene_Score is aneuploid chrm arm * genes on arm
# only genes with known protein coding genes
# downloaded data 210217
# from HGNC. https://www.genenames.org/
# Gene_ploidy_Score is aneuploid chrm arm * genes on arm/ ploidy **!!! This is the one used as cellular aneuploidy score
# so that it accounts for gene dosage imbalance:
# ex. tetraploid needs 2x aneuploid chrm for same imbalance.