-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathVariance_aneuploidy.R
564 lines (458 loc) · 27.5 KB
/
Variance_aneuploidy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
###### Calculating gene neutral ploidy coefficient variance (RNA and Protein) #####
## Variance aneuploidy.R
## Protien/RNA Variance: corrolate with aneuploidy-induce expression difference?
## By Klaske Schukken
## 210302
## Update 210504: only do genes with minimum 10 cells per category.
###
# I will calculate Protein (and/or RNA) neutral-ploidy coefficient of variance
# coefficient of variance= standard deviation of protein (and/or RNA), divided by mean expression,
# in only cells with "neutral ploidy" for the chrm the gene is located on
# (cells not-aneuploid for arm of chromosome gene is located on)
# then corrolate variance score with expression difference upon chrm gain and/or loss.
# Final result: correlation coefficient and p-value for difference vs. variance scores
# also plot difference vs variance scores.
# Expect: low variance == low difference upon aneuploidy (buffered)
# high varaince == high difference upon aneuploidy (Scaling and anti scaling)
## Protein expression data
## https://www.cell.com/cell/fulltext/S0092-8674(19)31385-6#secsectitle0190
## Nusinow et al. Cell, 2020, Quantitative proteomics of the cancer cell line encyclopedia
## chromosome arm data : Cohen-Sharir (2021) Nature paper
## RNA expression data: CCLE depmap data.
library('ggplot2')
library('tidyverse')
library('xlsx')
library('readxl')
library('reshape2')
library('BBmisc')
### Steps:
# 1) Get filtered data (RNA & Protein & aneuploidy)
# only cells that have aneuploudy, RNA and Protein data
# get data about location of each gene
# get data about difference upon gain or loss, per gene
# 2) Per gene: calculate varaince:
# i) find gene location (chrm & arm)
# ii) get cells neutral-ploidy for that arm
# iii) calculate coefficient of variance in neutral-ploidy cells, per gene
# 3) Correlate variance with difference per gene
# for loss and gain
# get p-value and correlation coefficient
# 4) Plot difference (loss or gain) vs. variance (within diploid)
###### Step 1: get data #####
### Get filtered data (RNA & Protein & aneuploidy)
# only cells that have aneuploudy, RNA and Protein data
# get data about location of each gene?
# get data about difference upon chrm gain or diploid, per gene?
#### Protein expression data
# filtered to only get data from cells that have RNA and Protein expression
## Protein info: 12755 genes
setwd("/Documents")# ! Set working directory
Protein_Info3<-read_csv("Protein_location_info.csv")
Protein_ProID<-read_csv("Protein_ID_info.csv")
#Get updated Protein info with only 371 cell lines (cell lines also have RNA Data)
# protein data 12755 Proteins
Protein.Expression.filtered<-read.delim2("Protein_Expression_filtered.csv",
dec=",", header = TRUE, sep=";")
### RNA expression data
# filtered to only get data from cells that have RNA and Protein expression
# Import RNA expression data (no aneuploidy data). CCLE depmap data.
# 371 cell lines
# 19 144 RNAs
# RNA Info. names, chrm location, etc.
# 40 674 RNAs info
RNA_Info<-read_excel("HGNC_Protein_Info_edited.xlsx", sheet ="HGNC names")
RNA_Info2<-RNA_Info[,c(2,3,11,12,22,14)]
# Get updated RNA expression with only 371 cell lines (cell lines also have RNA Data)
# 19 144 RNAs expression
RNA.Expression.filtered<-read.delim2("RNA_Expression_filtered.csv",
dec=",", header = TRUE, sep=";")
####get cell line aneuploidy data:
aneuploid<- read.csv("arm_calls_data_bendavid.csv", header=TRUE)
# Protein / RNA expression difference upon chrm arm gain/loss, per gene
# from Protein_RNA_expression.PerCell_v2.R
# or from supplementary data, difference data
CN.Diff.xRNA.yProt.ThreeGroups <- read.csv("RNA.Protein_loss.neutral.gain_Difference_Pvalue_min10points_3cat.csv",
header=TRUE) #9,414 genes
# Note: loss of about 2k genes (protein ID)--
# due to needing both RNA and Protein expression/gene
# and due to needing 3+ cell line with chrm gain/loss for chrm arm of gene location.
## Notes! :
# VarMean is Variance/Mean, also known as coefficient of variance
# Var.Mean.noAn = is coefficient of variance taken from only cells with neutral ploidy for the gene's chrm location
###### Step 2) Per Protein: calculate coefficient of varaince (neutral ploidy cells only): ####
# 2) Per Protein: calculate varaince:
# i) find gene location (chrm & arm)
# ii) get cells diploid for that arm
# iii) calculate variance (&SD?) of expression in diploid cells, per gene
# var(x) # x= data to get variance of
# sd (x) # x= data to get standard deviation of.
# var(x)/mean(x) = coeffiecient of variance
# sapply(data, var) # do var() to each collumn? of data
### Protein variance:
### Using only 371 cell lines that have both RNA and Protein data (only 367 also with aneuploidy data)
## Substep: 1) find cells with no gain/loss of chromosome X, arm p/q
## substep: 2) find location of each protein, chromosome and arm location
## substep: 3) find variance, coefficient of varance, & st. dev. in protein expression per protein, in diploid cells
## substep: 4) add variance, coefficient of varance & st. dev. per protein into a dataframe
## substep: 5) repeat for all chromosome arms.
## substep: 6) plot coefficient of varance vs st. dev. of all proteins
#Set up data
AllChrmArms<- data.frame(Chrm=c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,14,15,16,16,17,17,18,18,19,19,20,20,21,22),
Arm=c("p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q",
"q","q","q","p","q","p","q","p","q","p","q","p","q","q","q"))
## make a list of protein info per protein_ID, so I can sort by prot location later.
# only need to do this once, then use data in for loop
Protein.colnames<-data.frame(Protein_ID=colnames(Protein.Expression.filtered)) #1 col, 12757 genes
# add more protein info per protein. Add Uniprot_Acc and Gene_Symbol
Protein.onChrm1<-merge(x= Protein.colnames, y= Protein_ProID,
by.x="Protein_ID", by.y="Protein_Id",
sort = TRUE)# 7 collumns, 12755 genes
#Find genes with same gene_Symbol
Protein.onChrm2<-merge(x= Protein.onChrm1, y= Protein_Info3,
by.x="Gene_Symbol", by.y="Approved Symbol",
sort = TRUE)# 10 collumns, 12467 genes
#Of genes without matching gene symbol...
No_Gene_Symbol <- anti_join(Protein.onChrm1, Protein_Info3, #finding genes_Symbol without match
by = c("Gene_Symbol" = "Approved Symbol"))
#...find the genes with matching uniprot_ID
Protein.onChrm3<-merge(x= No_Gene_Symbol, y= Protein_Info3,
by.x="Uniprot_Acc", by.y="UniProt ID(supplied by UniProt)",
sort = TRUE)# 10 collumns, 253 genes, only no gene-symbol genes with uniprot_ID
# Merge genes with gene symbol and genes with only uniprot ID.
Protein.onChrm4<-merge(x= Protein.onChrm2, y= Protein.onChrm3,
all=TRUE)# 12 collumns, 12720 genes
# make Chrm number & arm categories, and group by chrm num/arm
Protein.onChrm4$arm <- gsub('[0-9]+', '', Protein.onChrm4$'Chromosome band')
Protein.onChrm4$arm <- gsub('[.]', '', Protein.onChrm4$arm)
Protein.onChrm4$arm <- str_sub(Protein.onChrm4$arm, -1, -1) #start & end on character 1. get only p or q
Protein.onChrm4$Chrm.Num.Arm<- paste0(Protein.onChrm4$Chromosome, Protein.onChrm4$arm)
# 10 collumns, 12720 genes
# Set up empty datat frame to fill in with loop:
Protein.var.sd<-data.frame(Protein_ID=as.character(),
Protein_ID.All=as.character(),
Variance.noAn=numeric(),
Variance.Mean.noAn=numeric(),
Variance.All=numeric(),
St.Dev.noAn= numeric(),
St.Dev.All= numeric(),
Chrm.Num.Arm= as.character(),
stringsAsFactors = TRUE)
###Run data below
##Repeat task for all chromosome arm gains:
for (i in 1:length(AllChrmArms$Chrm)) {
TestChrm<-AllChrmArms$Chrm[i]
TestArm<-AllChrmArms$Arm[i]
TestChrmArm<-paste0(TestChrm, TestArm)
#subset 1: Find cells with no gain/loss of ChrmArm, and test cells
testchrm.percell <- filter(aneuploid, chrom==TestChrm)
testchrm.percell <- filter(testchrm.percell, arm==TestArm)
Cells.Di.forchrm<- filter(testchrm.percell, arm_call==0) #all cells diploid for testProt chrm
# SubStep 2: get proteins located on TestChrmArm
# First get protein info, loci, then subset protein expression data for testarm loci only
# then link Protein values with location
Protein.onChrm5<-Protein.onChrm4 #this is Protein info only, not Prot expression
Protein.onChrm5<-subset(Protein.onChrm5, Chrm.Num.Arm == TestChrmArm) #get only Prot on chrm arm
#Now subset RNA expression data with only chromosome arm of interest.
Protein.Expression.subset <- Protein.Expression.filtered %>% select(Protein.onChrm5$Protein_ID)
Protein.Expression.subset$Cell_line<- Protein.Expression.filtered$Cell_line #add back cell info
#Substep 3: get difference in protein expression between diploid & test cells
#Now get list of protein expression in cells disomic & test cells for test ChrmArm
Protein.Di.Cells<-merge(y= Protein.Expression.subset, x= Cells.Di.forchrm,
by.y="Cell_line", by.x="DepMap_ID",
sort = TRUE)# 368 cells, 12757 proteins
# for each protein, find mean gain, mean Di, difference
Var.PerProtein <- data.frame(Protein_ID=as.character(),
Protein_ID.All=as.character(),
Variance.noAn=numeric(),
Variance.Mean.noAn=numeric(),
Variance.All=numeric(),
St.Dev.noAn= numeric(),
St.Dev.All= numeric(),
Chrm.Num.Arm= as.character(),
stringsAsFactors = TRUE) #this is needed to prevent errors
for (t in 8:length(Protein.Di.Cells)) {
#Get difference in exp between cell line x and mean no gain/loss cells cells, per protein
#protein express subset does not have 7 extra info columns, so add -7 to get same Protein_ID
Var.PerProtein<- rbind(Var.PerProtein,
data.frame(Protein_ID=colnames(Protein.Di.Cells[t]),
Protein_ID.All=colnames(Protein.Expression.subset[t-7]),
Variance.noAn=var(Protein.Di.Cells[,t], na.rm=TRUE),
Variance.Mean.noAn=var(Protein.Di.Cells[,t], na.rm=TRUE)/abs(mean(Protein.Di.Cells[,t], na.rm=TRUE)),
Variance.All=var(Protein.Expression.subset[,t-7], na.rm=TRUE),
St.Dev.noAn= sd(Protein.Di.Cells[,t], na.rm=TRUE),
St.Dev.All= sd(Protein.Expression.subset[,t-7], na.rm=TRUE),
Chrm.Num.Arm= TestChrmArm))
}
##
# Get mean protein expression difference by chrm num/arm category
#Variance.Mean.noAn= variance divided by mean expression on non-aneuploid chromosomes
Protein.var.sd<-rbind(Protein.var.sd, Var.PerProtein)
print(paste0("finished with chrm arm ", TestChrmArm))
}
### Now plot the heatmap for chromosome Protein chrm Gain changes.
length(Protein.var.sd$Protein_ID) #12 149 genes, 502 X, 42 Y, 15 other, 12 missing?
write.csv(Protein.var.sd,
file= "Protein.Variance.SD.VarMean.csv")
#Protein.var.sd<-read.delim2("Protein.Variance.SD.VarMean.csv",
# dec=".", header = TRUE, sep=",")
ggplot(Protein.var.sd, aes(x=Variance.noAn, y=Variance.Mean.noAn))+ #5x5 figure
geom_point()+
xlab("Protein Varance \n(cells without gene loci gain/loss)")+
ylab("Protein Coefficient of Variance \n(cells without gene loci gain/loss)")+
theme_classic()+
ggtitle("Protein variance and standard deviation \nwithin cells without gene loci aneuploidy")
###### Step 2.5) Per RNA: calculate coefficient of varaince (neutral ploidy cells only): ####
# step 2.5) Per RNA: calculate varaince:
# i) find gene location (chrm & arm)
# ii) get cells diploid for that arm
# iii) calculate variance (&SD?) of expression in diploid cells, per gene
### RNA variance:
## Step 2.5: get Variance and SD for RNA, in diploid-only and in all cells
## Substep: 1) find cells with no gain/loss of chromosome X, arm p/q
## substep: 2) find location of each protein, chromosome and arm location
## substep: 3) find variance & st. dev. in protein expression per protein, in diploid cells
## substep: 4) add variance & st. dev. per protein into a dataframe
## substep: 5) repeat for all chromosome arms.
## substep: 6) plot variance vs st. dev. of all proteins
#Set up data
AllChrmArms<- data.frame(Chrm=c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,14,15,16,16,17,17,18,18,19,19,20,20,21,22),
Arm=c("p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q","p","q",
"q","q","q","p","q","p","q","p","q","p","q","p","q","q","q"))
#Before running, make RNA data for all RNA Expression data
#with chrom arm info and RNA_ID equal to colnames on RNA expression data.
#Make RNA expression names into data frame with ID and name
RNA.colnames<-data.frame(RNA_ID=colnames(RNA.Expression.filtered))
RNA.colnames$RNA_Name<-sub("[..].*", "", as.character(RNA.colnames$RNA_ID))
RNA.colnames$RNA_num<-sub(".*[.]{2}", "", as.character(RNA.colnames$RNA_ID))
RNA.colnames$RNA_num<-sub("[.]", "", as.character(RNA.colnames$RNA_num))
#add more RNA info per RNA. Add Uniprot_Acc and Gene_Symbol
RNA.onChrm1<-merge(x=RNA.colnames, y= RNA_Info2,
by.x="RNA_Name", by.y="Approved Symbol",
sort = TRUE)# 8 collumns, 18591 genes
#Of genes without matching gene symbol...
No_Gene_Name <- anti_join(RNA.colnames, RNA_Info2, #finding genes_Symbol without match
by = c("RNA_Name" = "Approved Symbol"))
#...find the genes with matching uniprot_ID
RNA.onChrm2<-merge(x= No_Gene_Name, y= RNA_Info2,
by.x="RNA_num", by.y="Entrez Gene ID",
sort = TRUE)# 462 genes
# Merge genes with gene symbol and genes with only uniprot ID.
RNA.onChrm3<-merge(x= RNA.onChrm1, y= RNA.onChrm2,
all=TRUE)# 9 collumns, 19053 genes
# make Chrm number & arm categories, and group by chrm num/arm
RNA.onChrm3$arm <- gsub('[0-9]+', '', RNA.onChrm3$'Chromosome band')
RNA.onChrm3$arm <- gsub('[.]', '', RNA.onChrm3$arm)
RNA.onChrm3$arm <- str_sub(RNA.onChrm3$arm, -1, -1) #start & end on character 1. get only p or q
RNA.onChrm3$Chrm.Num.Arm<- paste0(RNA.onChrm3$Chromosome, RNA.onChrm3$arm)
RNA.onChrm3$Chrm.Num.Arm<-as.factor(RNA.onChrm3$Chrm.Num.Arm) #19 053 genes
#RNA.onChrm3 is used below in loop to subsection RNA_ID by chrm arm
# Set up empty data frame to fill in with loop:
RNA.var.sd<-data.frame(RNA_ID=as.character(),
RNA_ID.All=as.character(),
Variance.noAn=numeric(),
Variance.Mean.noAn=numeric(),
Variance.All=numeric(),
St.Dev.noAn= numeric(),
St.Dev.All= numeric(),
Chrm.Num.Arm= as.character(),
stringsAsFactors = TRUE)
###Run data below
##Repeat task for all chromosome arm gains:
for (i in 1:length(AllChrmArms$Chrm)) {
TestChrm<-AllChrmArms$Chrm[i]
TestArm<-AllChrmArms$Arm[i]
TestChrmArm<-paste0(TestChrm, TestArm)
#subset 1: Find cells with no gain/loss of ChrmArm, and test cells
testchrm.percell <- filter(aneuploid, chrom==TestChrm)
testchrm.percell <- filter(testchrm.percell, arm==TestArm)
Cells.Di.forchrm<- filter(testchrm.percell, arm_call==0) #all cells diploid for testProt chrm
# SubStep 2: get RNAs located on TestChrmArm
# First get RNA info, loci, then subset RNA expression data for testarm loci only
# then link RNA values with location
RNA.onChrm4<-RNA.onChrm3 #this is RNA info only, not RNA expression
RNA.onChrm4<-subset(RNA.onChrm4, Chrm.Num.Arm == TestChrmArm) #get only RNA on chrm arm
#Now subset RNA expression data with only chromosome arm of interest.
RNA.Expression.subset <- RNA.Expression.filtered %>% select(RNA.onChrm4$RNA_ID)
RNA.Expression.subset$Cell_line<- RNA.Expression.filtered$Cell_line #add back cell info
#Substep 3: get difference in RNA expression between diploid & test cells
#Now get list of RNA expression in cells disomic & test cells for test ChrmArm
RNA.Di.Cells<-merge(y= RNA.Expression.subset, x= Cells.Di.forchrm,
by.y="Cell_line", by.x="DepMap_ID",
sort = TRUE)# 368 cells, 12757 RNAs
# for each RNA, find mean gain, mean of neutral variance cells only, difference
Var.PerRNA <- data.frame(RNA_ID=as.character(),
RNA_ID.All=as.character(),
Variance.noAn=numeric(),
Variance.Mean.noAn=numeric(),
Variance.All=numeric(),
St.Dev.noAn= numeric(),
St.Dev.All= numeric(),
Chrm.Num.Arm= as.character(),
stringsAsFactors = TRUE) #this is needed to prevent errors
for (t in 8:length(RNA.Di.Cells)) {
#Get difference in exp between cell line x and mean no gain/loss cells cells, per RNA
#RNA express subset does not have 7 extra info columns, so add -7 to get same RNA_ID
Var.PerRNA<- rbind(Var.PerRNA,
data.frame(RNA_ID=colnames(RNA.Di.Cells[t]),
RNA_ID.All=colnames(RNA.Expression.subset[t-7]),
Variance.noAn=var(RNA.Di.Cells[,t], na.rm=TRUE),
Variance.Mean.noAn=var(RNA.Di.Cells[,t], na.rm=TRUE)/abs(mean(RNA.Di.Cells[,t], na.rm=TRUE)),
Variance.All=var(RNA.Expression.subset[,t-7], na.rm=TRUE),
St.Dev.noAn= sd(RNA.Di.Cells[,t], na.rm=TRUE),
St.Dev.All= sd(RNA.Expression.subset[,t-7], na.rm=TRUE),
Chrm.Num.Arm= TestChrmArm))
}
##
# Get mean RNA expression difference by chrm num/arm category
RNA.var.sd<-rbind(RNA.var.sd, Var.PerRNA)
print(paste0("finished with chrm arm ", TestChrmArm))
}
### Now plot the heatmap for chromosome RNA chrm Gain changes.
#setwd()#set working directory
length(RNA.var.sd$RNA_ID) #18 166 genes, 829 genes on X chromosome, 124 Y, 20 other/non-chrm
RNA.var.sd$Variance.Mean.noAn<-abs(RNA.var.sd$Variance.Mean.noAn)
write.csv(RNA.var.sd,
file= "RNA.Variance.SD.VarMean.csv")
#RNA.var.sd<-read.delim2("RNA.Variance.SD.VarMean.csv",
# dec=".", header = TRUE, sep=",")
ggplot(RNA.var.sd, aes(x=log2(St.Dev.noAn), y=log2(Variance.Mean.noAn)))+ #5x5 figure
geom_point()+
xlab("log2 RNA St. Dev. \n(in cells without chrm gain or loss)")+
ylab("Log2 RNA Coefficient of Variance \n(in cells without chrm gain or loss)")+
theme_classic()+
ggtitle("RNA variance and standard deviation per gene \nwithin cells without chromosome gain or loss")
###### Step 3) Correlate variance with difference per gene ####
# 3) Correlate variance with difference per gene
# for loss and gain
# get p-value and correlation coefficient
Protein.var.sd$Protein_Name<-as.factor(sub(".*[.]", "", as.character(Protein.var.sd$Protein_ID)))
Protein.var.sd$Protein_Name<-as.factor(sub("_HUMAN", "", as.character(Protein.var.sd$Protein_Name)))
### For Protein Coeff of variance vs Protein difference:
Prot.Gain.Diff.Var<-merge(x= CN.Diff.xRNA.yProt.ThreeGroups, y= Protein.var.sd,
by.x="Protein_ID", by.y="Protein_ID",
sort = TRUE) # 6 collumns, 11235 genes (all with difference score)
#Protein gain pearson correlation: low varance/SD vs difference in expression upon chrm gain/loss
Corr.Prot.Gain.Diff.CV<-cor.test(Prot.Gain.Diff.Var$Protein.Diff.Gain, log2(Prot.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr (log2CV to difference)= -0.03, p=0.00198
Corr.Prot.Loss.Diff.CV<-cor.test(Prot.Gain.Diff.Var$Protein.Diff.Loss, log2(Prot.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr(log2CV to difference)= 0.10, p-value<2E-16
###
### For RNA chromosome gain:
# CN.Diff.xRNA.yProt.ThreeGroups has 9414 unique RNA_IDs
RNA.var.sd$RNA_Name<-as.factor(sub("[..].*", "", as.character(RNA.var.sd$RNA_ID)))
RNA.var.sd$RNA_num<-as.factor(sub(".*[.]{2}", "", as.character(RNA.var.sd$RNA_ID)))
RNA.var.sd$RNA_num<-as.factor(sub("[.]", "", as.character(RNA.var.sd$RNA_num)))
RNA.Gain.Diff.Var<-merge(x= CN.Diff.xRNA.yProt.ThreeGroups, y=RNA.var.sd, #difference data for 11235 genes
by.x="RNA_Name", by.y="RNA_Name",
sort = TRUE) # 6 collumns, 9414 unique RNA names
# Some proteins have same RNA, leading to "Unique" rows with protein data, but same RNA
# then we have repeat RNA rows that mess with difference vs. variable info.
RNA.Gain.Diff.Var<-distinct(RNA.Gain.Diff.Var) #get only unique RNA data, get rid of duplicates
### RNA log2CV to RNA difference upon aneuploidy
#RNA Gain pearson correlation: low varance/SD vs difference in expression upon chrm gain/loss
Corr.RNA.CV.RNADiff.gain<-cor.test(RNA.Gain.Diff.Var$RNA.Diff.Gain, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr= -0.133, p-value< 2E-16
Corr.RNA.CV.RNADiff.loss<-cor.test(RNA.Gain.Diff.Var$RNA.Diff.Loss, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr= 0.412, p-value< 2E-16
### RNA log2CV to Protein difference upon aneuploidy
#RNA Gain pearson correlation: low varance/SD vs difference in expression upon chrm gain/loss
Corr.RNA.CV.ProteinDiff.gain<-cor.test(RNA.Gain.Diff.Var$Protein.Diff.Gain, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr= -0.000311, p-value= NS
Corr.RNA.CV.ProteinDiff.loss<-cor.test(RNA.Gain.Diff.Var$Protein.Diff.Loss, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn), method="pearson")
# Corr= 0.146, p-value< 2E-16
####
## RNA has distinct low variance group of genes. ~3.5k genes, from 11k total
RNA.LowCV<-subset(RNA.Gain.Diff.Var, log2(Variance.Mean.noAn)>=-1)
write.csv(RNA.LowCV, file= "RNA.lowCV.group.csv")
###### Step 4) Plot difference (gain & loss) vs. variance (within diploid) ####
# 4) Plot difference (gain & loss) vs. variance (within diploid)
## RNA varaince vs. RNA difference (gain)
ggplot(RNA.Gain.Diff.Var, aes(x=log2(Variance.Mean.noAn), y=(RNA.Diff.Gain)))+ #5x5 figure
#geom_density2d(color="black")+
#geom_point(data = subset(RNA.Gain.Diff.Var, RNA_Name %in% Protein.Loss.HSP$Gene_Symbol),
# aes(x = log2(Variance.Mean.noAn), y=(Difference.y), color="Red"))+
stat_density_2d(alpha=0.3, geom= "polygon", color="black",
aes() )+
scale_color_manual(values=c("a"="black", "b"=" Red")) +
scale_fill_manual(values=c("a"="black", "b"="Red")) +
xlab("log2(RNA coefficient of variance) \n (in cells without chrm gain/loss)")+
ylab("RNA difference upon chrm gain")+
theme_classic()+
#scale_color_manual(values=c("black", "red"))+
geom_smooth(method="lm", color="Red")+
coord_cartesian(xlim=c(-6,3), ylim=c(-1,1))+
ggtitle("RNA Relative Variance within cells without gene loci aneuploidy
vs RNA difference in expression upon chromosome gain:
all genes")
#3x3 for density2d plot
# plot.RNA.Gain.Log2CV.RNADiff
## RNA varaince vs. RNA difference (gain)
ggplot(RNA.Gain.Diff.Var, aes(x=log2(Variance.Mean.noAn), y=(RNA.Diff.Loss)))+ #5x5 figure
stat_density_2d(alpha=0.3, geom= "polygon", color="black",
aes() )+
scale_color_manual(values=c("a"="black", "b"=" Red")) +
scale_fill_manual(values=c("a"="black", "b"="Red")) +
xlab("log2(RNA coefficient of variance) \n (in cells without chrm gain/loss)")+
ylab("RNA difference upon chrm loss")+
theme_classic()+
geom_smooth(method="lm", color="Red")+
coord_cartesian(xlim=c(-6,3), ylim=c(-1,1))+
ggtitle("RNA Relative Variance within cells without gene loci aneuploidy
vs RNA difference in expression upon chromosome loss:
all Proteins")
#3x3 for density2d plot
# plot.RNA.Loss.Log2CV.RNADiff
## RNA varaince vs. Protein difference (gain)
ggplot(RNA.Gain.Diff.Var, aes(x=log2(Variance.Mean.noAn), y=(Protein.Diff.Loss)))+ #5x5 figure
stat_density_2d(alpha=0.3, geom= "polygon", color="black",
aes() )+
scale_color_manual(values=c("a"="black", "b"=" Red")) +
scale_fill_manual(values=c("a"="black", "b"="Red")) +
xlab("log2(RNA coefficient of variance) \n (in cells without chrm gain/loss)")+
ylab("Protein difference upon chrm loss")+
theme_classic()+
geom_smooth(method="lm", color="Red")+
coord_cartesian(xlim=c(-6,3), ylim=c(-1,1))+
ggtitle("RNA Relative Variance within cells without gene loci aneuploidy
vs Protein difference in expression upon chromosome loss:
all Proteins")
#3x3
# plot.RNA.Loss.Log2CV.ProtDiff
RNA.Gain.Diff.Var$LowRNA.CV<-log2(RNA.Gain.Diff.Var$Variance.Mean.noAn)< -1
RNA.Gain.Diff.Var$LowRNA.CV<- gsub("TRUE","Low", RNA.Gain.Diff.Var$LowRNA.CV)
RNA.Gain.Diff.Var$LowRNA.CV<- gsub("FALSE","High", RNA.Gain.Diff.Var$LowRNA.CV)
RNA.Gain.Diff.Var$LowRNA.CV<-factor(RNA.Gain.Diff.Var$LowRNA.CV, levels=c("Low", "High"))
# test to see if categorical (High RNA var vs low RNA var): affects RNA/Protein differnce upon aneuploidy
lowRNAvar<-subset(RNA.Gain.Diff.Var, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn)< -1)
highRNAvar<-subset(RNA.Gain.Diff.Var, log2(RNA.Gain.Diff.Var$Variance.Mean.noAn)> -1)
t.test(lowRNAvar$Protein.Diff.Loss, highRNAvar$Protein.Diff.Loss) #p<2E-16
t.test(lowRNAvar$Protein.Diff.Gain, highRNAvar$Protein.Diff.Gain) #p=0.028
t.test(lowRNAvar$RNA.Diff.Loss, highRNAvar$RNA.Diff.Loss) #p<2E-16
t.test(lowRNAvar$RNA.Diff.Gain, highRNAvar$RNA.Diff.Gain) #p<2E-16
## RNA varaince vs. Protein difference in expression
ggplot(RNA.Gain.Diff.Var, aes(x=LowRNA.CV, y=Protein.Diff.Loss))+
geom_boxplot(outlier.shape = NA)+
geom_hline(yintercept = 0)+
xlab("RNA neutral ploidy variance")+
ylab("RNA difference upon chrm loss")+
theme_classic()+
coord_cartesian( ylim=c(-2,2))+
ggtitle("RNA neutral ploidy variance
vs RNA difference upon chromosome loss")
#4x4
# plot.boxplot.RNALog2CV.RNADiff.Loss
###### Step 7) Save files #####
##save as csv files. to run through g: profiler.
write.csv(RNA.lowVar.withDiffData, file= "RNA.lowVar.Diff.chrmgainandloss.csv")
RNA.lowVar<-subset(RNA.var.sd, RNA.var.sd$Variance.Mean.noAn <= 0.1)
RNA.var.sd #18 052 genes, 829 genes on X chromosome, 124 Y, 20 other, 28 missing?
#this is just RNA variance info, not difference upon aneuploidy info.
Protein.var.sd #12 149 genes, 502 X, 42 Y, 15 other, 12 missing?
Prot.lowVar.withDiffData<- merge(Prot.Gain.lowVar, Prot.Loss.lowVar,
by.x="Protein_ID", by.y= "Protein_ID")#512 Proteins
RNA.lowVar.withDiffData<- merge(RNA.Gain.lowVar, RNA.Loss.lowVar,
by.x="RNA_Name", by.y="RNA_Name") #80 RNAs
# Note: var-diff data is based on gene with both RNA and Protein data,
# and only genes on autosomes (aneuploidy data) (so no X, Y, mitochondria, or centromere genes)
# so it is a smaller list than all RNA or all Protein data.
# also 146 RNA_ID genes did not have corresponding RNA IDs in difference data.