diff --git a/README.md b/README.md index 5fd5680..c89853c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ `scripts/` contains the scripts that are used for the calculation of genomic profile scores -The main workflow script is `cv_pipeline.sh` which will run each of the required steps. +The main workflow script is `cv_pipeline.sh` which will run each of the required steps. (This pipeline is used to estimate trait heritabilities for each Polynesian Population of interest (i.e not for the CV-trait profile score stages)). It starts with a plink formatted dataset containing all populations of interest, subsets out each population, and does cross-validation for each trait of interest in each population using both GCTA and LDAK. - `scripts/create_model_reports.R` will create reports for each pop/trait combo as listed in `data/pop_trait_models.csv` and apply a template RMarkdown to pull in and summarise the results from LDAK and GCTA. The template document is `scripts/model_selection_doc.Rmd`. diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh new file mode 100644 index 0000000..daeaa9c --- /dev/null +++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh @@ -0,0 +1,115 @@ +echo NPH +POP=nphpca +RESULTS=${POP}_results +TRAIT=gout +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +TRAIT=t2d +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.078 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + + +## East +echo East +POP=eastpca +RESULTS=${POP}_results +TRAIT=gout +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.043 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +TRAIT=t2d +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.084 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + +## West +echo WEST +POP=westpca +RESULTS=${POP}_results +TRAIT=gout +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.051 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +TRAIT=t2d +for covar in ls ${RESULTS}/*.covar +do + + MODEL=$(basename ${covar} .covar) + PREV=0.146 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + + +## Euro + +echo EURO +POP=europca +RESULTS=${POP}_results +TRAIT=gout +mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.024 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 + +done > ${POP}_${TRAIT}.log + +TRAIT=t2d +for covar in ls ${RESULTS}/*.covar +do + MODEL=$(basename ${covar} .covar) + PREV=0.049 + mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT} + software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar} + software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4 +done > ${POP}_${TRAIT}.log + + +#Code for transferring directories to local computer.. +# scp -r benrangihuna@biochemcompute1.uod.otago.ac.nz:/Volumes/scratch/merrimanlab/ben/genomic_predictions/{nph,euro,west,east}pca_results ~/Documents/genomic_prediction_results/ + + + + diff --git a/scripts/cv_pipeline.sh b/scripts/cv_pipeline.sh index 253936f..d7b82ef 100644 --- a/scripts/cv_pipeline.sh +++ b/scripts/cv_pipeline.sh @@ -19,7 +19,7 @@ parallel 'bash scripts/generate_pop_pca.sh {}' ::: nph east west euro # create the cv splits and residuals for all pops and trait combos # **** traits must not contain underscores in their names **** -CV=5 # number fo folds for cross validation +CV=2 # number fo folds for cross validation for POP in nph east west euro do diff --git a/scripts/make_pheno_and_covar_files.R b/scripts/make_pheno_and_covar_files.R new file mode 100644 index 0000000..8768f03 --- /dev/null +++ b/scripts/make_pheno_and_covar_files.R @@ -0,0 +1,72 @@ +library(tidyverse) +library(here) + +#CREBRF MASTER FILE (FROM TANYA) +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") + + + +nph <- read_delim(here("nphpca_results/nphpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +east <- read_delim(here("eastpca_results/eastpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +west <- read_delim(here("westpca_results/westpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') +euro <- read_delim(here("europca_results/europca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ') + +# pheno files + +all_IIDS <- c(nph$IID, east$IID, west$IID, euro$IID) + +tanya <- tanya %>% filter(SUBJECT %in% all_IIDS) + +tanya %>% select(IID = SUBJECT, GOUT) %>% + mutate(GOUT_recode = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + FID = IID) %>% select(FID, IID, GOUT = GOUT_recode) %>% + write_tsv(here("data/tanya_gout.pheno")) + + +tanya %>% select(IID = SUBJECT, TYPE2D) %>% + mutate(TYPE2D_recode = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + ), + FID = IID) %>% select(FID, IID, TYPE2D = TYPE2D_recode) %>% write_tsv(here("data/tanya_t2d.pheno")) + +# covar files +nph_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(nph) + +east_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(east) + +west_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(west) + +euro_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>% + select(FID,IID, AGECOL, SEX) %>% + right_join(euro) + +cols <- names(nph_covar)[-1:-2] +for(n in cols){ + p <- nph_covar %>% select(FID, IID:!!n) + fn <- paste(names(p)[-1:-2], collapse = "_") + print(fn) + write_tsv(p, here("nphpca_results/",paste0("nphpca_", fn,".covar"))) + + east_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("eastpca_results/",paste0("eastpca_", fn,".covar"))) + + west_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("westpca_results/",paste0("westpca_", fn,".covar"))) + + euro_covar %>% select(FID, IID:!!n) %>% + write_tsv( here("europca_results/",paste0("europca_", fn,".covar"))) + +} + + diff --git a/scripts/make_pop_summary_stats.R b/scripts/make_pop_summary_stats.R new file mode 100644 index 0000000..0f7073a --- /dev/null +++ b/scripts/make_pop_summary_stats.R @@ -0,0 +1,49 @@ +## Remove Pukapuka from West Polynesian Pops + + +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") %>% + mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + T2D = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + )) + +## west_pca_tanya.keep is the same as the current westpca.keep file (same PATIENT IDs) +keep_files <- list.files("/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/", pattern = "pca.keep", full.names = TRUE) + +pops <- map_dfr(keep_files, read_tsv, col_names = FALSE) +tanya_filtered <- tanya %>% filter(SUBJECT %in% pops$X1) + +tanya_no_puka <- tanya_filtered %>% mutate(Westkeep= ifelse(ETH_SPECIFIC == "Pukapukan",0,1)) + +tanya_no_puka2 <- filter(tanya_no_puka, Westkeep == 1) + +height <- tanya_no_puka2 %>% drop_na(HEIGHT) +BMI <- tanya_no_puka2 %>% drop_na(BMI) +HDL <- tanya_no_puka2 %>% drop_na(HDL) +T2D<- tanya_no_puka2 %>% drop_na(T2D) +GOUT <- tanya_no_puka2 %>% drop_na(GOUT) + +library(dplyr) + +# There are 2 Mixed Polys, these are NPH participants (NPH group wasnt run) +data.table::setDT(height)[,list(Mean=mean(HEIGHT), Max=max(HEIGHT), Min=min(HEIGHT), Mean=as.numeric(mean(HEIGHT)), Std=sd(HEIGHT)), by=ANALYSISGROUP_EASTWEST] +data.table::setDT(BMI)[,list(Mean=mean(BMI), Max=max(BMI), Min=min(BMI), Mean=as.numeric(mean(BMI)), Std=sd(BMI)), by=ANALYSISGROUP_EASTWEST] +data.table::setDT(HDL)[,list(Mean=mean(HDL), Max=max(HDL), Min=min(HDL), Median=as.numeric(median(HDL)), Std=sd(HDL)), by=ANALYSISGROUP_EASTWEST] + +table(height$ANALYSISGROUP_EASTWEST) +table(BMI$ANALYSISGROUP_EASTWEST) +table(HDL$ANALYSISGROUP_EASTWEST) +table(T2D$ANALYSISGROUP_EASTWEST) +table(height$ANALYSISGROUP_EASTWEST) +table(height$ANALYSISGROUP_EASTWEST) +table(tanya$ANALYSISGROUP_EASTWEST) + +# Gout and T2D tables +tanya_no_puka2 %>% group_by(T2D, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) +tanya_no_puka2 %>% group_by(GOUT, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) +tanya_no_puka2 %>% group_by(SEX, ANALYSISGROUP_EASTWEST) %>% tally(sort = F) diff --git a/scripts/make_sex_specific_pop_files.R b/scripts/make_sex_specific_pop_files.R new file mode 100644 index 0000000..ee21a9a --- /dev/null +++ b/scripts/make_sex_specific_pop_files.R @@ -0,0 +1,60 @@ +# Make sex specific Euro and West Poly keepfiles.. + +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>% + filter(SUBJECT != "Blank") %>% + mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1, + GOUT %in% c("ACR Gout","GP Gout") ~ 2, + TRUE ~ NA_real_ + ), + T2D = case_when(TYPE2D == "No" ~ 1, + TYPE2D == "Yes" ~ 2, + TRUE ~ NA_real_ + )) + + +## Read in Euro and West (no Puka) keep files +westpca <- read.delim(file = "data/westnopukapca.keep", sep = "\t", header = F) +euro <- read.delim(file = "data/europca.keep", sep = "\t", header = F) + + + +## List of Sex specific IDs +tanya_men <- filter(tanya, SEX == 1) %>% select(PATIENT, ANALYSISGROUP_EASTWEST) +tanya_women <- filter(tanya, SEX == 2) %>% select(PATIENT, ANALYSISGROUP_EASTWEST) + +## Renaming column so that "euro" or "westpca" have matching columns. +names(tanya_men)[1] <- 'V1' +names(tanya_women)[1] <- 'V1' + + + +euro_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "European") %>% select(V1) +euro_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "European") %>% select(V1) + +west_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1) +west_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1) + + +## Make sex specific population files +euromale <- merge(euro,euro_men,all = F) +eurofemale <- merge(euro,euro_women, all = F) + +westmale <- merge(westpca,west_men, all = F) +westfemale <- merge(westpca, west_women, all = F) +sum(is.na(westfemale$V1)) +sum(is.na(westfemale$V2)) +sum(is.na(westmale$V1)) +sum(is.na(westmale$V2)) + +ifelse(westfemale$V1==westfemale$V2,"Yes","No") + +## make keep files. +write_delim(euromale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/euromalepca.keep", col_names = F, delim = "\t") + +write_delim(eurofemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/eurofemalepca.keep", col_names = F, delim = "\t") + +write_delim(westmale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westmalepca.keep", col_names = F, delim = "\t") + +write_delim(westfemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westfemalepca.keep", col_names = F, delim = "\t") + + diff --git a/scripts/model_residuals_whole_data.R b/scripts/model_residuals_whole_data.R deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R new file mode 100644 index 0000000..ba19326 --- /dev/null +++ b/scripts/model_residuals_whole_pop.R @@ -0,0 +1,143 @@ +if(!require(here)){ + install.packages("here", repos = "https://cloud.r-project.org") + library("here") +} +if(!require(fs)){ + install.packages("fs", repos = "https://cloud.r-project.org") + library(fs) +} +if(!require(tidyverse)){ + install.packages("tidyverse", repos = "https://cloud.r-project.org") + library(tidyverse) +} +if(!require(optparse)){ + install.packages("optparse", repos = "https://cloud.r-project.org") + library("optparse") +} + + +option_list = list( + make_option(c("--trait"), type="character", default=NULL, + help="Name of trait column that matches exactly from the phenotype file", metavar="character"), + make_option(c("-r", "--regression"), type="character", default="linear", + help="linear|logistic", metavar="character"), + make_option(c("-p","--pop"), type = "character", default=NULL, + help="Name of the population (no special characters except underscore).", metavar="character"), + make_option(c("--out_dir"), type = "character", default = "results", + help = "Name of the directory to store the results (no trailing slash needed).") + ); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + + +# name of trait column +#trait <- "HEIGHT" +if (is.null(opt$trait)){ + print_help(opt_parser) + stop("Trait argument must be supplied.", call.=FALSE) +} else { + trait <- opt$trait +} + + +# define model type: linear or logistic regression +#model_type <- "linear" +if(!(opt$regression == "linear" | opt$regression == "logistic")){ + print_help(opt_parser) + stop("Regression argument must be 'linear' or 'logistic'.", call.=FALSE) +} else { + model_type <- opt$regression +} + +# population name +if(is.null(opt$pop)){ + print_help(opt_parser) + stop("Population (pop) argument must be supplied.", call.=FALSE) +} else { + pop <- opt$pop +} + + +## Used for testing inside of an interactive session +#pop <- 'nph' +#model_type <- "linear" +#trait <- "HEIGHT" +#opt <-list(out_dir = "nph_results/") + +all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) +newpca <- read_delim(file = here(paste0(pop,"_results"),paste0(pop,"_pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) + +if (!trait %in% names(all_dat)){ + stop("Trait argument supplied does not match a column in the phenotypes.", call.=FALSE) +} + +### add the trait information on to the samples in the PCA +new_dat <- newpca %>% left_join(., all_dat, by = "SUBJECT") + + + +# Define list of variables for the models #### +preds <- c("AGECOL", "SEX", paste0("PC",1:10)) + +# subset the data to only include the variables needed to run the models and make it so that only individuals with complete observations are included +new_dat <- new_dat %>% select(SUBJECT, !!trait, all_of(preds)) %>% drop_na() + +# generate models stepwise adding variables +models <- list() +for(pred_idx in seq_along(preds)){ + if(pred_idx ==1){ + # create the base model + models <- paste(trait,"~",preds[[pred_idx]]) + } else{ + # add the next covariate to the base model + models[[pred_idx]] <- paste(models[[pred_idx-1]], preds[[pred_idx]], sep = " + ") + } +} + +# function to use for a given formula run the model as either linear or logistic regression +# runs the supplied model on the supplied dataset and returns the model results as a list(model, formula) +run_model <- function(model_formula, dat){ + model_results <- list() + if(model_type == "linear"){ + model_results[["model"]] <- lm(model_formula, data = dat, na.action = na.exclude) + } else if(model_type == "logistic"){ + model_results[["model"]] <- glm(model_formula, family = binomial(link='logit'), data = dat, na.action = na.exclude) + } + + model_results[["formula"]] <- model_formula + return(model_results) +} + +# test example +#run_model(model_formula = models[[1]], new_dat) + + + +# run the set of models on a the data +model_results <- purrr::map(models, run_model, new_dat) %>% set_names(., nm = paste0("model",seq_along(models))) +names(model_results) <- paste0("model",seq_along(models)) + + + +# put the residuals from all models into a dataframe for each cv +model_residuals <- map(model_results, list("model", "residuals")) +model_remove <- map(model_results, list("model","na.action")) + + +# write out residuals, columns: SUBJECT TRAIT model1 ... modelx +new_dat %>% + select(SUBJECT, !!trait) %>% # pull out ids and trait column + #slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals (step no longer needed due to step before model creation that only lets complete obs go into the model) + bind_cols(model_residuals) %>% + write_delim(file = here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")), + delim = " ", + col_names = FALSE) + + +message(paste("residuals file:", here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")))) + + + +saveRDS(list(model_residuals, model_remove, trait, model_type, models, preds), file = here(opt$out_dir,paste0(pop,"_",trait,".RDS"))) + diff --git a/scripts/pca_select_pops.R b/scripts/pca_select_pops.R new file mode 100644 index 0000000..2ba5f73 --- /dev/null +++ b/scripts/pca_select_pops.R @@ -0,0 +1,81 @@ +## 12 July 2021. + +## PCA based Pop selection. +library(tidyverse) +library(here) +library(patchwork) +#CREBRF MASTER FILE (FROM TANYA) +tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) + +orig <- read_csv(here("data/curated_data.csv")) + +east <- read_tsv(here("data/east597btanya.keep"), col_names = FALSE) %>% mutate(pop = "east") + +west <- read_tsv(here("data/west597btanya.keep"), col_names = FALSE) %>% mutate(pop = "west") + +euro <- read_tsv(here("data/euro597btanya.keep"), col_names = FALSE) %>% mutate(pop = "euro") + +nph <- read_tsv(here("data/nphtanya.keep"), col_names = FALSE) %>% mutate(pop ="nph") + +pops <- bind_rows(east, west, euro, nph) + +pops <- pops %>% left_join(orig %>% + select(SUBJECT, starts_with("PC")), + by = c("X2" = "SUBJECT")) %>% + drop_na() + + +# make plots using the original global coreExome PCs +theme_set(theme_bw()) +p1 <- pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + geom_point() + + ggtitle("Original") + + geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") + + geom_vline(xintercept = 0, colour = 'red', linetype = "dashed") + +## 19 July 2021 Re-run NPH again +new_nph <- pops %>% filter(pop == "nph" & PCA2 > 0 & PCA3 > 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/nphpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "nph") + +new_east <- pops %>% filter(pop == "east" & PCA2 > 0 & PCA3 > 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/eastpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop ="east") + +new_west <- pops %>% filter(pop == "west" & PCA2 > 0 & PCA3 < 0.0025) %>% + select(X1,X2) %>% write_delim(file = here("data/westpca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "west") + +new_euro <- pops %>% filter(pop == "euro" & PCA2 < 0) %>% + select(X1,X2) %>% write_delim(file = here("data/europca.keep"), + delim = "\t", + col_names = FALSE) %>% mutate(pop = "euro") + + + +new_pops <- bind_rows(new_east, new_west, new_euro, new_nph) + +new_pops <- new_pops %>% left_join(orig %>% + select(SUBJECT, starts_with("PC")), + by = c("X2" = "SUBJECT")) %>% + drop_na() + + +# make plots using the original global coreExome PCs + +p2 <- new_pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + + geom_point() + + ggtitle("After PCA Thresholds") + + geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") + + geom_vline(xintercept = 0, colour = 'red', linetype = "dashed") + +p1 + p2 + patchwork::plot_layout(guides = "collect") + + +# make covar files +new_pops %>% left_join(tanya %>% select(X1 = SUBJECT, AGECOL, SEX), by = "X1") %>% + relocate(AGECOL, SEX, .after = X2) %>% rename(FID = X1, IID = X2) %>% + select(-pop) %>% write_tsv("data/tanya_covar.csv") + diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh index f6ac54a..e847510 100644 --- a/scripts/run_gcta_whole_pop.sh +++ b/scripts/run_gcta_whole_pop.sh @@ -6,15 +6,16 @@ BFILE=$(basename $1 .fam) POP=$(echo $BFILE | cut -d'_' -f1) - +RESULTS=${POP}_results TRAIT=$(echo $BFILE | cut -d'_' -f2) MODEL=$2 -PHENOCOL=$(echo $MODEL + 4 | bc) - +# pulls out the from the start of the residuals - first 7 cols are: FID IID PID MID SEX AFF TRUEPHENOVALUE +PHENOCOL=$(echo $MODEL + 7 | bc) +cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_gcta @@ -45,7 +46,7 @@ echo "****** $REGRESSION ******" # $(seq 1 to how many trait/residual columns -software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 +software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_gcta --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 @@ -55,10 +56,10 @@ software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --p #software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE -software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 +software/gcta64 --bfile tmp/$BFILE --blup-snp ${RESULTS}/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4 -cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${DIR}/GCTA/${TRAIT}_${MODEL}.hsq.tsv +cat ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq.tsv diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh index 78b3455..96e881f 100644 --- a/scripts/run_ldak_whole_pop.sh +++ b/scripts/run_ldak_whole_pop.sh @@ -6,29 +6,29 @@ BFILE=$(basename $1 .fam) POP=$(echo $BFILE | cut -d'_' -f1) - +RESULTS=${POP}_results TRAIT=$(echo $BFILE | cut -d'_' -f2) MODEL=$2 -PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID - +PHENOCOL=$(echo $MODEL + 7 | bc) # ignore the first 7 columns: FID IID MID PID SEX AFF TRUEPHENOVALUE +cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_ldak ####################### LDAK ### reml -software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships +software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_ldak --grm ${RESULTS}/${POP}_ldak_kinships ### blups -software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${CV}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO +software/LDAK/ldak5.linux --calc-blups ${RESULTS}/LDAK/${TRAIT}_${MODEL} --remlfile ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml --grm ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO @@ -38,8 +38,8 @@ software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --r ## make better results files -grep "^Her\|^Com" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2 +grep "^Her\|^Com" ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml > ${RESULTS}/LDAK/${TRAIT}_${MODEL}.h2 -grep "^LRT_P" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p +grep "^LRT_P" ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml > ${RESULTS}/LDAK/${TRAIT}_${MODEL}.p diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh new file mode 100644 index 0000000..8ef0c69 --- /dev/null +++ b/scripts/whole_pop_pipeline.sh @@ -0,0 +1,122 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} + +## + +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + +software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted + + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata + + +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers + +######################## GCTA +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile ${ORIG_DATA} --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} +done < data/pop_trait_models.csv + +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD + + + +echo "FINISH" + +date + + diff --git a/scripts/whole_pop_pipeline_CETP_CREBRF.sh b/scripts/whole_pop_pipeline_CETP_CREBRF.sh new file mode 100644 index 0000000..b04ac7c --- /dev/null +++ b/scripts/whole_pop_pipeline_CETP_CREBRF.sh @@ -0,0 +1,127 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} + +## + +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + +software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted + +software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --maf 0.01 --make-bed --out tmp/${POP}_sorteddata + + +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers + +#(dont' need to do) :make sure the snps of interest don't get filtered out later on +#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in +#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in + +######################## GCTA +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} +done < data/pop_trait_models.csv + +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD + + + +echo "FINISH" + +date + + diff --git a/scripts/whole_pop_pipeline_no_polysnps.sh b/scripts/whole_pop_pipeline_no_polysnps.sh new file mode 100644 index 0000000..6cc87a7 --- /dev/null +++ b/scripts/whole_pop_pipeline_no_polysnps.sh @@ -0,0 +1,128 @@ +### whole_pop_pipeline.sh + +## Run: + + +## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} + +## + +# remove duplicate markers and non-snps +# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered + +ORIG_DATA=$1 +POP=$2 +RESULTS=${POP}_results/ + + + + +echo "START" + +date + + +#### Initial setup ##### +mkdir -p tmp ${RESULTS}/ + + +software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted + +# No need to add in Poly Snps right now. +# software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex + +## sort and keep our population +software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --make-bed --out tmp/${POP}_sorteddata + + +# create list of population specific independent SNPs from data +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers + +#(dont' need to do) :make sure the snps of interest don't get filtered out later on +#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in +#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in + +######################## GCTA +#grm + +# only run this line once per population to save time +software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 + + + +######################## PCAs + +### calculate principal components for the population using the independent markers +software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex + + + + + +####################### LDAK + +echo "Start LDAK $(date)" +### calculate the weights and kinships for LDAK prior to GRM +software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata + + +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 + +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", +while read line +do + TRAIT=$(echo $line | cut -d',' -f1) + REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} + ##### Generate regression residuals for each model ##### + # makes tmp/${POP}_${TRAIT}_residuals.txt + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/ + + ##### Create phone files with the residuals + ### prune and sort the data + # prune to only those with residuals + # residual are population specific with PCAs included in the model outside this script. + # combine residuals to fam file + # this step need to be repeated for each of the groups at bayesR section + # columns 8-12 are the FIVE residuals + # height, egfr, serumurate, diabetes and gout in that order + # need to join based on number fo models 2.2 - 2.8 was for 7 models + software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT} + # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) + join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt) > ${RESULTS}/${POP}_${TRAIT}.pheno + + # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) + pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + + + #### GCTA REML for each model of current trait + # run the following with 16 parallel jobs + parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} + + #### LDAK for each model of current trait + parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' ::: tmp/${POP}_${TRAIT}.fam ::: ${pheno_cols} +done < data/pop_trait_models.csv + +WD=$(pwd) +cd ${RESULTS}/LDAK +# combine all LDAK h2 for pop into a single file +grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt +# return to the orignal directory +cd $WD + + + +echo "FINISH" + +date + + diff --git a/scripts/whole_pop_model_residuals.R b/tanya_data/scripts/model_residuals_whole_pop.R similarity index 90% rename from scripts/whole_pop_model_residuals.R rename to tanya_data/scripts/model_residuals_whole_pop.R index c93b043..d186d45 100644 --- a/scripts/whole_pop_model_residuals.R +++ b/tanya_data/scripts/model_residuals_whole_pop.R @@ -58,12 +58,14 @@ if(is.null(opt$pop)){ pop <- opt$pop } -pop <- 'nph' -model_type <- "linear" -trait <- "HEIGHT" -opt <-list(out_dir = "nph_results/") -all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) +## Used for testing inside of an interactive session +#pop <- 'nph' +#model_type <- "linear" +#trait <- "HEIGHT" +#opt <-list(out_dir = "nph_results/") + +all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE)# %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI)) newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = "")) if (!trait %in% names(all_dat)){ @@ -125,12 +127,12 @@ new_dat %>% select(SUBJECT, !!trait) %>% # pull out ids and trait column slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals bind_cols(model_residuals) %>% - write_delim(path = here("tmp",paste0(pop,"_",trait,".residuals.txt")), + write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")), delim = " ", col_names = FALSE) - +message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) diff --git a/tanya_data/scripts/run_gcta_whole_pop.sh b/tanya_data/scripts/run_gcta_whole_pop.sh new file mode 100644 index 0000000..c40fbc6 --- /dev/null +++ b/tanya_data/scripts/run_gcta_whole_pop.sh @@ -0,0 +1,64 @@ +### run_gcta_whole_pop.sh + +## run: bash run_gcta_whole_pop.sh BFILE MODEL + +BFILE=$(basename $1 .fam) + + +POP=$(echo $BFILE | cut -d'_' -f1) + +TRAIT=$(echo $BFILE | cut -d'_' -f2) + + + +MODEL=$2 + +PHENOCOL=$(echo $MODEL + 4 | bc) + + + + + + + + + +echo "****** $REGRESSION ******" + + + + +######################## GCTA + +#grm only run this line once per population to save time + +# transfered to the phenoprep script + + + + +# software/gcta64 --bfile tmp/ --autosome --make-grm --out $DIR/GCTA/grm + + + +#reml + +# $(seq 1 to how many trait/residual columns + +software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 + + + + + +# BLUP solutions for the SNP effects + +#software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE + +software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4 + + + +cat ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv + + diff --git a/tanya_data/scripts/run_ldak_whole_pop.sh b/tanya_data/scripts/run_ldak_whole_pop.sh new file mode 100644 index 0000000..77d0442 --- /dev/null +++ b/tanya_data/scripts/run_ldak_whole_pop.sh @@ -0,0 +1,45 @@ +### run_ldak_whole_pop.sh + + + +BFILE=$(basename $1 .fam) + + +POP=$(echo $BFILE | cut -d'_' -f1) + +TRAIT=$(echo $BFILE | cut -d'_' -f2) + + + +MODEL=$2 + +PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID + + + +####################### LDAK + +### reml + +software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships + + + + +### blups + +software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO + + + + + + + +## make better results files + +grep "^Her\|^Com" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2 + +grep "^LRT_P" ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p + + diff --git a/scripts/whole_pop_pipline.sh b/tanya_data/scripts/whole_pop_pipeline.sh similarity index 61% rename from scripts/whole_pop_pipline.sh rename to tanya_data/scripts/whole_pop_pipeline.sh index e7826d7..3db87a8 100644 --- a/scripts/whole_pop_pipline.sh +++ b/tanya_data/scripts/whole_pop_pipeline.sh @@ -23,74 +23,62 @@ date #### Initial setup ##### -mkdir -p tmp ${RESULTS}/{GCTA,LDAK} +mkdir -p tmp ${RESULTS}/ -software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted + software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted # create lise of independent SNPs from data -software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers -## sort and keep our population - - -mkdir -p tmp + software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers +## sort and keep our population software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata - - - - - ######################## GCTA #grm # only run this line once per population to save time - software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4 - - - ######################## PCAs ### calculate principal components for the population using the independent markers - -software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in + software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in ####################### LDAK +echo "Start LDAK $(date)" ### calculate the weights and kinships for LDAK prior to GRM - -software/LDAK/ldak5.linux --cut-weights $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata - - - - -software/LDAK/ldak5.linux --calc-weights-all $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata - - + software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata -software/LDAK/ldak5.linux --calc-kins-direct $DIR/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights $DIR/${POP}_ldak_sections/weights.short --power -0.25 + software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata +software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25 +echo "Start traits $(date)" +# example line to use for testing one iteration +#line="HEIGHT,linear", while read line do TRAIT=$(echo $line | cut -d',' -f1) REGRESSION_TYPE=$(echo $line | cut -d',' -f2) + echo "Start ${TRAIT} $(date)" + mkdir -p ${RESULTS}/{GCTA,LDAK} ##### Generate regression residuals for each model ##### # makes tmp/${POP}_${TRAIT}_residuals.txt - bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + + #bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} + Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT} ##### Create phone files with the residuals ### prune and sort the data @@ -101,12 +89,12 @@ do # columns 8-12 are the FIVE residuals # height, egfr, serumurate, diabetes and gout in that order # need to join based on number fo models 2.2 - 2.8 was for 7 models - software/plink1.9b6.10 --bfile data/data --keep tmp/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} + software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT} # this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols) - join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}.fam) <(sort -k1 tmp/${POP}_${TRAIT}_residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno + join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- ) > tmp/${POP}_${TRAIT}.pheno # calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models) - pheno_cols=$( head -1 tmp/${POP}_${TRAIT}_residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) + pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u) #### GCTA REML for each model of current trait