diff --git a/R/convert_rds_to_anndata.R b/R/convert_rds_to_anndata.R new file mode 100644 index 0000000..61aeda5 --- /dev/null +++ b/R/convert_rds_to_anndata.R @@ -0,0 +1,29 @@ +## Export data to be used as input for analysis with python +## packages such as scanpy + +# Libraries ---- + +stopifnot( + require(optparse), + require(Seurat), + require(SeuratDisk), + require(tenxutils) +) + +# Options ---- + +option_list <- list( + make_option(c("--seuratobject"), default="begin.Robj", + help="A seurat object after dimensionality reduction") +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +cat("Running with options:\n") +print(opt) + +output_file = gsub(".h5seurat", ".h5ad", opt$seuratobject) + +Convert(opt$seuratobject, dest = output_file, overwrite = TRUE, verbose = TRUE) + +print("Completed") diff --git a/R/export_for_python.R b/R/export_for_python.R index c0b5d1e..e7986a1 100644 --- a/R/export_for_python.R +++ b/R/export_for_python.R @@ -53,10 +53,10 @@ exportMetaData <- function(seurat_object, outdir=NULL) { quote=FALSE, sep="\t", row.names = FALSE, col.names = TRUE) } - # Read RDS seurat object -message("readRDS") +message(sprintf("readRDS: %s", opt$seuratobject)) s <- readRDS(opt$seuratobject) + message("export_for_python running with default assay: ", DefaultAssay(s)) # Write out the cell and feature names diff --git a/R/plot_group_numbers.R b/R/plot_group_numbers.R index a410cc1..d406aba 100644 --- a/R/plot_group_numbers.R +++ b/R/plot_group_numbers.R @@ -179,8 +179,8 @@ if(opt$stat %in% c("total_UMI", "ngenes")) { require(Seurat) # add the statistic from the seurat object - s <- readRDS(opt$seuratobject) - + s <- loadSeurat(path=opt$seuratobject) + ## set the default assay message("Setting default assay to: ", opt$seuratassay) DefaultAssay(s) <- opt$seuratassay diff --git a/R/plot_rdims_cluster_genes.R b/R/plot_rdims_cluster_genes.R index cbaddfc..74f1349 100644 --- a/R/plot_rdims_cluster_genes.R +++ b/R/plot_rdims_cluster_genes.R @@ -117,7 
+117,7 @@ print(opt) ## read in the assay data #data <- read.table(opt$assaydata, sep="\t", header=T, as.is=T, check.names=F) message("reading in assay data") -data <- readRDS(opt$assaydata) +data <- loadSeurat(opt$assaydata) message("reading in the coordinates") plot_data <- read.table(opt$table,sep="\t",header=TRUE) diff --git a/R/plot_rdims_gene.R b/R/plot_rdims_gene.R index fbb9979..c3c4413 100644 --- a/R/plot_rdims_gene.R +++ b/R/plot_rdims_gene.R @@ -119,9 +119,7 @@ if(opt$pointpch != ".") { opt$pointpch <- as.numeric(opt$pointpch) } cat("Running with options:\n") print(opt) - -## read in the raw count matrix -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## set the default assay message("Setting default assay to: ", opt$seuratassay) diff --git a/R/plot_violins.R b/R/plot_violins.R index 202e9ab..46f9e66 100644 --- a/R/plot_violins.R +++ b/R/plot_violins.R @@ -59,7 +59,8 @@ genes <- read.table(opt$genetable, ## read in the raw count matrix -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + Idents(s) <- readRDS(opt$clusterids) ## set the default assay diff --git a/R/scanpy_post_process_clusters.R b/R/scanpy_post_process_clusters.R index 3142f06..a3dbc7e 100644 --- a/R/scanpy_post_process_clusters.R +++ b/R/scanpy_post_process_clusters.R @@ -34,8 +34,7 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -message(sprintf("readRDS: %s", opt$seuratobject)) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) message("seurat_cluster.R running with default assay: ", DefaultAssay(s)) @@ -71,16 +70,16 @@ names(cluster_ids) <- clusters$barcode unique_cluster_ids <- unique(cluster_ids) print(unique_cluster_ids) -message(sprintf("saving a unique list ofe cluster ids")) +print(sprintf("saving a unique list of cluster ids")) write.table(unique_cluster_ids, file=file.path(opt$outdir,"cluster_ids.tsv"), quote=FALSE, col.names = FALSE, row.names = FALSE) 
cluster_colors <- gg_color_hue(length(unique_cluster_ids)) -message(sprintf("saving the cluster colors (ggplot)")) +print(sprintf("saving the cluster colors (ggplot)")) write.table(cluster_colors, file=file.path(opt$outdir,"cluster_colors.tsv"), quote=FALSE, col.names = FALSE, row.names = FALSE) -message(sprintf("saveRDS")) +print(sprintf("saveRDS of cluster ids")) saveRDS(cluster_ids, file=file.path(opt$outdir,"cluster_ids.rds")) cluster_assignments <- data.frame(barcode=as.character(names(cluster_ids)), @@ -91,6 +90,6 @@ write.table(cluster_assignments, sep="\t", col.names=T, row.names=F, quote=F) -message("seurat_cluster.R final default assay: ", DefaultAssay(s)) +print(sprintf("seurat_cluster.R final default assay: %s", DefaultAssay(s))) -message("Completed") +print("Completed") diff --git a/R/seurat_FindMarkers.R b/R/seurat_FindMarkers.R index 1a2a94a..7d65edc 100644 --- a/R/seurat_FindMarkers.R +++ b/R/seurat_FindMarkers.R @@ -78,27 +78,43 @@ plan("multiprocess", workers = opt$numcores) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) cluster_ids <- readRDS(opt$clusterids) +if (class(s@misc) == "list"){ + cat("Convert the s@misc to dataframe") + s@misc <- as.data.frame(s@misc, stringsAsFactors=FALSE) +} + have_gene_ids <- "gene_id" %in% colnames(s@misc) if(!have_gene_ids) { - cat("Missing ensembl gene_id column in @misc, reading in annotation.\n") - + cat("Missing ensembl gene_id column in @misc, reading in annotation from other source.\n") + + if(endsWith(opt$seuratobject, ".h5seurat")){ + message("Using s@assays$RNA@meta.features in Seurat object for gene annotation. 
This slot contains unique gene names and is produced during conversion from scanpy anndata object.") + ann <- s@assays$RNA@meta.features + ann$seurat_id = rownames(ann) + ann <- unique(ann[,c("gene_ids", "seurat_id")]) + colnames(ann) <- c("ensembl_id", "gene_name") + } else { + message("Reading in annotation from annotation.dir") + message("Please note that only genes with unique gene names will be kept.") + message("The other ones will be lost due to Seurat's process of making gene names unique!") ann <- read.table(gzfile(opt$annotation), header=T, sep="\t", as.is=T) ## subset to columns of interest ann <- unique(ann[,c("ensembl_id", "gene_name")]) + } - ## the gene names have been made unique so we want only 1:1 mappings. - gn_tab <- table(ann$gene_name) + ## the gene names have been made unique so we want only 1:1 mappings. + gn_tab <- table(ann$gene_name) - unique_gn <- names(gn_tab)[gn_tab == 1] - ann <- ann[ann$gene_name %in% unique_gn,] + unique_gn <- names(gn_tab)[gn_tab == 1] + ann <- ann[ann$gene_name %in% unique_gn,] - rownames(ann) <- ann$gene_name + rownames(ann) <- ann$gene_name } xx <- names(cluster_ids) diff --git a/R/seurat_begin.R b/R/seurat_begin.R index 96035d6..796061e 100644 --- a/R/seurat_begin.R +++ b/R/seurat_begin.R @@ -42,7 +42,6 @@ option_list <- list( c("--matrixtype"), default="10X", help="either 10X or rds" ), - make_option( c("--project"), default="SeuratAnalysis", @@ -53,6 +52,11 @@ option_list <- list( default="seurat.out.dir", help="Location for outputs files. Must exist." 
), + make_option( + c("--file_type"), + default="rds", + help="Which type of file to use (rds or h5seurat)" + ), make_option( c("--metadata"), default="none", @@ -674,6 +678,11 @@ print( message("seurat_begin.R object final default assay: ", DefaultAssay(s)) # Save the R object -saveRDS(s, file=file.path(opt$outdir, "begin.rds")) +if (opt$file_type == "rds") { + outfile = file.path(opt$outdir, "begin.rds") +} else { + outfile = file.path(opt$outdir, "begin.h5seurat") +} +saveSeurat(path=outfile, format=opt$file_type) message("Completed") diff --git a/R/seurat_characteriseClusterDEGenes.R b/R/seurat_characteriseClusterDEGenes.R index 0e62490..a2f80ea 100644 --- a/R/seurat_characteriseClusterDEGenes.R +++ b/R/seurat_characteriseClusterDEGenes.R @@ -187,7 +187,8 @@ tex <- c(tex, getFigureTex(defn, deCaption,plot_dir_var=opt$plotdirvar)) ## enforce significance ## read in the seurat object -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + cluster_ids <- readRDS(opt$clusterids) Idents(s) <- cluster_ids diff --git a/R/seurat_clusterStats.R b/R/seurat_clusterStats.R index 31984ac..e78ed97 100644 --- a/R/seurat_clusterStats.R +++ b/R/seurat_clusterStats.R @@ -52,7 +52,7 @@ cat("Running with options:\n") print(opt) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) cluster_ids <- readRDS(opt$clusterids) xx <- names(cluster_ids) diff --git a/R/seurat_compare_clusters.R b/R/seurat_compare_clusters.R index f73d4ff..737862b 100644 --- a/R/seurat_compare_clusters.R +++ b/R/seurat_compare_clusters.R @@ -36,8 +36,7 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -message(sprintf("readRDS: %s", opt$seuratobject)) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) message("seurat_cluster.R running with default assay: ", DefaultAssay(s)) diff --git a/R/seurat_dm.R b/R/seurat_dm.R index a5f65d6..bcd76c1 100644 --- a/R/seurat_dm.R +++ b/R/seurat_dm.R @@ -48,9 +48,7 @@ 
opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) - -message("readRDS") -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) cluster_ids <- readRDS(opt$clusterids) Idents(s) <- cluster_ids diff --git a/R/seurat_explore_hvg_and_cell_cycle.R b/R/seurat_explore_hvg_and_cell_cycle.R index 58810f9..4b5c393 100644 --- a/R/seurat_explore_hvg_and_cell_cycle.R +++ b/R/seurat_explore_hvg_and_cell_cycle.R @@ -200,7 +200,7 @@ options(future.globals.maxSize = opt$memory * 1024^2) # Input data ---- -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## ######################################################################### ## ## # (i) Initial normalisation, variable gene identification and scaling ### ## diff --git a/R/seurat_find_neighbors.R b/R/seurat_find_neighbors.R index 27e7eaf..56dbb75 100644 --- a/R/seurat_find_neighbors.R +++ b/R/seurat_find_neighbors.R @@ -87,7 +87,7 @@ options(future.globals.maxSize = opt$memory * 1024^2) # Input data ---- -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) if(opt$usesigcomponents) { diff --git a/R/seurat_get_assay_data.R b/R/seurat_get_assay_data.R index 4a69a97..24a3351 100644 --- a/R/seurat_get_assay_data.R +++ b/R/seurat_get_assay_data.R @@ -63,8 +63,7 @@ cat("Running with options:\n") print(opt) -## read in the raw count matrix -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## set the default assay message("Setting default assay to: ", opt$seuratassay) diff --git a/R/seurat_jackstraw.R b/R/seurat_jackstraw.R index dea3139..463d932 100644 --- a/R/seurat_jackstraw.R +++ b/R/seurat_jackstraw.R @@ -30,7 +30,7 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## In Macosko et al, we implemented a resampling test inspired by the jackStraw procedure. 
## We randomly permute a subset of the data (1% by default) and rerun PCA, diff --git a/R/seurat_normalise_and_scale.R b/R/seurat_normalise_and_scale.R index c20de5a..58ca0cb 100644 --- a/R/seurat_normalise_and_scale.R +++ b/R/seurat_normalise_and_scale.R @@ -221,7 +221,7 @@ options(future.globals.maxSize = opt$memory * 1024^2) # Input data ---- -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## ######################################################################### ## ## # (i) Initial normalisation, variable gene identification and scaling ## ## @@ -351,6 +351,6 @@ if(opt$normalizationmethod=="sctransform") message("seurat_begin.R object final default assay: ", DefaultAssay(s)) # Save the R object -saveRDS(s, file=file.path(opt$outdir, "begin.rds")) +saveSeurat(path=opt$seuratobject) message("Completed") diff --git a/R/seurat_pca.R b/R/seurat_pca.R index 1027a91..2d95fa5 100644 --- a/R/seurat_pca.R +++ b/R/seurat_pca.R @@ -115,7 +115,7 @@ options(future.globals.maxSize = opt$memory * 1024^2) # Input data ---- -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) ## ######################################################################### ## ## ################### (vi) Dimension reduction (PCA) ###################### ## @@ -200,6 +200,6 @@ if(opt$normalizationmethod!="sctransform" & opt$jackstraw) message("seurat_begin.R object final default assay: ", DefaultAssay(s)) # Save the R object -saveRDS(s, file=file.path(opt$outdir, "begin.rds")) +saveSeurat(path=opt$seuratobject) message("Completed") diff --git a/R/seurat_summariseMarkers.R b/R/seurat_summariseMarkers.R index b493031..7eda7f2 100644 --- a/R/seurat_summariseMarkers.R +++ b/R/seurat_summariseMarkers.R @@ -40,7 +40,8 @@ if(!is.null(opt$subgroup)) { opt$subgroup <- strsplit(opt$subgroup,",")[[1]]} cat("Running with options:\n") print(opt) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + cluster_ids <- readRDS(opt$clusterids) Idents(s) <- 
cluster_ids diff --git a/R/seurat_summariseMarkersBetween.R b/R/seurat_summariseMarkersBetween.R index 98b0449..4f25560 100644 --- a/R/seurat_summariseMarkersBetween.R +++ b/R/seurat_summariseMarkersBetween.R @@ -48,7 +48,7 @@ print(opt) # set the run specs run_specs <- paste(opt$numpcs,opt$resolution,opt$algorithm,opt$testuse,sep="_") -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) cluster_ids <- readRDS(opt$clusterids) message("Setting the default seurat assay to: ", opt$seuratassay) diff --git a/R/seurat_tsne.R b/R/seurat_tsne.R index c94a41a..a15b660 100644 --- a/R/seurat_tsne.R +++ b/R/seurat_tsne.R @@ -45,8 +45,8 @@ cat("Running with options:\n") print(opt) -message("readRDS") -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + cluster_ids <- readRDS(opt$clusterids) Idents(s) <- cluster_ids print(head(Idents(s))) diff --git a/R/seurat_umap.R b/R/seurat_umap.R index 2df6391..7cdab7b 100644 --- a/R/seurat_umap.R +++ b/R/seurat_umap.R @@ -50,9 +50,8 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) +s <- loadSeurat(path=opt$seuratobject) -message("readRDS") -s <- readRDS(opt$seuratobject) s@graphs <- readRDS(opt$seuratgraphs) message("seurat_umap.R running with default assay: ", DefaultAssay(s)) diff --git a/R/singleR_extra_plots.R b/R/singleR_extra_plots.R index 98fb522..44b546a 100644 --- a/R/singleR_extra_plots.R +++ b/R/singleR_extra_plots.R @@ -35,8 +35,8 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -message(sprintf("readRDS: %s", opt$seuratobject)) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + a <- ifelse("SCT" %in% names(s), yes = "SCT", no = "RNA") sce <- as.SingleCellExperiment(s, assay = a) # Seems to be using SCT assay (so does it take the defualt?) 
diff --git a/R/singleR_plots.R b/R/singleR_plots.R index a132088..83537bd 100644 --- a/R/singleR_plots.R +++ b/R/singleR_plots.R @@ -38,8 +38,8 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -message(sprintf("readRDS: %s", opt$seuratobject)) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + a <- ifelse("SCT" %in% names(s), yes = "SCT", no = "RNA") sce <- as.SingleCellExperiment(s, assay = a) # Seems to be using SCT assay (so does it take the defualt?) diff --git a/R/singleR_run.R b/R/singleR_run.R index bc8364e..a95bc72 100644 --- a/R/singleR_run.R +++ b/R/singleR_run.R @@ -35,8 +35,8 @@ opt <- parse_args(OptionParser(option_list=option_list)) cat("Running with options:\n") print(opt) -message(sprintf("readRDS: %s", opt$seuratobject)) -s <- readRDS(opt$seuratobject) +s <- loadSeurat(path=opt$seuratobject) + a <- ifelse("SCT" %in% names(s), yes = "SCT", no = "RNA") sce <- as.SingleCellExperiment(s, assay = a) # Seems to be using SCT assay (so does it take the defualt?) 
diff --git a/pipelines/pipeline_seurat.py b/pipelines/pipeline_seurat.py index 0ddcc38..cd72bae 100644 --- a/pipelines/pipeline_seurat.py +++ b/pipelines/pipeline_seurat.py @@ -212,7 +212,6 @@ def taskSummary(infile, outfile): run.append(str(v)) tab = pd.DataFrame(list(zip(tasks,run)),columns=["task","run"]) - print(tab) tab.to_latex(buf=outfile, index=False) @@ -296,6 +295,7 @@ def createSeuratObject(infile, outfile): --matrixtype=%(matrix_type)s --project=%(sample_name)s --outdir=%(outdir)s + --file_type=%(input_format)s --groupby=%(qc_groupby)s --mingenes=%(qc_initial_mingenes)s --mincells=%(qc_mincells)s @@ -512,12 +512,13 @@ def seuratPCA(infile, outfile): # ############### Predict cell-types using singleR ################### # # #################################################################### # +file_suffix = str(PARAMS["input_format"]) @active_if(PARAMS["headstart_seurat_object"]) @transform(os.path.join(str(PARAMS["headstart_path"]), - "*.seurat.dir/begin.rds"), - regex(r".*/(.*).seurat.dir/begin.rds"), - r"\1.seurat.dir/begin.rds") + "*.seurat.dir/begin."+file_suffix), + regex(r".*/(.*).seurat.dir/begin."+file_suffix), + rf"\1.seurat.dir/begin."+file_suffix) def headstart(infile, outfile): ''' link in the begin.rds objects @@ -544,7 +545,10 @@ def headstart(infile, outfile): def genSingleRjobs(): '''generate the singleR jobs''' - seurat_objects = glob.glob("*.seurat.dir/begin.rds") + if PARAMS["input_format"] == "rds": + seurat_objects = glob.glob("*.seurat.dir/begin.rds") + else: + seurat_objects = glob.glob("*.seurat.dir/begin.h5seurat") references = [x.strip() for x in PARAMS["singleR_reference"].split(",")] @@ -610,7 +614,6 @@ def singleR(infile, outfile): spec, SPEC = TASK.get_vars(infile, outfile, PARAMS) - # set the job threads and memory job_threads, job_memory, r_memory = TASK.get_resources( memory=PARAMS["resources_memory_extreme"], @@ -693,10 +696,22 @@ def plotExtraSingleR(infile, outfile): # ############ Begin per-parameter combination 
analysis runs ################ # # ########################################################################### # +USE_RDS = 0 +USE_H5 = 0 +if PARAMS["input_format"] == "rds": + SEURAT_OBJECTS = "*.seurat.dir/begin.rds" + S_REGEX = regex(r"(.*)/begin.rds") + USE_RDS = 1 +else: + SEURAT_OBJECTS = "*.seurat.dir/begin.h5seurat" + S_REGEX = regex(r"(.*)/begin.h5seurat") + USE_H5 = 1 + +@active_if(USE_RDS) @follows(seuratPCA, headstart) -@transform("*.seurat.dir/begin.rds", - regex(r"(.*)/begin.rds"), +@transform(SEURAT_OBJECTS, + S_REGEX, r"\1/export_for_python.sentinel") def exportForPython(infile, outfile): ''' @@ -779,6 +794,58 @@ def exportForPython(infile, outfile): P.run(statement) IOTools.touch_file(outfile) +@active_if(USE_H5) +@follows(seuratPCA, headstart) +@transform(SEURAT_OBJECTS, + S_REGEX, + r"\1/export_for_python.sentinel") +def convertRDStoAnndata(infile, outfile): + '''Convert Seurat h5ad to anndata h5ad ''' + + spec, SPEC = TASK.get_vars(infile, outfile, PARAMS) + + if PARAMS["headstart_export_for_python"]: + + source_folder = os.path.join(PARAMS["headstart_path"], + spec.sample_dir) + + source_files = ["begin.h5ad", + "export_for_python.*"] + + source_paths = [ os.path.join(source_folder, x) + for x in source_files ] + + for source in source_paths: + if "*" in source: + files = glob.glob(source) + else: + files = [ source ] + + for sf in files: + if os.path.exists(sf): + os.symlink(os.path.relpath(sf, + start=spec.sample_dir), + os.path.join(spec.sample_dir, + os.path.basename(sf))) + else: + # set the job threads and memory + job_threads, job_memory, r_memory = TASK.get_resources( + memory=PARAMS["resources_memory_standard"]) + + statement = '''Rscript %(tenx_dir)s/R/convert_rds_to_anndata.R + --seuratobject=%(seurat_object)s + &> %(log_file)s + ''' % dict(PARAMS, **SPEC, **locals()) + + P.run(statement) + IOTools.touch_file(outfile) + + +if PARAMS["input_format"] == "rds": + TASK_EXPORT = exportForPython +else: + TASK_EXPORT = convertRDStoAnndata + 
def genAnndata(): @@ -790,7 +857,10 @@ def genAnndata(): for sample in samples: - infile = os.path.join(sample, "begin.rds") + if PARAMS["input_format"] == "rds": + infile = os.path.join(sample, "begin.rds") + else: + infile = os.path.join(sample, "begin.h5seurat") for comps in components: @@ -800,8 +870,7 @@ def genAnndata(): outfile = os.path.join(sample, outdir, subdir, outname) yield [infile, outfile] - -@follows(exportForPython) +@follows(TASK_EXPORT) @files(genAnndata) def anndata(infile, outfile): ''' @@ -812,8 +881,14 @@ def anndata(infile, outfile): reductiontype = PARAMS["dimreduction_method"] - reduced_dims_matrix_file = os.path.join(spec.sample_dir, - "embedding." + reductiontype + ".tsv.gz") + if PARAMS["input_format"] == "rds": + reduced_dims_matrix_file = os.path.join(spec.sample_dir, + "embedding." + reductiontype + ".tsv.gz") + input_files = '''--reduced_dims_matrix_file=%(reduced_dims_matrix_file)s ''' + else: + anndata_file = SPEC["seurat_object"].replace(".h5seurat", ".h5ad") + input_files = '''--anndata_file=%(anndata_file)s + --reduction_name=%(dimreduction_method)s ''' barcode_file = os.path.join(spec.sample_dir, "barcodes.tsv.gz") @@ -841,7 +916,7 @@ def anndata(infile, outfile): memory=PARAMS["resources_memory_extreme"]) statement = '''python %(tenx_dir)s/python/make_anndata.py - --reduced_dims_matrix_file=%(reduced_dims_matrix_file)s + %(input_files)s --barcode_file=%(barcode_file)s --outdir=%(outdir)s --comps=%(comps)s @@ -871,7 +946,10 @@ def genScanpyClusterJobs(): for sample in samples: - infile = os.path.join(sample, "begin.rds") + if PARAMS["input_format"] == "rds": + infile = os.path.join(sample, "begin.rds") + else: + infile = os.path.join(sample, "begin.h5seurat") for comps in components: @@ -932,7 +1010,7 @@ def cluster(infile, outfile): spec, SPEC = TASK.get_vars(infile, outfile, PARAMS) job_threads, job_memory, r_memory = TASK.get_resources( - memory=PARAMS["resources_memory_low"]) + memory=PARAMS["resources_memory_standard"]) 
scanpy_cluster_tsv = infile.replace(".sentinel",".tsv.gz") @@ -1083,7 +1161,7 @@ def paga(infile, outfile): @active_if(PARAMS["run_phate"]) -@follows(exportForPython,cluster) +@follows(TASK_EXPORT, cluster) @transform(anndata, regex(r"(.*)/(.*)/anndata.dir/anndata.sentinel"), r"\1/\2/phate.dir/phate.sentinel") @@ -1113,22 +1191,29 @@ def phate(infile, outfile): cluster_resolutions = ",".join(spec.resolutions) - barcode_file = os.path.join(spec.sample_dir, - "barcodes.tsv.gz") - - if PARAMS["phate_assay"] == "reduced.dimensions": - - embeddings = ".".join(["embedding", - PARAMS["dimreduction_method"], - "tsv.gz"]) - - assay_data = os.path.join(spec.sample_dir, embeddings) - - elif PARAMS["phate_assay"] == "scaled.data": - assay_data = os.path.join(spec.sample_dir, "assay.scale.data.tsv.gz") + if USE_H5: + # path to anndata object + assay_data = os.path.join(spec.sample_dir, "begin.h5ad") + if PARAMS["phate_assay"] == "reduced.dimensions": + # add red dimension name to extract from anndata + input_phate = '''--reduction_name=%(dimreduction_method)s + --input_type="anndata" ''' else: - raise ValueError("PHATE assay not recognised") + barcode_file = os.path.join(spec.sample_dir, + "barcodes.tsv.gz") + input_phate = ''' --barcode_file=%(barcode_file)s + --input_type="tsv" ''' + if PARAMS["phate_assay"] == "reduced.dimensions": + embeddings = ".".join(["embedding", + PARAMS["dimreduction_method"], + "tsv.gz"]) + assay_data = os.path.join(spec.sample_dir, embeddings) + + elif PARAMS["phate_assay"] == "scaled.data": + assay_data = os.path.join(spec.sample_dir, "assay.scale.data.tsv.gz") + else: + raise ValueError("PHATE assay not recognised") k = PARAMS["phate_k"] @@ -1140,13 +1225,13 @@ def phate(infile, outfile): statement = '''python %(tenx_dir)s/python/run_phate.py --data=%(assay_data)s --assay=%(phate_assay)s - --barcode_file=%(barcode_file)s --outdir=%(outdir)s --resolution=%(cluster_resolutions)s --cluster_assignments=%(cluster_assignment_files)s 
--cluster_colors=%(cluster_color_files)s --k=%(k)s --gif=%(phate_gif)s + %(input_phate)s &> %(log_file)s ''' % dict(PARAMS, **SPEC, **locals()) @@ -1201,9 +1286,9 @@ def UMAP(infile, outfile): # ########################################################################### # @active_if(PARAMS["run_diffusionmap"]) -@transform(cluster, - regex(r"(.*)/(.*)/(.*)/cluster.sentinel"), - r"\1/\2/\3/diffusionmap.dir/dm.sentinel") +@transform(anndata, + regex(r"(.*)/(.*)/anndata.dir/anndata.sentinel"), + r"\1/\2/diffusionmap.dir/dm.sentinel") def diffusionMap(infile, outfile): ''' Run the diffusion map analysis on a saved seurat object. @@ -1309,7 +1394,7 @@ def knownMarkerViolins(infile, outfile): # ########################################################################### # @active_if(PARAMS["run_velocity"]) -@follows(paga) +@follows(paga, phate) @transform(RDIMS_VIS_TASK, regex(r"(.*)/(.*)/(.*).sentinel"), r"\1/velocity.dir/scvelo.sentinel") @@ -1407,6 +1492,10 @@ def scvelo(infile, outfile): runs["phate"] = {"method": "phate", "table": phate_table, "rdim1": "PHATE1", "rdim2": "PHATE2"} + if USE_H5: + input_type = ''' --input_type="anndata" ''' + else: + input_type = ''' --input_type="tsv" ''' statements = [] @@ -1432,6 +1521,7 @@ def scvelo(infile, outfile): --rdims=%(r_tab)s --rdim1=%(r_1)s --rdim2=%(r_2)s + %(input_type)s &> %(this_log_file)s ''' % dict(PARAMS, **SPEC, **locals()) @@ -1447,7 +1537,7 @@ def scvelo(infile, outfile): @transform(RDIMS_VIS_TASK, regex(r"(.*)/(.*)/(.*)/(.*).sentinel"), - add_inputs(exportForPython), + add_inputs(TASK_EXPORT), r"\1/\2/rdims.visualisation.dir/plot.rdims.factor.sentinel") def plotRdimsFactors(infiles, outfile): ''' @@ -1602,7 +1692,6 @@ def plotRdimsClusters(infile, outfile): @active_if(PARAMS["run_diffusionmap"]) -@follows(diffusionMap) @transform(cluster, regex(r"(.*)/(.*).dir/(.*).dir/(.*).sentinel"), add_inputs(diffusionMap), @@ -1865,7 +1954,7 @@ def plotRdimsGenes(infile, outfile): # ################# plot per-cluster 
summary statistics ##################### # # ########################################################################### # -@follows(exportForPython) +@follows(TASK_EXPORT) @transform(cluster, regex(r"(.*)/cluster.sentinel"), r"\1/group.numbers.dir/plot.group.numbers.sentinel") @@ -2056,7 +2145,7 @@ def findMarkers(infile, outfile): # set the job threads and memory job_threads, job_memory, r_memory = TASK.get_resources( - memory=PARAMS["resources_memory_standard"], + memory=PARAMS["resources_memory_high"], cpu=PARAMS["findmarkers_numcores"]) for i in spec.clusters: @@ -2181,7 +2270,7 @@ def summariseMarkers(infile, outfile): # set the job threads and memory job_threads, job_memory, r_memory = TASK.get_resources( - memory=PARAMS["resources_memory_low"], + memory=PARAMS["resources_memory_high"], cpu=1) # make sumamary tables and plots of the differentially expressed genes @@ -3629,8 +3718,6 @@ def _add_figure(plot_file=None, \\input %(tenx_dir)s/latex/endmatter.tex' ''' - print(statement) - # Deliberately run twice - necessary for LaTeX compilation.. 
draft_mode = "-draftmode" P.run(statement % dict(PARAMS, **SPEC, **locals())) diff --git a/pipelines/pipeline_seurat/pipeline.yml b/pipelines/pipeline_seurat/pipeline.yml index 0401734..4dfb0c4 100644 --- a/pipelines/pipeline_seurat/pipeline.yml +++ b/pipelines/pipeline_seurat/pipeline.yml @@ -26,6 +26,15 @@ matrix: # type: 10X +input: + # currently supported are + # ----------------------- + # + # "rds", for which the pipeline expects a + # file called 'begin.rds' + # "h5seurat", for which the pipeline expects + # a file called 'begin.h5seurat' + format: rds # compute resource options # ------------------------ @@ -44,7 +53,8 @@ resources: memory_low: 10G # Memory_standard is used if not further specified memory_standard: 20G - # Memory_high is used for tasks: findMarkers, summariseMarkers, + # Memory_high is used for tasks: beginSeurat, compareClusters, + # paga, UMAP, topClusterMarkers, findMarkers, summariseMarkers # diffusionMap memory_high: 50G # Memory_extreme is used for the scaleing and PCA tasks diff --git a/pipelines/pipeline_utils/TASK.py b/pipelines/pipeline_utils/TASK.py index 8aedbf4..23b184b 100644 --- a/pipelines/pipeline_utils/TASK.py +++ b/pipelines/pipeline_utils/TASK.py @@ -36,12 +36,10 @@ def get_vars(infile, outfile, PARAMS, make_outdir=True): SPEC = { "sample_name": parts[0].split(".")[0], "sample_dir": parts[0], - "seurat_object": os.path.join(parts[0], "begin.rds"), "outdir": os.path.dirname(outfile), "outname": os.path.basename(outfile), } - if PARAMS["runspecs_cluster_resolutions"]: SPEC["resolutions"] = [x.strip() @@ -54,6 +52,10 @@ def get_vars(infile, outfile, PARAMS, make_outdir=True): if PARAMS["runspecs_predefined_clusters"]: SPEC["resolutions"].append("predefined") + if PARAMS["input_format"] == "rds": + SPEC["seurat_object"] = os.path.join(parts[0], "begin.rds") + else: + SPEC["seurat_object"] = os.path.join(parts[0], "begin.h5seurat") if not infile is None: infile = os.path.relpath(infile) diff --git 
a/pipelines/pipeline_utils/spec.py b/pipelines/pipeline_utils/spec.py index 87436c6..7ccb51d 100644 --- a/pipelines/pipeline_utils/spec.py +++ b/pipelines/pipeline_utils/spec.py @@ -11,13 +11,16 @@ def get(infile, outfile, PARAMS): nparts = len(parts) spec = { "sample_name": parts[0].split(".")[0], - "seurat_object": os.path.join(parts[0], "begin.rds"), "outdir": os.path.dirname(outfile), "indir": os.path.dirname(infile), "resolutions" : [x.strip() for x in PARAMS["runspecs_cluster_resolutions"].split(",")] } + if PARAMS["input_format"] == "rds": + spec["seurat_object"] = os.path.join(parts[0], "begin.rds") + else: + spec["seurat_object"] = os.path.join(parts[0], "begin.h5seurat") # take care of making the output directory. if not os.path.exists(spec["outdir"]): diff --git a/python/make_anndata.py b/python/make_anndata.py index 99d6a4a..4733423 100644 --- a/python/make_anndata.py +++ b/python/make_anndata.py @@ -19,7 +19,6 @@ L.addHandler(log_handler) L.setLevel(logging.INFO) - # ########################################################################### # # ######################## Parse the arguments ############################## # # ########################################################################### # @@ -27,8 +26,12 @@ L.info("parsing arguments") parser = argparse.ArgumentParser() -parser.add_argument("--reduced_dims_matrix_file", default="reduced_dims.tsv.gz", type=str, +parser.add_argument("--reduced_dims_matrix_file", default="none", type=str, help="File with reduced dimensions") +parser.add_argument("--anndata_file", default="none", type=str, + help="h5ad file with anndata") +parser.add_argument("--reduction_name", default="pca", type=str, + help="Name of the dimension reduction") parser.add_argument("--barcode_file", default="barcodes.tsv.gz", type=str, help="File with the cell barcodes") parser.add_argument("--outdir",default=1, type=str, @@ -61,29 +64,51 @@ # ########################################################################### # -# 
######################### Make the anndata object ######################### # +# ######################## Make anndata object ############################## # # ########################################################################### # -# Read matrix of reduced dimensions, create anndata and add dimensions -reduced_dims_mat = pd.read_csv(args.reduced_dims_matrix_file, sep="\t") -reduced_dims_mat.index = [x for x in pd.read_csv(args.barcode_file, header=None)[0]] +# select correct PCs +select_comps = args.comps.split(",") -adata = anndata.AnnData(obs=[x for x in reduced_dims_mat.index]) -adata.obs.index = reduced_dims_mat.index -adata.obs.rename(columns={0:'barcode'}, inplace=True) +if not args.anndata_file == "none": + L.info("Using converted h5ad from SeuratDisk") + adata_input = anndata.read(args.anndata_file) + adata_input.obs['barcode'] = pd.Series(adata_input.obs.index.copy(), + index=adata_input.obs.index.copy()) + + # get name for obsm + obsm_use = 'X_' + str(args.reduction_name) + indeces = [int(i)-1 for i in select_comps] + adata = anndata.AnnData(obs=adata_input.obs.copy()) + adata.obsm[obsm_use] = adata_input.obsm[obsm_use].copy()[:,indeces] + + rd_colnames = [str(args.reduction_name)+"_"+str(i) for i in select_comps] + L.info("Using comps " + ', '.join(rd_colnames)) -# Add dimensions to anndata -colnames = list(reduced_dims_mat.columns) -colname_prefix = list(set([re.sub(r'_[0-9]+', '', i) for i in colnames]))[0] + "_" -select_comps = args.comps.split(",") -select_comps = [ colname_prefix + item for item in select_comps] -reduced_dims_mat = reduced_dims_mat[select_comps] -n_pcs = len(select_comps) +elif not args.reduced_dims_matrix_file == "none": + # Read matrix of reduced dimensions, create anndata and add dimensions + reduced_dims_mat = pd.read_csv(args.reduced_dims_matrix_file, sep="\t") + reduced_dims_mat.index = [x for x in pd.read_csv(args.barcode_file, header=None)[0]] -L.info("Using comps " + ', '.join(list(reduced_dims_mat.columns))) + 
adata = anndata.AnnData(obs=[x for x in reduced_dims_mat.index]) + adata.obs.index = reduced_dims_mat.index + adata.obs.rename(columns={0:'barcode'}, inplace=True) -adata.obsm['X_rdims'] = reduced_dims_mat.to_numpy(dtype="float32") + # Add dimensions to anndata + colnames = list(reduced_dims_mat.columns) + colname_prefix = list(set([re.sub(r'_[0-9]+', '', i) for i in colnames]))[0] + "_" + select_comps = args.comps.split(",") + select_comps = [ colname_prefix + item for item in select_comps] + reduced_dims_mat = reduced_dims_mat[select_comps] + + L.info("Using comps " + ', '.join(list(reduced_dims_mat.columns))) + + adata.obsm['X_rdims'] = reduced_dims_mat.to_numpy(dtype="float32") + obsm_use = 'X_rdims' + +else: + L.info("No input file present.") # ########################################################################### # @@ -92,6 +117,7 @@ # Run neighbors L.info( "Using " + str(args.k) + " neighbors") +L.info( "Computing neighbors based on obsm: " + str(obsm_use)) if args.method == "scanpy": @@ -101,7 +127,7 @@ neighbors(adata, n_neighbors = args.k, metric = args.metric, - use_rep = 'X_rdims') + use_rep = obsm_use) elif args.method == "hnsw": @@ -117,7 +143,7 @@ neighbors(adata, n_neighbors = args.k, n_pcs = None, - use_rep = "X_rdims", + use_rep = obsm_use, knn = True, random_state = 0, method = 'hnsw', @@ -135,4 +161,11 @@ adata.write(os.path.join(args.outdir, "anndata.h5ad")) +if args.anndata_file != "none": + # write out metadata for other tasks + metadata_out = adata.obs.copy() + out_folder = os.path.dirname(args.anndata_file) + metadata_out.to_csv(os.path.join(out_folder, "metadata.tsv.gz"), + sep="\t", index=False) + L.info("Complete") diff --git a/python/run_cluster.py b/python/run_cluster.py index c681f87..8904530 100644 --- a/python/run_cluster.py +++ b/python/run_cluster.py @@ -84,7 +84,9 @@ else: raise ValueError("Clustering algorithm not recognised") -adata.obs.to_csv(os.path.join(args.outdir, - "scanpy.clusters.tsv.gz"), sep="\t", index=False) 
+select_cols = ['barcode', args.algorithm] +adata.obs[select_cols].to_csv(os.path.join(args.outdir, + "scanpy.clusters.tsv.gz"), + sep="\t", index=False) L.info("Complete") diff --git a/python/run_phate.py b/python/run_phate.py index f414e52..9d775b5 100644 --- a/python/run_phate.py +++ b/python/run_phate.py @@ -15,7 +15,7 @@ import sys import scprep import phate - +import anndata # ########################################################################### # @@ -53,32 +53,43 @@ help="number of neighbors") parser.add_argument("--gif", default="No", type=str, help="output a GIF") - +parser.add_argument("--reduction_name", default="pca", type=str, + help="Name of the reduced dimseions") +parser.add_argument("--input_type", default="tsv", type=str, + help="anndata or tsv") args = parser.parse_args() -# ########################################################################### # -# ############## Create outdir and set results file ######################### # -# ########################################################################### # - - +L.info("Running with arguments:") +print(args) # ########################################################################### # # ############################## Run PHATE ################################## # # ########################################################################### # -if args.assay == "reduced.dimensions": - # Read matrix of reduced dimensions, create anndata and add dimensions - data = pd.read_csv(args.data, sep="\t", header=0) - -if args.assay == "scaled.data": - # Read matrix of reduced dimensions, create anndata and add dimensions - data = pd.read_csv(args.data, sep="\t", header=None) - - # we need to transpose the data for PHATE - data = data.transpose() - +if args.input_type == "tsv": + if args.assay == "reduced.dimensions": + # Read matrix of reduced dimensions, create anndata and add dimensions + data = pd.read_csv(args.data, sep="\t", header=0) + + if args.assay == "scaled.data": + # Read matrix of 
reduced dimensions, create anndata and add dimensions + data = pd.read_csv(args.data, sep="\t", header=None) + # we need to transpose the data for PHATE + data = data.transpose() + +if args.input_type == "anndata": + # Read in anndata and extract correct data type + adata = anndata.read(args.data) + if args.assay == "reduced.dimensions": + obsm_use = 'X_' + str(args.reduction_name) + data = pd.DataFrame(adata.obsm[obsm_use].copy(), + index=adata.obs.index.copy()) + if args.assay == "scaled.data": + data = pd.DataFrame(adata.X.copy(), + index=adata.index.copy()) + data = data.transpose() phate_operator = phate.PHATE(n_jobs=-2, knn=args.k) x2 = phate_operator.fit_transform(data) @@ -113,7 +124,11 @@ rdims_phate = pd.DataFrame(x2, columns=["PHATE1","PHATE2"]) -rdims_phate["barcode"] = pd.read_csv(args.barcode_file, header=None)[0].values +if args.input_type == "tsv": + rdims_phate["barcode"] = pd.read_csv(args.barcode_file, header=None)[0].values +if args.input_type == "anndata": + rdims_phate["barcode"] = adata.obs.index.values + rdims_phate.to_csv(os.path.join(args.outdir,"phate.tsv.gz"), sep="\t") @@ -145,7 +160,10 @@ rdims_phate = pd.DataFrame(x3, columns=["PHATE1","PHATE2", "PHATE3"]) -rdims_phate["barcode"] = pd.read_csv(args.barcode_file, header=None)[0].values +if args.input_type == "tsv": + rdims_phate["barcode"] = pd.read_csv(args.barcode_file, header=None)[0].values +if args.input_type == "anndata": + rdims_phate["barcode"] = adata.obs.index.values rdims_phate.to_csv(os.path.join(args.outdir,"phate_3D.tsv.gz"), sep="\t") diff --git a/python/run_scvelo.py b/python/run_scvelo.py index 673c9b2..27cfdc1 100644 --- a/python/run_scvelo.py +++ b/python/run_scvelo.py @@ -56,6 +56,9 @@ help="reduced dimension 1") parser.add_argument("--rdim2", default="UMAP2", type=str, help="reduced dimension 2") +parser.add_argument("--input_type", default="tsv", type=str, + help="whether gene and dim reduction information are " + "read from tsv or anndata") args = 
parser.parse_args() @@ -92,17 +95,32 @@ else: raise ValueError("either a loom file or dropEst directory must be specified") -# Add the variable and observation information -feat = pd.read_csv(os.path.join(metadata_dir,"features.tsv.gz"), header=None) -feat.columns = ["gene_symbol"] -feat.index = feat["gene_symbol"] -samples = pd.read_csv(os.path.join(metadata_dir, "barcodes.tsv.gz"),header=None) -metadata = pd.read_csv(os.path.join(metadata_dir,"metadata.tsv.gz"), sep="\t") -metadata.index = metadata.barcode.values +if args.input_type == "tsv" or not args.dropest_dir == "none": + # Add the variable and observation information from flat files + # or from dropest input directory -adata.vars = feat -adata.obs = metadata.loc[adata.obs.index,] + feat = pd.read_csv(os.path.join(metadata_dir,"features.tsv.gz"), header=None) + feat.columns = ["gene_symbol"] + feat.index = feat["gene_symbol"] + samples = pd.read_csv(os.path.join(metadata_dir, "barcodes.tsv.gz"),header=None) + metadata = pd.read_csv(os.path.join(metadata_dir,"metadata.tsv.gz"), sep="\t") + metadata.index = metadata.barcode.values + + adata.vars = feat + adata.obs = metadata.loc[adata.obs.index,] + +else: + # This is used for loom files + h5ad runs + # Need to use .raw slot here to have all genes included! 
+ adata_info = ad.read(os.path.join(args.rdims.split("/")[0], "begin.h5ad")) + #adata_info.raw.var['gene_symbol'] = adata_info.raw.var.index.values + #adata.vars = adata_info.raw.var.copy() + #adata.obs = adata_info + #adata = scv.utils.merge(adata, adata_meta) + ## the merge is causing issues with a corrupted neighbor graph + metadata = adata_info.obs.copy() + adata.obs = metadata.loc[adata.obs.index,] L.info("AnnData initialised") diff --git a/python/untitled b/python/untitled deleted file mode 100644 index e69de29..0000000 diff --git a/tenxutils/R/Seurat.R b/tenxutils/R/Seurat.R index 2f9f012..8da0197 100644 --- a/tenxutils/R/Seurat.R +++ b/tenxutils/R/Seurat.R @@ -126,3 +126,50 @@ getCellCycleGenes <- function(sgenes_file = NULL, list(s.genes = sgenes, g2m.genes = g2mgenes) } + + +## Function to load seurat object from either rds or h5seurat +#' This function can load the Seurat object from either rds or +#' h5seurat format +#' +#' @param path The path to the stored object +#' @param format A string indicating whether it is 'rds' or 'h5seurat'. If left empty (defaults to "none"), the ending of the path is used to determine file format. +loadSeurat <- function(path, + format = "none") +{ + if (endsWith(path, ".rds") | format == "rds") { + message(sprintf("readRDS: %s", opt$seuratobject)) + s <- readRDS(path) + } else if (endsWith(path, ".h5seurat") | format == "h5seurat") { + message(sprintf("LoadH5Seurat: %s", opt$seuratobject)) + stopifnot(require(SeuratDisk)) + s <- LoadH5Seurat(path) + } else { + stop("Input format not supported. The format or path ending needs to be rds/.rds or h5seurat/.h5seurat.") + } +} + +## Function to save seurat objects to either rds or h5seurat +#' This function can save the Seurat object to either rds or +#' h5seurat format +#' +#' @param path The path where the object should be stored +#' @param format A string indicating whether it is 'rds' or 'h5seurat'. 
## Function to save seurat objects to either rds or h5seurat
#' Save a Seurat object to either rds or h5seurat format.
#'
#' The output format is chosen from the `format` argument or, when that is
#' left as "none", inferred from the ending of `path`.
#'
#' @param path The path where the object should be stored (must end in
#'   ".rds" or ".h5seurat" unless `format` is given explicitly).
#' @param format A string indicating whether it is 'rds' or 'h5seurat'.
#'   If left empty (defaults to "none"), the ending of the path is used to
#'   determine the file format.
#' @param object The object to save. Defaults to `s`, looked up in the
#'   calling environment, for backward compatibility with the previous
#'   behaviour that read the global variable `s` directly.
#' @return Invisibly, the path the object was written to.
saveSeurat <- function(path,
                       format = "none",
                       object = s)
{
    ## scalar conditions: use short-circuit || rather than vectorized |
    if (endsWith(path, ".rds") || format == "rds") {
        message(sprintf("saveRDS: %s", path))
        ## write to the requested path (previously this ignored `path`
        ## and always wrote to file.path(opt$outdir, "begin.rds"))
        saveRDS(object, file = path)
    } else if (endsWith(path, ".h5seurat") || format == "h5seurat") {
        message(sprintf("SaveH5Seurat: %s", path))
        ## SeuratDisk is only needed for the h5seurat branch
        stopifnot(require(SeuratDisk))
        SaveH5Seurat(object, file = path, overwrite = TRUE)
    } else {
        stop("Output format not supported. The format or path ending needs to be rds/.rds or h5seurat/.h5seurat.")
    }
    invisible(path)
}