Skip to content

Commit

Permalink
update cli message
Browse files Browse the repository at this point in the history
  • Loading branch information
myushen committed Oct 24, 2024
1 parent 6a0ec5b commit 156d9d1
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 51 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ biocViews:
Transcription,
Transcriptomics
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
LazyDataCompression: xz
URL: https://github.com/stemangiola/CuratedAtlasQueryR
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

S3method(as.sparse,DelayedMatrix)
export(SAMPLE_DATABASE_URL)
export(calculate_pseudobulk)
export(get_SingleCellExperiment)
export(get_database_url)
export(get_default_cache_dir)
Expand All @@ -26,6 +27,7 @@ importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assay<-")
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
Expand Down
4 changes: 2 additions & 2 deletions R/counts_per_million.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

# Save SCE to the cache directory counts folder
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir, replace=TRUE)

data <- input_sce_obj

Expand All @@ -39,6 +39,6 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
# Check if there is a memory issue
assays(sce) <- assays(sce) |> map(DelayedArray::realize)

sce |> saveHDF5SummarizedExperiment(output_dir)
sce |> saveHDF5SummarizedExperiment(output_dir, replace = TRUE)
}

123 changes: 75 additions & 48 deletions R/import_metadata_and_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@
#' Directories store counts and counts per million in the provided cache directory.
#' @importFrom checkmate check_true check_character check_subset assert
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom rlang .data
#' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
#' @importFrom S4Vectors metadata metadata<-
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment assay assay<-
#' @importFrom stringr str_detect
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @examples
#' data(sample_sce_obj)
#' import_one_sce(sample_sce_obj,
Expand Down Expand Up @@ -74,8 +73,7 @@ import_one_sce <- function(
if (isTRUE(pseudobulk)) {
assert(
all(pseudobulk_sample %in% (colData(sce_obj) |> colnames()) ),
"Character in pseudobulk_sample must contain sample_ and cell_type_harmonised columns
in the SingleCellExperiment column metadata")
"Sample_ and cell_type_harmonised columns must be in the SingleCellExperiment colData")

assert(c(pseudobulk_sample, "file_id") %in% (names(metadata_tbl)) |> all() ,
"SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised,
Expand All @@ -92,10 +90,14 @@ import_one_sce <- function(
}

# Check whether count H5 directory has been generated
all(!file_id_db %in% dir(original_dir)) |>
check_true() |>
assert("The filename for count assay (file_id_db) already exists in the cache directory.")

if (any(file_id_db %in% dir(original_dir))) {
cli_alert_warning(
single_line_str(
"Import API says: The filename for count assay (file_id_db) already exists in the cache directory. "
)
)
}

# Check the metadata contains cell_, file_id_db, sample_ with correct types
check_true("cell_" %in% colnames(metadata_tbl))
check_true("file_id_db" %in% names(metadata_tbl))
Expand All @@ -107,9 +109,14 @@ import_one_sce <- function(

# Check cell_ values are not duplicated when join with parquet
cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
(!any(metadata_tbl$cell_ %in% cells$cell_)) |>
assert("Cell names (cell_) should not clash with cells that already exist in the atlas.")

if ((any(metadata_tbl$cell_ %in% cells$cell_)))
cli_alert_warning(
single_line_str(
"Import API says:
Cells in your SingleCellExperiment already exists in the atlas."
)
)

# Check age_days is either -99 or greater than 365
if (any(colnames(metadata_tbl) == "age_days")) {
assert(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
Expand Down Expand Up @@ -139,43 +146,63 @@ import_one_sce <- function(
# convert metadata_tbl to parquet if above checkpoints pass
arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))

# generate pseudobulk counts and quantile_normalised counts
if (isTRUE(pseudobulk)) {
file_id <- metadata_tbl$file_id |> unique() |> as.character()

cli_alert_info("Generating pseudobulk counts from {file_id}. ")
pseudobulk_counts <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised))

assay_name <- pseudobulk_counts |> assays() |> names()
normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
preprocessCore::normalize.quantiles.determine.target()

normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
method="preprocesscore_normalize_quantiles_use_target",
target_distribution = normalised_counts_best_distribution
)

assay(normalised_counts, assay_name) <- NULL
names(assays(normalised_counts)) <- "quantile_normalised"

if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
}

if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
}

path <- file.path(cache_dir, "pseudobulk")
pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))

saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path )
saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path )

cli_alert_info("pseudobulk are generated in {.path {path}}. ")
# Generate pseudobulk
if (isTRUE(pseudobulk)) sce_obj |> calculate_pseudobulk(cache_dir = cache_dir)
}


#' Generate pseudobulk counts and quantile_normalised counts
#' @param sce_data A SingleCellExperiment object from RDS, the metadata slot of which
#' must contain `cell_` and `dataset_id`
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store pseudobulk counts
#' @return Pseudobulk counts in `HDF5` format stored in the cache directory
#' @export
#' @importFrom S4Vectors metadata
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom S4Vectors metadata
#' @importFrom SummarizedExperiment assay assay<- assays
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
#' @importFrom HDF5Array saveHDF5SummarizedExperiment

calculate_pseudobulk <- function(sce_data,
cache_dir = get_default_cache_dir()) {
metadata_tbl <- metadata(sce_data)$data
file_id <- metadata_tbl$file_id |> unique() |> as.character()

if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
}

if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
}

cli_alert_info("Generating pseudobulk counts from {file_id}. ")
pseudobulk_counts <- sce_data |> aggregate_cells(c(sample_, cell_type_harmonised))

assay_name <- pseudobulk_counts |> assays() |> names()
normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
preprocessCore::normalize.quantiles.determine.target()

normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
method="preprocesscore_normalize_quantiles_use_target",
target_distribution = normalised_counts_best_distribution
)

assay(normalised_counts, assay_name) <- NULL
names(assays(normalised_counts)) <- "quantile_normalised"

path <- file.path(cache_dir, "pseudobulk")
pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))

saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path, replace = TRUE )
saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path , replace = TRUE)

cli_alert_info("pseudobulk are generated in {.path {path}}. ")
}


0 comments on commit 156d9d1

Please sign in to comment.