update cli message

stemangiola · Oct 24, 2024 · 156d9d1 · 156d9d1
1 parent 6a0ec5b
commit 156d9d1
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 51 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -127,7 +127,7 @@ biocViews:
     Transcription,
     Transcriptomics
 Encoding: UTF-8
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 LazyDataCompression: xz
 URL: https://github.com/stemangiola/CuratedAtlasQueryR
 BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 S3method(as.sparse,DelayedMatrix)
 export(SAMPLE_DATABASE_URL)
+export(calculate_pseudobulk)
 export(get_SingleCellExperiment)
 export(get_database_url)
 export(get_default_cache_dir)
@@ -26,6 +27,7 @@ importFrom(SingleCellExperiment,"reducedDims<-")
 importFrom(SingleCellExperiment,SingleCellExperiment)
 importFrom(SingleCellExperiment,reducedDims)
 importFrom(SingleCellExperiment,rowData)
+importFrom(SummarizedExperiment,"assay<-")
 importFrom(SummarizedExperiment,"assayNames<-")
 importFrom(SummarizedExperiment,"assays<-")
 importFrom(SummarizedExperiment,"colData<-")

diff --git a/R/counts_per_million.R b/R/counts_per_million.R
@@ -12,7 +12,7 @@
 get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
 
   # Save SCE to the cache directory counts folder
-  input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)
+  input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir, replace=TRUE)
 
   data <- input_sce_obj
 
@@ -39,6 +39,6 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
   # Check if there is a memory issue 
   assays(sce) <- assays(sce) |> map(DelayedArray::realize)
 
-  sce |> saveHDF5SummarizedExperiment(output_dir)
+  sce |> saveHDF5SummarizedExperiment(output_dir, replace = TRUE)
 } 
 
diff --git a/R/import_metadata_and_counts.R b/R/import_metadata_and_counts.R
@@ -13,14 +13,13 @@
 #' Directories store counts and counts per million in the provided cache directory.
 #' @importFrom checkmate check_true check_character check_subset assert
 #' @importFrom dplyr select distinct pull
-#' @importFrom cli cli_alert_info
+#' @importFrom cli cli_alert_info cli_alert_warning
 #' @importFrom rlang .data
 #' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
 #' @importFrom S4Vectors metadata metadata<-
-#' @importFrom SummarizedExperiment assay
+#' @importFrom SummarizedExperiment assay assay<-
 #' @importFrom stringr str_detect
-#' @importFrom tidySingleCellExperiment aggregate_cells
-#' @importFrom tidybulk quantile_normalise_abundance
+#' @importFrom HDF5Array saveHDF5SummarizedExperiment
 #' @examples
 #' data(sample_sce_obj)
 #' import_one_sce(sample_sce_obj,
@@ -74,8 +73,7 @@ import_one_sce <- function(
   if (isTRUE(pseudobulk)) {
     assert(
       all(pseudobulk_sample %in% (colData(sce_obj) |> colnames()) ),
-      "Character in pseudobulk_sample must contain sample_ and cell_type_harmonised columns 
-      in the SingleCellExperiment column metadata")
+      "Sample_ and cell_type_harmonised columns must be in the SingleCellExperiment colData")
 
     assert(c(pseudobulk_sample, "file_id") %in% (names(metadata_tbl)) |> all() ,
            "SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised,
@@ -92,10 +90,14 @@ import_one_sce <- function(
   }
 
   # Check whether count H5 directory has been generated
-  all(!file_id_db %in% dir(original_dir)) |>
-    check_true() |>
-    assert("The filename for count assay (file_id_db) already exists in the cache directory.")
-
+  if (any(file_id_db %in% dir(original_dir))) {
+    cli_alert_warning(
+      single_line_str(
+        "Import API says: The filename for count assay (file_id_db) already exists in the cache directory. "
+      )
+    )
+  }
+
   # Check the metadata contains cell_, file_id_db, sample_ with correct types
   check_true("cell_" %in% colnames(metadata_tbl))
   check_true("file_id_db" %in% names(metadata_tbl)) 
@@ -107,9 +109,14 @@ import_one_sce <- function(
 
   # Check cell_ values are not duplicated when join with parquet
   cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
-  (!any(metadata_tbl$cell_ %in% cells$cell_)) |> 
-    assert("Cell names (cell_) should not clash with cells that already exist in the atlas.")
-
+  if ((any(metadata_tbl$cell_ %in% cells$cell_))) 
+    cli_alert_warning(
+      single_line_str(
+        "Import API says:
+        Cells in your SingleCellExperiment already exists in the atlas."
+      )
+    )
+
   # Check age_days is either -99 or greater than 365
   if (any(colnames(metadata_tbl) == "age_days")) {
     assert(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
@@ -139,43 +146,63 @@ import_one_sce <- function(
   # convert metadata_tbl to parquet if above checkpoints pass
   arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))
 
-  # generate pseudobulk counts and quantile_normalised counts
-  if (isTRUE(pseudobulk)) {
-    file_id <- metadata_tbl$file_id |> unique() |> as.character()
-
-    cli_alert_info("Generating pseudobulk counts from {file_id}. ")
-    pseudobulk_counts <- sce_obj |> aggregate_cells(c(sample_, cell_type_harmonised)) 
-
-    assay_name <- pseudobulk_counts |> assays() |> names()
-    normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
-      preprocessCore::normalize.quantiles.determine.target()
-
-    normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
-      method="preprocesscore_normalize_quantiles_use_target",
-      target_distribution = normalised_counts_best_distribution
-    )
-
-    assay(normalised_counts, assay_name) <- NULL
-    names(assays(normalised_counts)) <- "quantile_normalised"
-
-    if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
-      cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
-    }
-
-    if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
-      cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
-    }
-
-    path <- file.path(cache_dir, "pseudobulk")
-    pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
-    pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))
-
-    saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path )
-    saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path )
-
-    cli_alert_info("pseudobulk are generated in {.path {path}}. ")
+  # Generate pseudobulk
+  if (isTRUE(pseudobulk)) sce_obj |> calculate_pseudobulk(cache_dir = cache_dir)
+}
+
+
+#' Generate pseudobulk counts and quantile_normalised counts
+#' @param sce_data A SingleCellExperiment object from RDS, the metadata slot of which
+#' must contain `cell_` and `dataset_id`
+#' @param cache_dir Optional character vector of length 1. A file path on
+#'   your local system to a directory (not a file) that will be used to store pseudobulk counts
+#' @return Pseudobulk counts in `HDF5` format stored in the cache directory
+#' @export
+#' @importFrom S4Vectors metadata
+#' @importFrom dplyr select distinct pull
+#' @importFrom cli cli_alert_info cli_alert_warning
+#' @importFrom S4Vectors metadata
+#' @importFrom SummarizedExperiment assay assay<- assays
+#' @importFrom tidySingleCellExperiment aggregate_cells
+#' @importFrom tidybulk quantile_normalise_abundance
+#' @importFrom HDF5Array saveHDF5SummarizedExperiment
+
+calculate_pseudobulk <- function(sce_data,
+                                 cache_dir = get_default_cache_dir()) {
+  metadata_tbl <- metadata(sce_data)$data
+  file_id <- metadata_tbl$file_id |> unique() |> as.character()
+
+  if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
+    cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
+  }
+
+  if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
+    cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
   }
 
+  cli_alert_info("Generating pseudobulk counts from {file_id}. ")
+  pseudobulk_counts <- sce_data |> aggregate_cells(c(sample_, cell_type_harmonised)) 
+
+  assay_name <- pseudobulk_counts |> assays() |> names()
+  normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
+    preprocessCore::normalize.quantiles.determine.target()
+
+  normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
+    method="preprocesscore_normalize_quantiles_use_target",
+    target_distribution = normalised_counts_best_distribution
+  )
+
+  assay(normalised_counts, assay_name) <- NULL
+  names(assays(normalised_counts)) <- "quantile_normalised"
+
+  path <- file.path(cache_dir, "pseudobulk")
+  pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
+  pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))
+
+  saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path, replace = TRUE )
+  saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path , replace = TRUE)
+
+  cli_alert_info("pseudobulk are generated in {.path {path}}. ")
 }