navinlabcode
diff --git a/‎R/calcConsensus.R
+12-3 b/‎R/calcConsensus.R
+12-3
diff --git a/‎R/calcRatios.R
+15-7 b/‎R/calcRatios.R
+15-7
diff --git a/‎R/countBreakpoints.R
+2-1 b/‎R/countBreakpoints.R
+2-1
diff --git a/‎R/filterCells.R
+14-9 b/‎R/filterCells.R
+14-9
diff --git a/‎R/findClusters.R
+51-23 b/‎R/findClusters.R
+51-23
diff --git a/‎R/findNormalCells.R
+23-8 b/‎R/findNormalCells.R
+23-8
diff --git a/‎R/findSuggestedK.R
+35-15 b/‎R/findSuggestedK.R
+35-15
@@ -1,21 +1,30 @@
-#' Calculate a consensus matrix of segment ratios based on metadata
+#' Calculate a consensus matrix of segment means based on \code{colData}
 #'
 #' @param scCNA The scCNA object.
 #' @param assay String with the name of the assay to pull data from to calculate
 #' the consensus matrix.
-#' @param consensus_by The column from metadata that will be used
+#' @param consensus_by A string with the column from colData that will be used
 #'  to isolate the cells by factor and calculate the consensus.
+#' @param fun A string indicating the summarizing function to be used.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} specifying how the function
 #' should be parallelized.
 #'
+#' @details Consensus profiles are calculated by averaging or taking the median
+#'  of the ith segment mean of all single cells assigned to the same element of
+#'  \code{\link[SummarizedExperiment]{colData}}.
+#'
 #' @return
 #' @export
 #'
 #' @examples
 calcConsensus <- function(scCNA,
                           assay = "segment_ratios",
                           consensus_by = "subclones",
+                          fun = c("median", "mean"),
                           BPPARAM = bpparam()) {
+
+  fun <- match.arg(fun)
+
   if (consensus_by == 'subclones' &
       is.null(SummarizedExperiment::colData(scCNA)$subclones)) {
     stop("Calculating consensus requires cluster information. use findClusters(scCNA)")
@@ -49,7 +58,7 @@ calcConsensus <- function(scCNA,
 
   consensus_list <-
     BiocParallel::bplapply(long_list, function(x) {
-      apply(x, 2, median)
+      apply(x, 2, fun)
     }, BPPARAM = BPPARAM)
 
   cs_df <- as.data.frame(t(do.call(rbind, consensus_list)))
 
@@ -1,24 +1,32 @@
 #' Calculates the ratios from a matrix of counts
 #'
 #' @param scCNA The scCNA object
-#' @param assay The assay that will be used
-#' @param fun Character. Function used to calculate the ratios.
-#' Defaults to "median"
+#' @param assay String with the name of the assay to pull data from to calculate
+#' the ratios.
+#' @param fun A string indicating the summarizing function to be used.
 #'
-#' @return A ratio matrix within the slot \code{assay(scCNA, 'ratios')}
-#' can be accessed with \code{copykit::ratios(scCNA)}.
+#' @details Calculates a sample-wise normalization of the selected assay by the
+#' mean (or median, see \code{fun}) bin counts and returns ratios where a value
+#' of 1 corresponds to the neutral copy number state of the sample.
+#'
+#' @return A ratio matrix stored in the slot \code{assay(scCNA, 'ratios')},
+#' which can be accessed with \code{ratios}.
 #' @export
 #'
 #' @importFrom SummarizedExperiment assay
 #'
 #' @examples
 calcRatios <- function(scCNA,
-                       assay = "ft",
-                       fun = "mean") {
+                       assay = c("ft", "bin_counts"),
+                       fun = c("mean", "median")) {
+
   if (assay %!in% c("ft", "bin_counts")) {
     stop("Assay must be either 'ft' or 'bin_counts'")
   }
 
+  assay <- match.arg(assay)
+  fun <- match.arg(fun)
+
   counts <- SummarizedExperiment::assay(scCNA, assay)
 
   ratios_df <- sweep(counts, 2, apply(counts, 2, fun), '/')
 
@@ -1,6 +1,7 @@
 #' Counting breakpoints from segment ratios
 #'
-#' Considers changes in the segment ratios as breakpoints. Counts the breakpoints for each chromosome separately.
+#' Considers changes in the segment ratios as breakpoints.
+#' Counts the breakpoints for each chromosome separately.
 #'
 #' @param scCNA
 #'
 
@@ -1,24 +1,28 @@
 #' Filter noise cells
 #'
-#' filterCells uses a k-nearest-neighbor approach to remove cells
-#' with random CNA profiles, largely due to noise data.
-#' It calculates a correlation matrix and sets a resolution
-#' below which non neighbors will be classified as noise cells.
+#' Uses a nearest neighbor approach to find noise copy number profiles within the
+#' segment means.
 #'
 #' @author Hua-Jun Wu
 #' @author Darlan Conterno Minussi
 #'
+#' @details \code{filterCells} calculates a correlation matrix across the segment
+#' means among all cells and takes the mean of its k-nearest neighbors correlation.
+#' A threshold (argument resolution) is used for the minimum acceptable mean
+#' correlation among the cell and its neighbors. Values below the set resolution
+#' will be classified as noise cells.
+#'
 #' @param scCNA scCNA object.
 #' @param assay String with the name of the assay to pull data from to filter cells.
-#' @param k K-nearest-neighbor, defaults to 5.
-#' @param resolution Set's how strict the correlation cut off will be. Defaults to 0.8.
+#' @param k A numeric scalar with the number of k-nearest-neighbor cells used to
+#' calculate the mean correlation.
+#' @param resolution A numeric scalar that sets how strict the correlation cut off will be.
 #'
-#' @return Adds a filtered cells label to the scCNA metadata.
+#' @return Adds a column named 'filtered' to \code{\link[SummarizedExperiment]{colData}}.
 #' Cells that pass the filtering criteria receive the label "kept",
 #' whereas cells that do not pass the filtering criteria
 #' receive the label "removed".
 #'
-#' @return Metadata can be accessed with \code{SummarizedExperiment::colData(scCNA)}
 #' @export
 #'
 #' @examples
@@ -29,6 +33,7 @@ filterCells <- function(scCNA,
                         assay = 'segment_ratios',
                         k = 5,
                         resolution = 0.9) {
+
   if (!is.numeric(resolution)) {
     stop("Resolution needs to be a number between 0 and 1")
   }
@@ -63,7 +68,7 @@ filterCells <- function(scCNA,
                                               cor < resolution ~ "removed"))
 
   message(
-    "Adding information to metadata. Access with SummarizedExperiment::colData(scCNA)."
+    "Adding information to metadata. Access with colData(scCNA)."
   )
   if (identical(SummarizedExperiment::colData(scCNA)$sample,
                 dst_knn_df$sample)) {
 
@@ -1,33 +1,59 @@
 #' Find Clusters
 #'
-#' Search for clusters in the scCNA data by
-#' using a graph based approach. \code{findClusters()}
-#' builds an SNN graph of the k-nearest neighbors and
-#' attempts to find two different configuration of clusters.
-#' Major and minor subpopulations.
-#' Major clusters are found by looking at the graph connected components,
-#'  whereas the minor clusters use the hdbscan or leiden
-#'   algorithm to detect connected communities within the major clusters.
-#' \code{findClusters()} generates the graph by using the
-#' UMAP embedding that can be obtained after running \code{runUmap()}.
-#'
+#' Search for clusters in the scCNA data.
 #'
 #' @author Darlan Conterno Minussi
 #'
 #' @param scCNA scCNA object.
 #' @param embedding String with the name of the reducedDim to pull data from.
-#' @param method Which method should be used for clustering,
-#' options are "hdbscan" or "leiden". Defaults to "hdbscan".
-#' @param k_superclones k-nearest-neighbor value.
-#' Used to find the major clusters.
-#' @param k_subclones k-nearest-neighbor value.
-#' Used to find the minor clusters
-#' @param seed Seed passed on to pseudorandom dependent functions (Defaults to 17).
+#' @param method A string with method used for clustering.
+#' @param k_superclones A numeric scalar k-nearest-neighbor value.
+#' Used to find the superclones.
+#' @param k_subclones A numeric scalar k-nearest-neighbor value.
+#' Used to find the subclones
+#' @param seed A numeric scalar seed passed on to pseudo-random dependent functions.
+#'
+#' @details \code{findClusters} uses the reduced dimensional embedding resulting
+#'  from \code{\link{runUmap}} to perform clustering at two levels, hereby referred
+#'  to as superclones, and subclones. When clustering for superclones findClusters
+#'  creates a graph representation of the dataset reduced dimension embedding
+#'  using a shared nearest neighbor algorithm (SNN) \code{\link[scran]{buildSNNGraph}},
+#'  from this graph the connected components are extracted and generally
+#'  represent high-level structures that share large, lineage defining copy
+#'  number events. At a more fine-grained resolution, CopyKit can also be
+#'  used to detect subclones, i.e., groups of cells containing a unique
+#'  copy number event per cluster. To do so, the umap embedding is again
+#'  used as the pre-processing step, this time to perform a density-based
+#'  clustering with hdbscan \code{\link[dbscan]{hdbscan}}. Network clustering
+#'  algorithms on top of the SNN graph, such as the leiden algorithm
+#'  \code{\link[leidenbase]{leiden_find_partition}}, are also available.
+#'
+#'  \itemize{
+#'  \item{hdbscan}: hdbscan is an outlier aware clustering algorithm, since
+#'  extensive filtering of the dataset can be applied before clustering with
+#'  \code{\link{filterCells}}, any cell classified as an outlier is inferred
+#'  to the same cluster group as its closest, non-outlier, nearest-neighbor
+#'   according to Euclidean distance.
+#'  }
+#'
+#' @return Cluster information is added to \code{\link[SummarizedExperiment]{colData}}
+#' in columns superclones or subclones. Superclones are prefixed by 's' whereas subclones
+#' are prefixed by 'c'.
+#'
+#' @seealso \code{\link{findSuggestedK}} to obtain suggestions of k_subclones values.
 #'
-#' @return Metadata cluster information that can be found in
-#' \code{SummarizedExperiment::colData(scCNA)$superclones}
-#' for the major clusters and \code{SummarizedExperiment::colData(scCNA)$subclones}
-#' for the minor clusters.
+#' @references Laks, E., McPherson, A., Zahn, H., et al. (2019). Clonal Decomposition
+#' and DNA Replication States Defined by Scaled Single-Cell Genome Sequencing.
+#' Cell, 179(5), 1207–1221.e22. https://doi.org/10.1016/j.cell.2019.10.026
+#'
+#' Leland McInnes and John Healy and James Melville. UMAP: Uniform Manifold
+#' Approximation and Projection for Dimension Reduction. arXiv:1802.03426
+#'
+#' Lun ATL, McCarthy DJ, Marioni JC (2016). “A step-by-step workflow for low-level
+#' analysis of single-cell RNA-seq data with Bioconductor.”
+#' F1000Res., 5, 2122. doi: 10.12688/f1000research.9501.2.
+#'
+#' @seealso \code{\link[dbscan]{hdbscan}} For hdbscan clustering.
 #'
 #' @export
 #' @import leidenbase
@@ -45,11 +71,13 @@
 
 findClusters <- function(scCNA,
                          embedding = "umap",
-                         method = "hdbscan",
+                         method = c("hdbscan", "leiden"),
                          k_superclones = NULL,
                          k_subclones = NULL,
                          seed = 17) {
 
+  method <- match.arg(method)
+
   # obtaining data from reducedDim slot
   if (!is.null(SingleCellExperiment::reducedDim(scCNA, embedding))) {
 
 
@@ -1,15 +1,30 @@
-#' identifies possible normal cells in the dataset
-#'  based on coefficient of variation.
+#' findNormalCells
+#'
+#' Find cells that are not aneuploid in the dataset.
 #'
 #' @param scCNA scCNA object
 #' @param assay String with the name of the assay to pull data from to find normal cells.
-#' @param resolution Numeric. Threshold which will be used to detect normal cells.
-#' @param remove_XY Boolean. Removes chrX and chrY from the analysis. Recommended.
-#' @param simul Add a simulated normal dataset to boost
-#' identifying normal cells when a dataset has a small proportion of those.
+#' @param resolution A numeric scalar used as threshold to detect normal cells. See details.
+#' @param remove_XY A boolean that removes chrX and chrY from the analysis. Recommended.
+#' @param simul A boolean that if TRUE adds a simulated normal dataset to boost
+#' identifying normal cells in datasets with small proportions of normal cells.
+#'
+#' @details Performs a sample-wise calculation of the segment means coefficient
+#'  of variation and fits a normal mixture model to the observed distribution
+#'  from all cells. To increase the sensitivity of the model, the expected
+#'  distribution of the coefficient of variation for diploid cells is simulated
+#'  for a thousand cells (mean = 0, sd = 0.01). This way, CopyKit can adequately
+#'  detect normal cells even in datasets with limited amounts of diploid cells
+#'  and guarantees that no aneuploid cell will be removed from datasets without
+#'  any normal cells. The distribution with the smallest coefficient of variation
+#'  is assumed to be originating from normal cells. Cells are classified as normal
+#'  if they have a coefficient of variation smaller than the mean plus five times
+#'  the standard deviation of the normal cell distribution.
+#'
+#' @return Information is added to \code{\link[SummarizedExperiment]{colData}}
+#' in a column named 'is_normal', being TRUE if a cell is detected as normal and
+#' FALSE if the cell is detected as aneuploid.
 #'
-#' @return Adds is_normal column to the scCNA metadata.
-#' Can be accessed with colData(scCNA)
 #' @export
 #'
 #' @importFrom tibble enframe
 
@@ -1,18 +1,37 @@
-#' Finds the suggested K value to be used for subclone clustering
+#' findSuggestedK
+#'
+#' Performs a grid search over a range of k values to assess cluster stability.
 #'
 #' @param scCNA  scCNA object.
-#' @param embedding String with the name of the reducedDim to pull data from.
-#' @param k_range Range of values to be tested.
-#'  Defaults to 7 to the sqrt of the number of cells
-#' @param method Method which where the values will be tested.
-#' Only "hdbscan" available.
-#' @param seed Seed (Defaults to 17).
-#' @param B Number of bootstrapping. Defaults to 100.
-#' Higher values yield better results at a cost of performance
+#' @param embedding String with the name of the reducedDim embedding to pull data from.
+#' @param k_range A numeric range of values to be tested.
+#' @param method A string with the method of clustering to be tested.
+#' @param seed A numerical scalar with a seed value to be passed on to
+#' \code{\link[uwot]{umap}}.
+#' @param B A numeric with the number of bootstrapping iterations passed on to
+#' \code{\link[fpc]{clusterboot}}. Higher values yield better results at a cost
+#' of performance
 #' @param BPPARAM A \linkS4class{BiocParallelParam} specifying how the function
 #' should be parallelized.
+#'
+#' @details Performs a grid-search over a range of k values and returns the value
+#' that maximizes the Jaccard similarity. Importantly, while this approach does
+#' not guarantee optimal clustering, it provides a guide that maximizes cluster
+#' stability.
+#'
 #' @return Adds a table with the mean jaccard coefficient of clusters for each
-#' tested k and the suggested k value to be used for clustering to the scCNA metadata.
+#' tested k and the suggested k value to be used for clustering to
+#' \code{\link[SummarizedExperiment]{metadata}}
+#'
+#' @seealso \code{\link[fpc]{clusterboot}}
+#'
+#' @references Hennig, C. (2007) Cluster-wise assessment of cluster stability.
+#' Computational Statistics and Data Analysis, 52, 258-271.
+#'
+#' Hennig, C. (2008) Dissolution point and isolation robustness: robustness
+#' criteria for general cluster analysis methods.
+#' Journal of Multivariate Analysis 99, 1154-1176.
+#'
 #' @export
 #'
 #' @importFrom fpc clusterboot
@@ -24,11 +43,12 @@
 #' @examples
 findSuggestedK <- function(scCNA,
                            embedding = 'umap',
-                         k_range = 7:sqrt(ncol(segment_ratios(scCNA))),
-                         method = "hdbscan",
-                         seed = 17,
-                         B = 100,
-                         BPPARAM = bpparam()) {
+                           k_range = 7:sqrt(ncol(segment_ratios(scCNA))),
+                           method = "hdbscan",
+                           seed = 17,
+                           B = 200,
+                           BPPARAM = bpparam())
+{
 
   # obtaining data from reducedDim slot
   if (is.null(SingleCellExperiment::reducedDim(scCNA, embedding))) {