Skip to content

Commit 90e4388

Browse files
committed
better documentation
1 parent c661426 commit 90e4388

17 files changed

+346
-152
lines changed

R/calcConsensus.R

+12-3
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,30 @@
1-
#' Calculate a consensus matrix of segment ratios based on metadata
1+
#' Calculate a consensus matrix of segment means based on \code{colData}
22
#'
33
#' @param scCNA The scCNA object.
44
#' @param assay String with the name of the assay to pull data from to calculate
55
#' the consensus matrix.
6-
#' @param consensus_by The column from metadata that will be used
6+
#' @param consensus_by A string with the column from colData that will be used
77
#' to isolate the cells by factor and calculate the consensus.
8+
#' @param fun A string indicating the summarizing function to be used.
89
#' @param BPPARAM A \linkS4class{BiocParallelParam} specifying how the function
910
#' should be parallelized.
1011
#'
12+
#' @details Consensus profiles are calculated by averaging or taking the median
13+
#' of the ith segment mean of all single cells assigned to the same element of
14+
#' \code{\link[SummarizedExperiment]{colData}}.
15+
#'
1116
#' @return
1217
#' @export
1318
#'
1419
#' @examples
1520
calcConsensus <- function(scCNA,
1621
assay = "segment_ratios",
1722
consensus_by = "subclones",
23+
fun = c("median", "mean"),
1824
BPPARAM = bpparam()) {
25+
26+
fun <- match.arg(fun)
27+
1928
if (consensus_by == 'subclones' &
2029
is.null(SummarizedExperiment::colData(scCNA)$subclones)) {
2130
stop("Calculating consensus requires cluster information. use findClusters(scCNA)")
@@ -49,7 +58,7 @@ calcConsensus <- function(scCNA,
4958

5059
consensus_list <-
5160
BiocParallel::bplapply(long_list, function(x) {
52-
apply(x, 2, median)
61+
apply(x, 2, fun)
5362
}, BPPARAM = BPPARAM)
5463

5564
cs_df <- as.data.frame(t(do.call(rbind, consensus_list)))

R/calcRatios.R

+15-7
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,32 @@
11
#' Calculates the ratios from a matrix of counts
22
#'
33
#' @param scCNA The scCNA object
4-
#' @param assay The assay that will be used
5-
#' @param fun Character. Function used to calculate the ratios.
6-
#' Defaults to "median"
4+
#' @param assay String with the name of the assay to pull data from to calculate
5+
#' the ratios.
6+
#' @param fun A string indicating the summarizing function to be used.
77
#'
8-
#' @return A ratio matrix within the slot \code{assay(scCNA, 'ratios')}
9-
#' can be accessed with \code{copykit::ratios(scCNA)}.
8+
#' @details Calculates a sample-wise normalization of the selected assay by the
9+
#' mean bin counts and returns ratios where a value of 1 corresponds to the neutral
10+
#' copy number state of the sample.
11+
#'
12+
#' @return A ratio matrix within the slot assay(scCNA, 'ratios')
13+
#' that can be accessed with \code{ratios}.
1014
#' @export
1115
#'
1216
#' @importFrom SummarizedExperiment assay
1317
#'
1418
#' @examples
1519
calcRatios <- function(scCNA,
16-
assay = "ft",
17-
fun = "mean") {
20+
assay = c("ft", "bin_counts"),
21+
fun = c("mean", "median")) {
22+
1823
if (assay %!in% c("ft", "bin_counts")) {
1924
stop("Assay must be either 'ft' or 'bin_counts'")
2025
}
2126

27+
assay <- match.arg(assay)
28+
fun <- match.arg(fun)
29+
2230
counts <- SummarizedExperiment::assay(scCNA, assay)
2331

2432
ratios_df <- sweep(counts, 2, apply(counts, 2, fun), '/')

R/countBreakpoints.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#' Counting breakpoints from segment ratios
22
#'
3-
#' Considers changes in the segment ratios as breakpoints. Counts the breakpoints for each chromosome separately.
3+
#' Considers changes in the segment ratios as breakpoints.
4+
#' Counts the breakpoints for each chromosome separately.
45
#'
56
#' @param scCNA
67
#'

R/filterCells.R

+14-9
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
#' Filter noise cells
22
#'
3-
#' filterCells uses a k-nearest-neighbor approach to remove cells
4-
#' with random CNA profiles, largely due to noise data.
5-
#' It calculates a correlation matrix and sets a resolution
6-
#' below which non neighbors will be classified as noise cells.
3+
#' Uses a nearest neighbor approach to find noise copy number profiles within the
4+
#' segment means.
75
#'
86
#' @author Hua-Jun Wu
97
#' @author Darlan Conterno Minussi
108
#'
9+
#' @details \code{filterCells} calculates a correlation matrix across the segment
10+
#' means among all cells and takes the mean of its k-nearest neighbors correlation.
11+
#' A threshold (argument resolution) is used for the minimum acceptable mean
12+
#' correlation among the cell and its neighbors. Values below the set resolution
13+
#' will be classified as noise cells.
14+
#'
1115
#' @param scCNA scCNA object.
1216
#' @param assay String with the name of the assay to pull data from to filter cells.
13-
#' @param k K-nearest-neighbor, defaults to 5.
14-
#' @param resolution Set's how strict the correlation cut off will be. Defaults to 0.8.
17+
#' @param k A numeric scalar with the number of k-nearest-neighbor cells to calculate the
18+
#' mean correlation
19+
#' @param resolution A numeric scalar that sets how strict the correlation cut off will be.
1520
#'
16-
#' @return Adds a filtered cells label to the scCNA metadata.
21+
#' @return Adds a column named 'filtered' to \code{\link[SummarizedExperiment]{colData}}.
1722
#' Cells that pass the filtering criteria receive the label "kept",
1823
#' whereas cells that do not pass the filtering criteria
1924
#' receive the label "removed".
2025
#'
21-
#' @return Metadata can be accessed with \code{SummarizedExperiment::colData(scCNA)}
2226
#' @export
2327
#'
2428
#' @examples
@@ -29,6 +33,7 @@ filterCells <- function(scCNA,
2933
assay = 'segment_ratios',
3034
k = 5,
3135
resolution = 0.9) {
36+
3237
if (!is.numeric(resolution)) {
3338
stop("Resolution needs to be a number between 0 and 1")
3439
}
@@ -63,7 +68,7 @@ filterCells <- function(scCNA,
6368
cor < resolution ~ "removed"))
6469

6570
message(
66-
"Adding information to metadata. Access with SummarizedExperiment::colData(scCNA)."
71+
"Adding information to metadata. Access with colData(scCNA)."
6772
)
6873
if (identical(SummarizedExperiment::colData(scCNA)$sample,
6974
dst_knn_df$sample)) {

R/findClusters.R

+51-23
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,59 @@
11
#' Find Clusters
22
#'
3-
#' Search for clusters in the scCNA data by
4-
#' using a graph based approach. \code{findClusters()}
5-
#' builds an SNN graph of the k-nearest neighbors and
6-
#' attempts to find two different configuration of clusters.
7-
#' Major and minor subpopulations.
8-
#' Major clusters are found by looking at the graph connected components,
9-
#' whereas the minor clusters use the hdbscan or leiden
10-
#' algorithm to detect connected communities within the major clusters.
11-
#' \code{findClusters()} generates the graph by using the
12-
#' UMAP embedding that can be obtained after running \code{runUmap()}.
13-
#'
3+
#' Search for clusters in the scCNA data.
144
#'
155
#' @author Darlan Conterno Minussi
166
#'
177
#' @param scCNA scCNA object.
188
#' @param embedding String with the name of the reducedDim to pull data from.
19-
#' @param method Which method should be used for clustering,
20-
#' options are "hdbscan" or "leiden". Defaults to "hdbscan".
21-
#' @param k_superclones k-nearest-neighbor value.
22-
#' Used to find the major clusters.
23-
#' @param k_subclones k-nearest-neighbor value.
24-
#' Used to find the minor clusters
25-
#' @param seed Seed passed on to pseudorandom dependent functions (Defaults to 17).
9+
#' @param method A string with method used for clustering.
10+
#' @param k_superclones A numeric scalar k-nearest-neighbor value.
11+
#' Used to find the superclones.
12+
#' @param k_subclones A numeric scalar k-nearest-neighbor value.
13+
#' Used to find the subclones
14+
#' @param seed A numeric scalar seed passed on to pseudo-random dependent functions.
15+
#'
16+
#' @details \code{findClusters} uses the reduced dimensional embedding resulting
17+
#' from \code{\link{runUmap}} to perform clustering at two levels, hereafter referred
18+
#' to as superclones, and subclones. When clustering for superclones findClusters
19+
#' creates a graph representation of the dataset reduced dimension embedding
20+
#' using a shared nearest neighbor algorithm (SNN) \code{\link[scran]{buildSNNGraph}},
21+
#' from this graph the connected components are extracted and generally
22+
#' represent high-level structures that share large, lineage defining copy
23+
#' number events. At a more fine-grained resolution, CopyKit can also be
24+
#' used to detect subclones, i. e. groups of cells containing a unique
25+
#' copy number event per cluster, to do so the umap embedding is again
26+
#' used as the pre-processing step, this time to perform a density-based
27+
#' clustering with hdbscan \code{\link[dbscan]{hdbscan}}. Network clustering
28+
#' algorithms on top of the SNN graph such as the leiden algorithm
29+
#' \code{\link[leidenbase]{leiden_find_partition}} can be used as an alternative.
30+
#'
31+
#' \itemize{
32+
#' \item{hdbscan}: hdbscan is an outlier aware clustering algorithm, since
33+
#' extensive filtering of the dataset can be applied before clustering with
34+
#' \code{\link{filterCells}}, any cell classified as an outlier is inferred
35+
#' to the same cluster group as its closest, non-outlier, nearest-neighbor
36+
#' according to Euclidean distance.
37+
#' }
38+
#'
39+
#' @return Cluster information is added to \code{\link[SummarizedExperiment]{colData}}
40+
#' in columns superclones or subclones. Superclones are prefixed by 's' whereas subclones
41+
#' are prefixed by 'c'
42+
#'
43+
#' @seealso \code{\link{findSuggestedK}} to obtain suggestions of k_subclones values.
2644
#'
27-
#' @return Metadata cluster information that can be found in
28-
#' \code{SummarizedExperiment::colData(scCNA)$superclones}
29-
#' for the major clusters and \code{SummarizedExperiment::colData(scCNA)$subclones}
30-
#' for the minor clusters.
45+
#' @references Laks, E., McPherson, A., Zahn, H., et al. (2019). Clonal Decomposition
46+
#' and DNA Replication States Defined by Scaled Single-Cell Genome Sequencing.
47+
#' Cell, 179(5), 1207–1221.e22. https://doi.org/10.1016/j.cell.2019.10.026
48+
#'
49+
#' Leland McInnes and John Healy and James Melville. UMAP: Uniform Manifold
50+
#' Approximation and Projection for Dimension Reduction. arXiv:1802.03426
51+
#'
52+
#' Lun ATL, McCarthy DJ, Marioni JC (2016). “A step-by-step workflow for low-level
53+
#' analysis of single-cell RNA-seq data with Bioconductor.”
54+
#' F1000Res., 5, 2122. doi: 10.12688/f1000research.9501.2.
55+
#'
56+
#' @seealso \code{\link[dbscan]{hdbscan}} For hdbscan clustering.
3157
#'
3258
#' @export
3359
#' @import leidenbase
@@ -45,11 +71,13 @@
4571

4672
findClusters <- function(scCNA,
4773
embedding = "umap",
48-
method = "hdbscan",
74+
method = c("hdbscan", "leiden"),
4975
k_superclones = NULL,
5076
k_subclones = NULL,
5177
seed = 17) {
5278

79+
method <- match.arg(method)
80+
5381
# obtaining data from reducedDim slot
5482
if (!is.null(SingleCellExperiment::reducedDim(scCNA, embedding))) {
5583

R/findNormalCells.R

+23-8
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,30 @@
1-
#' identifies possible normal cells in the dataset
2-
#' based on coefficient of variation.
1+
#' findNormalCells
2+
#'
3+
#' Find cells that are not aneuploid in the dataset.
34
#'
45
#' @param scCNA scCNA object
56
#' @param assay String with the name of the assay to pull data from to find normal cells.
6-
#' @param resolution Numeric. Threshold which will be used to detect normal cells.
7-
#' @param remove_XY Boolean. Removes chrX and chrY from the analysis. Recommended.
8-
#' @param simul Add a simulated normal dataset to boost
9-
#' identifying normal cells when a dataset has a small proportion of those.
7+
#' @param resolution A numeric scalar used as threshold to detect normal cells. See details.
8+
#' @param remove_XY A boolean that removes chrX and chrY from the analysis. Recommended.
9+
#' @param simul A boolean that if TRUE adds a simulated normal dataset to boost
10+
#' identifying normal cells in datasets with small proportions of normal cells.
11+
#'
12+
#' @details performs a sample-wise calculation of the segment means coefficient
13+
#' of variation and fits a normal mixture model to the observed distribution
14+
#' from all cells. To increase the sensitivity of the model, the expected
15+
#' distribution of the coefficient of variation for diploid cells is simulated
16+
#' for a thousand cells (mean = 0, sd = 0.01). This way, CopyKit can adequately
17+
#' detect normal cells even in datasets with limited amounts of diploid cells
18+
#' and guarantees that no aneuploid cell will be removed from datasets without
19+
#' any normal cells. The distribution with the smallest coefficient of variation
20+
#' is assumed to be originating from normal cells. Cells are classified as normal
21+
#' if they have a coefficient of variation smaller than the mean plus five times
22+
#' the standard deviation of the normal cell distribution.
23+
#'
24+
#' @return information is added to \code{\link[SummarizedExperiment]{colData}}
25+
#' in a column named 'is_normal' being TRUE if a cell is detected as normal and
26+
#' FALSE if the cell is detected as aneuploid.
1027
#'
11-
#' @return Adds is_normal column to the scCNA metadata.
12-
#' Can be accessed with colData(scCNA)
1328
#' @export
1429
#'
1530
#' @importFrom tibble enframe

R/findSuggestedK.R

+35-15
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,37 @@
1-
#' Finds the suggested K value to be used for subclone clustering
1+
#' findSuggestedK
2+
#'
3+
#' Performs a grid search over a range of k values to assess cluster stability.
24
#'
35
#' @param scCNA scCNA object.
4-
#' @param embedding String with the name of the reducedDim to pull data from.
5-
#' @param k_range Range of values to be tested.
6-
#' Defaults to 7 to the sqrt of the number of cells
7-
#' @param method Method which where the values will be tested.
8-
#' Only "hdbscan" available.
9-
#' @param seed Seed (Defaults to 17).
10-
#' @param B Number of bootstrapping. Defaults to 100.
11-
#' Higher values yield better results at a cost of performance
6+
#' @param embedding String with the name of the reducedDim embedding to pull data from.
7+
#' @param k_range A numeric range of values to be tested.
8+
#' @param method A string with the method of clustering to be tested.
9+
#' @param seed A numerical scalar with a seed value to be passed on to
10+
#' \code{\link[uwot]{umap}}.
11+
#' @param B A numeric with the number of bootstrapping iterations passed on to
12+
#' \code{\link[fpc]{clusterboot}}. Higher values yield better results at a cost
13+
#' of performance
1214
#' @param BPPARAM A \linkS4class{BiocParallelParam} specifying how the function
1315
#' should be parallelized.
16+
#'
17+
#' @details performs a grid-search over a range of k values and returns the value
18+
#' that maximizes the jaccard similarity. Importantly, while this approach does
19+
#' not guarantee optimal clustering, it provides a guide that maximizes cluster
20+
#' stability.
21+
#'
1422
#' @return Adds a table with the mean jaccard coefficient of clusters for each
15-
#' tested k and the suggested k value to be used for clustering to the scCNA metadata.
23+
#' tested k and the suggested k value to be used for clustering to
24+
#' \code{\link[SummarizedExperiment]{metadata}}
25+
#'
26+
#' @seealso \code{\link[fpc]{clusterboot}}
27+
#'
28+
#' @references Hennig, C. (2007) Cluster-wise assessment of cluster stability.
29+
#' Computational Statistics and Data Analysis, 52, 258-271.
30+
#'
31+
#' Hennig, C. (2008) Dissolution point and isolation robustness: robustness
32+
#' criteria for general cluster analysis methods.
33+
#' Journal of Multivariate Analysis 99, 1154-1176.
34+
#'
1635
#' @export
1736
#'
1837
#' @importFrom fpc clusterboot
@@ -24,11 +43,12 @@
2443
#' @examples
2544
findSuggestedK <- function(scCNA,
2645
embedding = 'umap',
27-
k_range = 7:sqrt(ncol(segment_ratios(scCNA))),
28-
method = "hdbscan",
29-
seed = 17,
30-
B = 100,
31-
BPPARAM = bpparam()) {
46+
k_range = 7:sqrt(ncol(segment_ratios(scCNA))),
47+
method = "hdbscan",
48+
seed = 17,
49+
B = 200,
50+
BPPARAM = bpparam())
51+
{
3252

3353
# obtaining data from reducedDim slot
3454
if (is.null(SingleCellExperiment::reducedDim(scCNA, embedding))) {

0 commit comments

Comments
 (0)