diff --git a/R/util.R b/R/util.R index 1a33921..ba25275 100644 --- a/R/util.R +++ b/R/util.R @@ -16,6 +16,131 @@ adjustedRandIndex <- function (x, y) return(ARI) } +RARI <- function(x, y, dist_x = NULL, dist_y = NULL) + { + if (all(is.null(dist_x), is.null(dist_y))) { + return(mclust::adjustedRandIndex(x, y)) + } + if (xor(is.null(dist_x), is.null(dist_y))) { + stop("if specifying distance matrices, both dist_x and dist_y need non-NULL arguments") + } + x <- as.vector(x) + y <- as.vector(y) + if (length(x) != length(y)) { + stop("arguments must be vectors of the same length") + } + #------------------------------------------------------------------------------- + # Finding which objects in x and y belong to which cluster. This is needed to + # compute pairwise inter-cluster distances. + #------------------------------------------------------------------------------- + x <- as.numeric(factor(x)) + y <- as.numeric(factor(y)) + names(x) <- 1:length(x) + names(y) <- 1:length(y) + + clustIndexX <- lapply(sort(unique(x)), function(i){names(x[x %in% i])}) + clustIndexY <- lapply(sort(unique(y)), function(i){names(y[y %in% i])}) + #------------------------------------------------------------------------------- + # Coercing dist_x and dist_y into matrices for easier indexing. Setting the + # diagonal to NA to remove bias from the intra-cluster distance calculation. + #------------------------------------------------------------------------------- + dist_x <- as.matrix(dist_x) + dist_y <- as.matrix(dist_y) + rownames(dist_x) <- 1:length(x) + colnames(dist_x) <- 1:length(x) + rownames(dist_y) <- 1:length(y) + colnames(dist_y) <- 1:length(y) + diag(dist_x) <- NA + diag(dist_y) <- NA + #------------------------------------------------------------------------------- + # Separately for each cluster solution--x and y--, loop through each cluster and + # compute the average distance from each of the N objects to all objects in that + # cluster. + #------------------------------------------------------------------------------- + temp <- vector("numeric", length(x)) + avgClustDistX <- vector("list", length(clustIndexX)) + for (i in seq_along(clustIndexX)) { + for (j in seq_along(x)) { + temp[j] <- mean(dist_x[j, clustIndexX[[i]]], na.rm = TRUE) + } + avgClustDistX[[i]] <- temp + } + temp <- vector("numeric", length(y)) + avgClustDistY <- vector("list", length(clustIndexY)) + for (i in seq_along(clustIndexY)) { + for (j in seq_along(y)) { + temp[j] <- mean(dist_y[j, clustIndexY[[i]]], na.rm = TRUE) + } + avgClustDistY[[i]] <- temp + } + avgClustDistX <- data.frame(do.call(cbind, avgClustDistX)) + avgClustDistY <- data.frame(do.call(cbind, avgClustDistY)) + avgClustDistX$x <- x + avgClustDistY$y <- y + #------------------------------------------------------------------------------- + # Computing the matrix of average intra/inter distances between clusters, then + # ranking these distances. If two or more clusters are equidistant from a third + # cluster, the "min" rank method will collapse these clusters in the rank mismatch + # matrix as described on page 4 in Pinto et al. (2007). + #------------------------------------------------------------------------------- + clustDistMatX <- do.call(rbind, lapply(1:length(clustIndexX), function(i){tapply(avgClustDistX[, i], avgClustDistX$x, mean)})) + clustDistMatY <- do.call(rbind, lapply(1:length(clustIndexY), function(i){tapply(avgClustDistY[, i], avgClustDistY$y, mean)})) + clustDistMatX <- apply(clustDistMatX, 2, rank, ties.method = "min") - 1 # subtracting 1 for easier indexing. + clustDistMatY <- apply(clustDistMatY, 2, rank, ties.method = "min") - 1 # subtracting 1 for easier indexing. + #------------------------------------------------------------------------------- + # Computing all object-level pairwise differences in cluster distance ranks and + # setting the resulting N by N diagonal to NA to remove the bias in calculating + # an object's distance from itself. + #------------------------------------------------------------------------------- + clustDistRankX <- sapply(1:length(x), function(i){sapply(1:length(x), function(j){clustDistMatX[x[i], x[j]]})}) + clustDistRankY <- sapply(1:length(y), function(i){sapply(1:length(y), function(j){clustDistMatY[y[i], y[j]]})}) + diag(clustDistRankX) <- NA + diag(clustDistRankY) <- NA + #------------------------------------------------------------------------------- + # rmm is the rank mismatch matrix in Table 3 of Pinto et al. (2007) indexed + # from 0 to number of clusters - 1. + #------------------------------------------------------------------------------- + clustDistRankX <- as.vector(clustDistRankX) # collapsing the data for the table function. + clustDistRankY <- as.vector(clustDistRankY) # collapsing the data for the table function. + + rmm <- table(clustDistRankX, clustDistRankY) + #------------------------------------------------------------------------------- + # mdd is the mean diagonal deviance from eqn. 5 in Pinto et al. (2007). The 'for' + # loop calculates the numerator of this eqn. Note: The Rand index = 1 - mdd. + #------------------------------------------------------------------------------- + temp <- vector("numeric", ncol(rmm)) + mddNum <- vector("list", nrow(rmm)) + for (i in 1:nrow(rmm)) { + for (j in 1:ncol(rmm)) { + temp[j] <- rmm[i, j] * abs(i / nrow(rmm) - j / ncol(rmm)) + } + mddNum[[i]] <- temp + } + mdd <- sum(unlist(mddNum)) / (length(x)^2 - length(x)) + #------------------------------------------------------------------------------- + # rmmi is the expected value of the rank mismatch matrix under independence: + # eqn. 6 in Pinto et al. (2007). + #------------------------------------------------------------------------------- + rmmi <- suppressWarnings(chisq.test(x = matrix(rmm, nrow = length(rownames(rmm))), correct = FALSE)$expected) + #------------------------------------------------------------------------------- + # mddi is the mean diagonal deviance under independence (eqn. 5). RARI is the + # ranked adjusted Rand index: eqn. 7 in Pinto et al. (2007). + #------------------------------------------------------------------------------- + temp <- vector("numeric", ncol(rmmi)) + mddNum <- vector("list", nrow(rmmi)) + for (i in 1:nrow(rmmi)) { + for (j in 1:ncol(rmmi)) { + temp[j] <- rmmi[i, j] * abs(i / nrow(rmmi) - j / ncol(rmmi)) + } + mddNum[[i]] <- temp + } + mddi <- sum(unlist(mddNum)) / (length(x)^2 - length(x)) + + RARI <- (mddi - mdd) / mddi + + return(RARI) +} + classError <- function(classification, truth) { q <- function(map, len, x) @@ -1191,4 +1316,4 @@ as.densityMclust.Mclust <- function(x, ...) x$density <- dens(modelName = x$modelName, data = x$data, parameters = x$parameters, logarithm = FALSE) return(x) -} \ No newline at end of file +} diff --git a/man/RARI.rd b/man/RARI.rd new file mode 100644 index 0000000..bb2f77e --- /dev/null +++ b/man/RARI.rd @@ -0,0 +1,59 @@ +\name{RARI} +\alias{RARI} +\title{ + Ranked Adjusted Rand Index +} +\description{ + Computes the ranked adjusted Rand index comparing two classifications with supplementary distance matrices. +} +\usage{ +RARI(x, y, dist_x, dist_y) +} +\arguments{ + \item{x}{ + A numeric or character vector of class labels. + } + \item{y}{ + A numeric or character vector of class labels. + The length of \code{y} should be the same as that of \code{x}. + } + \item{dist_x}{ + A distance matrix of class 'dist' or 'matrix' for \code{x} class labels. + } + \item{dist_y}{ + A distance matrix of class 'dist' or 'matrix' for \code{y} class labels. + } +} +\value{ + The ranked adjusted Rand index comparing the two partitions and their distance matrices (a scalar). + This index has zero expected value only in the case of (a) random partition, (b) when in one clustering + all entities are in the same clustering and in the other, every entitity is in its own cluster, and (c) + all clusters are equidistant from each other. The ranked adjusted Rand index is bounded above by 1 in the + case of (a) perfect agreement between two partitions and (b) equally ranked relative distances between clusters. + + If the arguments \code{dist_x} and \code{dist_y} are \code{NULL}, the output is identical to the adjusted Rand index. +} + +\section{References}{ + Pinto, F.R., Carrico, J.A., Ramirez, M., and Almeida, J.S. (2007). Ranked Adjusted + Rand: integrating distance and partition information in a measure of clustering + agreement. \emph{BMC Bioinformatics 8:44}. doi: 10.1186/1471-2105-8-44 +} +\seealso{ + \code{\link{classError}}, + \code{\link{mapClass}}, + \code{\link{table}} +} +\examples{ +data(iris) +x <- iris$Species +y <- iris$Species +dist_x <- dist(iris[, 1:4]) # Using all measures in the distance matrix +dist_y <- dist(iris[, 2]) # Using only Sepal.Width in the distance matrix + +adjustedRandIndex(x, y) +RARI(x, y, dist_x, dist_y) +} +\keyword{cluster} +% docclass is function +% Converted by Sd2Rd version 1.21.