Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 126 additions & 1 deletion R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,131 @@ adjustedRandIndex <- function (x, y)
return(ARI)
}

RARI <- function(x, y, dist_x = NULL, dist_y = NULL)
{
if (all(is.null(dist_x), is.null(dist_y))) {
return(mclust::adjustedRandIndex(x, y))
}
if (xor(is.null(dist_x), is.null(dist_y))) {
stop("if specifying distance matrices, both dist_x and dist_y need non-NULL arguments")
}
x <- as.vector(x)
y <- as.vector(y)
if (length(x) != length(y)) {
stop("arguments must be vectors of the same length")
}
#-------------------------------------------------------------------------------
# Finding which objects in x and y belong to which cluster. This is needed to
# compute pairwise inter-cluster distances.
#-------------------------------------------------------------------------------
x <- as.numeric(factor(x))
y <- as.numeric(factor(y))
names(x) <- 1:length(x)
names(y) <- 1:length(y)

clustIndexX <- lapply(sort(unique(x)), function(i){names(x[x %in% i])})
clustIndexY <- lapply(sort(unique(y)), function(i){names(y[y %in% i])})
#-------------------------------------------------------------------------------
# Coercing dist_x and dist_y into matrices for easier indexing. Setting the
# diagonal to NA to remove bias from the intra-cluster distance calculation.
#-------------------------------------------------------------------------------
dist_x <- as.matrix(dist_x)
dist_y <- as.matrix(dist_y)
rownames(dist_x) <- 1:length(x)
colnames(dist_x) <- 1:length(x)
rownames(dist_y) <- 1:length(y)
colnames(dist_y) <- 1:length(y)
diag(dist_x) <- NA
diag(dist_y) <- NA
#-------------------------------------------------------------------------------
# Separately for each cluster solution--x and y--, loop through each cluster and
# compute the average distance from each of the N objects to all objects in that
# cluster.
#-------------------------------------------------------------------------------
temp <- vector("numeric", length(x))
avgClustDistX <- vector("list", length(clustIndexX))
for (i in seq_along(clustIndexX)) {
for (j in seq_along(x)) {
temp[j] <- mean(dist_x[j, clustIndexX[[i]]], na.rm = TRUE)
}
avgClustDistX[[i]] <- temp
}
temp <- vector("numeric", length(y))
avgClustDistY <- vector("list", length(clustIndexY))
for (i in seq_along(clustIndexY)) {
for (j in seq_along(y)) {
temp[j] <- mean(dist_y[j, clustIndexY[[i]]], na.rm = TRUE)
}
avgClustDistY[[i]] <- temp
}
avgClustDistX <- data.frame(do.call(cbind, avgClustDistX))
avgClustDistY <- data.frame(do.call(cbind, avgClustDistY))
avgClustDistX$x <- x
avgClustDistY$y <- y
#-------------------------------------------------------------------------------
# Computing the matrix of average intra/inter distances between clusters, then
# ranking these distances. If two or more clusters are equidistant from a third
# cluster, the "min" rank method will collapse these clusters in the rank mismatch
# matrix as described on page 4 in Pinto et al. (2007).
#-------------------------------------------------------------------------------
clustDistMatX <- do.call(rbind, lapply(1:length(clustIndexX), function(i){tapply(avgClustDistX[, i], avgClustDistX$x, mean)}))
clustDistMatY <- do.call(rbind, lapply(1:length(clustIndexY), function(i){tapply(avgClustDistY[, i], avgClustDistY$y, mean)}))
clustDistMatX <- apply(clustDistMatX, 2, rank, ties.method = "min") - 1 # subtracting 1 for easier indexing.
clustDistMatY <- apply(clustDistMatY, 2, rank, ties.method = "min") - 1 # subtracting 1 for easier indexing.
#-------------------------------------------------------------------------------
# Computing all object-level pairwise differences in cluster distance ranks and
# setting the resulting N by N diagonal to NA to remove the bias in calculating
# an object's distance from itself.
#-------------------------------------------------------------------------------
clustDistRankX <- sapply(1:length(x), function(i){sapply(1:length(x), function(j){clustDistMatX[x[i], x[j]]})})
clustDistRankY <- sapply(1:length(y), function(i){sapply(1:length(y), function(j){clustDistMatY[y[i], y[j]]})})
diag(clustDistRankX) <- NA
diag(clustDistRankY) <- NA
#-------------------------------------------------------------------------------
# rmm is the rank mismatch matrix in Table 3 of Pinto et al. (2007) indexed
# from 0 to number of clusters - 1.
#-------------------------------------------------------------------------------
clustDistRankX <- as.vector(clustDistRankX) # collapsing the data for the table function.
clustDistRankY <- as.vector(clustDistRankY) # collapsing the data for the table function.

rmm <- table(clustDistRankX, clustDistRankY)
#-------------------------------------------------------------------------------
# mdd is the mean diagonal deviance from eqn. 5 in Pinto et al. (2007). The 'for'
# loop calculates the numerator of this eqn. Note: The Rand index = 1 - mdd.
#-------------------------------------------------------------------------------
temp <- vector("numeric", ncol(rmm))
mddNum <- vector("list", nrow(rmm))
for (i in 1:nrow(rmm)) {
for (j in 1:ncol(rmm)) {
temp[j] <- rmm[i, j] * abs(i / nrow(rmm) - j / ncol(rmm))
}
mddNum[[i]] <- temp
}
mdd <- sum(unlist(mddNum)) / (length(x)^2 - length(x))
#-------------------------------------------------------------------------------
# rmmi is the expected value of the rank mismatch matrix under independence:
# eqn. 6 in Pinto et al. (2007).
#-------------------------------------------------------------------------------
rmmi <- suppressWarnings(chisq.test(x = matrix(rmm, nrow = length(rownames(rmm))), correct = FALSE)$expected)
#-------------------------------------------------------------------------------
# mddi is the mean diagonal deviance under independence (eqn. 5). RARI is the
# ranked adjusted Rand index: eqn. 7 in Pinto et al. (2007).
#-------------------------------------------------------------------------------
temp <- vector("numeric", ncol(rmmi))
mddNum <- vector("list", nrow(rmmi))
for (i in 1:nrow(rmmi)) {
for (j in 1:ncol(rmmi)) {
temp[j] <- rmmi[i, j] * abs(i / nrow(rmmi) - j / ncol(rmmi))
}
mddNum[[i]] <- temp
}
mddi <- sum(unlist(mddNum)) / (length(x)^2 - length(x))

RARI <- (mddi - mdd) / mddi

return(RARI)
}

classError <- function(classification, truth)
{
q <- function(map, len, x)
Expand Down Expand Up @@ -1191,4 +1316,4 @@ as.densityMclust.Mclust <- function(x, ...)
x$density <- dens(modelName = x$modelName, data = x$data,
parameters = x$parameters, logarithm = FALSE)
return(x)
}
}
59 changes: 59 additions & 0 deletions man/RARI.rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
\name{RARI}
\alias{RARI}
\title{
Ranked Adjusted Rand Index
}
\description{
Computes the ranked adjusted Rand index comparing two classifications with supplementary distance matrices.
}
\usage{
RARI(x, y, dist_x, dist_y)
}
\arguments{
\item{x}{
A numeric or character vector of class labels.
}
\item{y}{
A numeric or character vector of class labels.
The length of \code{y} should be the same as that of \code{x}.
}
\item{dist_x}{
A distance matrix of class 'dist' or 'matrix' for \code{x} class labels.
}
\item{dist_y}{
A distance matrix of class 'dist' or 'matrix' for \code{y} class labels.
}
}
\value{
The ranked adjusted Rand index comparing the two partitions and their distance matrices (a scalar).
This index has zero expected value only in the case of (a) random partition, (b) when in one clustering
all entities are in the same clustering and in the other, every entitity is in its own cluster, and (c)
all clusters are equidistant from each other. The ranked adjusted Rand index is bounded above by 1 in the
case of (a) perfect agreement between two partitions and (b) equally ranked relative distances between clusters.

If the arguments \code{dist_x} and \code{dist_y} are \code{NULL}, the output is identical to the adjusted Rand index.
}

\section{References}{
Pinto, F.R., Carrico, J.A., Ramirez, M., and Almeida, J.S. (2007). Ranked Adjusted
Rand: integrating distance and partition information in a measure of clustering
agreement. \emph{BMC Bioinformatics 8:44}. doi: 10.1186/1471-2105-8-44
}
\seealso{
\code{\link{classError}},
\code{\link{mapClass}},
\code{\link{table}}
}
\examples{
data(iris)
x <- iris$Species
y <- iris$Species
dist_x <- dist(iris[, 1:4]) # Using all measures in the distance matrix
dist_y <- dist(iris[, 2]) # Using only Sepal.Width in the distance matrix

adjustedRandIndex(x, y)
RARI(x, y, dist_x, dist_y)
}
\keyword{cluster}
% docclass is function
% Converted by Sd2Rd version 1.21.