From 6d3eb14b619ca51f64c10f7534c62c259798a202 Mon Sep 17 00:00:00 2001 From: Mikael Poul Johannesson Date: Mon, 28 May 2018 16:44:16 -0400 Subject: [PATCH 1/5] Made manyTopics run in parallel --- R/manyTopics.R | 129 ++++++++++++++++++++++++++++---------------- man/manyTopics.Rd | 3 +- man/paretosingle.Rd | 16 ++++++ 3 files changed, 102 insertions(+), 46 deletions(-) create mode 100644 man/paretosingle.Rd diff --git a/R/manyTopics.R b/R/manyTopics.R index aff27459f..0acff3e65 100644 --- a/R/manyTopics.R +++ b/R/manyTopics.R @@ -1,36 +1,44 @@ -#function to choose the pareto dominant model, or randomly choose among model candidates that are not weakly dominanted by other models -#utility function for manyTopics - +#' Function to choose the pareto dominant model +#' +#' Function to choose the pareto dominant model, or randomly choose +#' among model candidates that are not weakly dominanted by other +#' models utility function for manyTopics. +#' +#' @param z An stm model object from \code{\link{selectModels()}}. +#' paretosingle <- function(z) { - m<-matrix(NA,nrow=length(z$semcoh),ncol=2) + m <- matrix(NA, nrow = length(z$semcoh), ncol = 2) - for(i in 1:nrow(m)){ - - m[i,1]<-as.numeric(mean(unlist(z$semcoh[i]))) + for (i in 1:nrow(m)) { - if(!z$exclusivity[[1]][1]=="Exclusivity not calculated for models with content covariates"){ - m[i,2]<-as.numeric(mean(unlist(z$exclusivity[i]))) - } else { - stop("manyTopics function not yet designed for models with content variable.") + m[i, 1] <- as.numeric(mean(unlist(z$semcoh[i]))) + + if (!z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") { + stop("manyTopics function not yet designed for models with content variable.") } + + m[i, 2] <- as.numeric(mean(unlist(z$exclusivity[i]))) + } - - d1max <- max(m[,1]) - d2max <- max(m[,2]) - weakcandidates <- m[,1]==d1max | m[,2]==d2max - strongcandidates <- m[,1]==d1max & m[,2]==d2max - s = which(strongcandidates) - w = which(weakcandidates) - if (length(s)>0) { + + d1max <- max(m[, 1]) + d2max <- max(m[, 2]) + weakcandidates <- m[, 1] == d1max | m[, 2] == d2max + strongcandidates <- m[, 1] == d1max & m[, 2] == d2max + s <- which(strongcandidates) + w <- which(weakcandidates) + + if (length(s) > 0) { x = s - } - else { + } else { x = w } - if (length(x)==1) {return(x)} - else { - return(sample(x,size=1)) + + if (length(x) == 1) { + return(x) + } else { + return(sample(x, size = 1)) } } @@ -124,28 +132,59 @@ paretosingle <- function(z) { #' #Please note that the way to extract a result for manyTopics is different from selectModel. #' } #' @export -manyTopics <- function(documents, vocab, K, prevalence=NULL, content=NULL, - data = NULL,max.em.its = 100, verbose = TRUE, - init.type = "LDA", - emtol = 1e-05, seed = NULL, runs = 50, - frexw = 0.7, net.max.em.its = 2, - netverbose = FALSE, M = 10,...) { - out<-list() - semcoh<-exclusivity<-list() - for(i in 1:length(K)) { - - models<- selectModel(documents, vocab, K[i], prevalence, content, data = data, - max.em.its = max.em.its, verbose = verbose, init.type = init.type, emtol = emtol, seed = seed, runs = runs, - frexw = frexw, net.max.em.its = net.max.em.its, netverbose = netverbose, M = M, - ...) - j<-paretosingle(models) +manyTopics <- function(documents, vocab, K, prevalence = NULL, + content = NULL, data = NULL, max.em.its = 100, + verbose = TRUE, init.type = "LDA", + emtol = 1e-05, seed = NULL, runs = 50, + frexw = 0.7, net.max.em.its = 2, + netverbose = FALSE, M = 10, N = 1, + cores = parallel::detectCores(), + ...) { + + args <- list( + documents = documents, + vocab = vocab, + prevalence = prevalence, + content = content, + data = data, + max.em.its = max.em.its, + verbose = verbose, + init.type = init.type, + emtol = emtol, + seed = seed, + runs = runs, + frexw = frexw, + net.max.em.its = net.max.em.its, + netverbose = netverbose, + M = M + ) + + if (verbose & cores == 1) { + message("Progress will not be shown when using multiple cores.") + args$verbose <- FALSE + } - out[[i]]<-models$runout[[j]] - exclusivity[[i]]<-models$exclusivity[[j]] - semcoh[[i]]<-models$semcoh[[j]] - j<-NULL + selectModel2 <- function(K, args, ...) { + do.call("selectModel", c(K, args, ...)) } + + if (cores == 1) { + models <- lapply(K, selectModel2, args) + } else { + models <- parallel::mclapply(K, selectModel2, args, mc.cores = cores) + } + + j <- sapply(models, paretosingle) + out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]]) + exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]]) + semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]]) + + toreturn <- list( + out = out, + exclusivity = exclusivity, + semcoh = semcoh + ) + class(toreturn) <- "manyTopics" - toreturn<-list(out=out,exclusivity=exclusivity,semcoh=semcoh) return(toreturn) -} \ No newline at end of file +} diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd index 50c651c02..7a3ecd9d0 100644 --- a/man/manyTopics.Rd +++ b/man/manyTopics.Rd @@ -8,7 +8,8 @@ numbers of topics.} manyTopics(documents, vocab, K, prevalence = NULL, content = NULL, data = NULL, max.em.its = 100, verbose = TRUE, init.type = "LDA", emtol = 1e-05, seed = NULL, runs = 50, frexw = 0.7, - net.max.em.its = 2, netverbose = FALSE, M = 10, ...) + net.max.em.its = 2, netverbose = FALSE, M = 10, N = 1, + cores = parallel::detectCores(), ...) } \arguments{ \item{documents}{The documents to be modeled. Object must be a list of with diff --git a/man/paretosingle.Rd b/man/paretosingle.Rd new file mode 100644 index 000000000..c8189f53a --- /dev/null +++ b/man/paretosingle.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/manyTopics.R +\name{paretosingle} +\alias{paretosingle} +\title{Function to choose the pareto dominant model} +\usage{ +paretosingle(z) +} +\arguments{ +\item{z}{An stm model object from \code{\link{selectModels()}}.} +} +\description{ +Function to choose the pareto dominant model, or randomly choose +among model candidates that are not weakly dominanted by other +models utility function for manyTopics. +} From 679c8bb85faf0033edbed2316178a22e85f84f36 Mon Sep 17 00:00:00 2001 From: Mikael Poul Johannesson Date: Tue, 29 May 2018 13:36:07 -0400 Subject: [PATCH 2/5] made it non-verbose when run in parallel --- R/manyTopics.R | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/manyTopics.R b/R/manyTopics.R index 0acff3e65..c042da2aa 100644 --- a/R/manyTopics.R +++ b/R/manyTopics.R @@ -14,7 +14,7 @@ paretosingle <- function(z) { m[i, 1] <- as.numeric(mean(unlist(z$semcoh[i]))) - if (!z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") { + if (z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") { stop("manyTopics function not yet designed for models with content variable.") } @@ -158,23 +158,26 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL, netverbose = netverbose, M = M ) - - if (verbose & cores == 1) { + + if (verbose & cores > 1) { message("Progress will not be shown when using multiple cores.") - args$verbose <- FALSE } selectModel2 <- function(K, args, ...) { - do.call("selectModel", c(K, args, ...)) + do.call("selectModel", c(K, args, ...)) } if (cores == 1) { models <- lapply(K, selectModel2, args) } else { - models <- parallel::mclapply(K, selectModel2, args, mc.cores = cores) + models <- parallel::mclapply( + K, selectModel2, args, + mc.cores = cores, mc.silent = TRUE + ) } j <- sapply(models, paretosingle) + out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]]) exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]]) semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]]) @@ -188,3 +191,4 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL, return(toreturn) } + From 8a74ae65ef7f74d6c8c1f6b24eb5c564ab8daa25 Mon Sep 17 00:00:00 2001 From: Mikael Poul Johannesson Date: Tue, 29 May 2018 13:49:45 -0400 Subject: [PATCH 3/5] updated the manyTopics example --- R/manyTopics.R | 51 ++++++++++++++++++++++++++++++----------------- man/manyTopics.Rd | 50 ++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 35 deletions(-) diff --git a/R/manyTopics.R b/R/manyTopics.R index c042da2aa..c36754914 100644 --- a/R/manyTopics.R +++ b/R/manyTopics.R @@ -111,25 +111,41 @@ paretosingle <- function(z) { #' @examples #' #' \dontrun{ -#' -#' temp<-textProcessor(documents=gadarian$open.ended.response,metadata=gadarian) -#' meta<-temp$meta -#' vocab<-temp$vocab -#' docs<-temp$documents -#' out <- prepDocuments(docs, vocab, meta) -#' docs<-out$documents -#' vocab<-out$vocab -#' meta <-out$meta +#' processed <- textProcessor( +#' documents = gadarian$open.ended.response, +#' metadata = gadarian +#' ) +#' out <- prepDocuments( +#' documents = processed$documents, +#' vocab = processed$vocab, +#' meta = processed$meta +#' ) #' #' set.seed(02138) -#' storage<-manyTopics(docs,vocab,K=3:4, prevalence=~treatment + s(pid_rep),data=meta, runs=10) -#' #This chooses the output, a single run of STM that was selected, -#' #from the runs of the 3 topic model -#' t<-storage$out[[1]] -#' #This chooses the output, a single run of STM that was selected, -#' #from the runs of the 4 topic model -#' t<-storage$out[[2]] -#' #Please note that the way to extract a result for manyTopics is different from selectModel. +#' +#' model_storage <- manyTopics( +#' documents = out$documents, +#' vocab= out$vocab, +#' K = 3:4, +#' prevalence = ~ treatment + s(pid_rep), +#' data = out$meta, +#' runs = 5, +#' cores = 3 +#' ) +#' +#' # Please note that the way to extract a result for manyTopics is +#' # different from selectModel. All the K fitted models are listed in +#' # `model_storage$out` and can be extracted with `]]`. For instance, to +#' # extract the first model with three topics, you can run: +#' fit_k3 <- model_storage$out[[1]] +#' +#' # Similarly, to extract the second model object with four topics, you can +#' # run: +#' fit_k4 <- model_storage$out[[2]] +#' +#' # These extracted models are stm model objects +#' class(fit_k3) +#' plot(fit_k3) #' } #' @export manyTopics <- function(documents, vocab, K, prevalence = NULL, @@ -177,7 +193,6 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL, } j <- sapply(models, paretosingle) - out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]]) exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]]) semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]]) diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd index 7a3ecd9d0..775545686 100644 --- a/man/manyTopics.Rd +++ b/man/manyTopics.Rd @@ -97,24 +97,40 @@ Does not work with models that have a content variable (at this point). \examples{ \dontrun{ - -temp<-textProcessor(documents=gadarian$open.ended.response,metadata=gadarian) -meta<-temp$meta -vocab<-temp$vocab -docs<-temp$documents -out <- prepDocuments(docs, vocab, meta) -docs<-out$documents -vocab<-out$vocab -meta <-out$meta +processed <- textProcessor( + documents = gadarian$open.ended.response, + metadata = gadarian +) +out <- prepDocuments( + documents = processed$documents, + vocab = processed$vocab, + meta = processed$meta +) set.seed(02138) -storage<-manyTopics(docs,vocab,K=3:4, prevalence=~treatment + s(pid_rep),data=meta, runs=10) -#This chooses the output, a single run of STM that was selected, -#from the runs of the 3 topic model -t<-storage$out[[1]] -#This chooses the output, a single run of STM that was selected, -#from the runs of the 4 topic model -t<-storage$out[[2]] -#Please note that the way to extract a result for manyTopics is different from selectModel. + +model_storage <- manyTopics( + documents = out$documents, + vocab= out$vocab, + K = 3:4, + prevalence = ~ treatment + s(pid_rep), + data = out$meta, + runs = 5, + cores = 3 +) + +# Please note that the way to extract a result for manyTopics is +# different from selectModel. All the K fitted models are listed in +# `model_storage$out` and can be extracted with `]]`. For instance, to +# extract the first model with three topics, you can run: +fit_k3 <- model_storage$out[[1]] + +# Similarly, to extract the second model object with four topics, you can +# run: +fit_k4 <- model_storage$out[[2]] + +# These extracted models are stm model objects +class(fit_k3) +plot(fit_k3) } } From 2b14ef355d1a6c31bd4aafee91bce075998a2e98 Mon Sep 17 00:00:00 2001 From: Mikael Poul Johannesson Date: Tue, 29 May 2018 14:03:51 -0400 Subject: [PATCH 4/5] removed cores=2 from example --- R/manyTopics.R | 3 +-- man/manyTopics.Rd | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/manyTopics.R b/R/manyTopics.R index c36754914..e918d4734 100644 --- a/R/manyTopics.R +++ b/R/manyTopics.R @@ -129,8 +129,7 @@ paretosingle <- function(z) { #' K = 3:4, #' prevalence = ~ treatment + s(pid_rep), #' data = out$meta, -#' runs = 5, -#' cores = 3 +#' runs = 5 #' ) #' #' # Please note that the way to extract a result for manyTopics is diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd index 775545686..7f053032a 100644 --- a/man/manyTopics.Rd +++ b/man/manyTopics.Rd @@ -115,8 +115,7 @@ model_storage <- manyTopics( K = 3:4, prevalence = ~ treatment + s(pid_rep), data = out$meta, - runs = 5, - cores = 3 + runs = 5 ) # Please note that the way to extract a result for manyTopics is From 2217218ed7993140568cb42e7a347fb5b8dd38b7 Mon Sep 17 00:00:00 2001 From: Mikael Poul Johannesson Date: Tue, 29 May 2018 21:33:23 -0400 Subject: [PATCH 5/5] added param cores to manyTopics docs --- R/manyTopics.R | 6 ++++-- man/manyTopics.Rd | 5 ++++- man/paretosingle.Rd | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/R/manyTopics.R b/R/manyTopics.R index e918d4734..26317916d 100644 --- a/R/manyTopics.R +++ b/R/manyTopics.R @@ -4,7 +4,7 @@ #' among model candidates that are not weakly dominanted by other #' models utility function for manyTopics. #' -#' @param z An stm model object from \code{\link{selectModels()}}. +#' @param z An stm model object from \code{selectModel()}. #' paretosingle <- function(z) { @@ -98,6 +98,8 @@ paretosingle <- function(z) { #' models. #' @param M Number of words used to calculate semantic coherence and #' exclusivity. Defaults to 10. +#' @param cores Number of CPU cores to use for parallel +#' computation. Defaults to the number of cores available. #' @param \dots Additional options described in details of stm. #' @return #' @@ -152,7 +154,7 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL, verbose = TRUE, init.type = "LDA", emtol = 1e-05, seed = NULL, runs = 50, frexw = 0.7, net.max.em.its = 2, - netverbose = FALSE, M = 10, N = 1, + netverbose = FALSE, M = 10, cores = parallel::detectCores(), ...) { diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd index 7f053032a..432ab52e8 100644 --- a/man/manyTopics.Rd +++ b/man/manyTopics.Rd @@ -8,7 +8,7 @@ numbers of topics.} manyTopics(documents, vocab, K, prevalence = NULL, content = NULL, data = NULL, max.em.its = 100, verbose = TRUE, init.type = "LDA", emtol = 1e-05, seed = NULL, runs = 50, frexw = 0.7, - net.max.em.its = 2, netverbose = FALSE, M = 10, N = 1, + net.max.em.its = 2, netverbose = FALSE, M = 10, cores = parallel::detectCores(), ...) } \arguments{ @@ -71,6 +71,9 @@ models.} \item{M}{Number of words used to calculate semantic coherence and exclusivity. Defaults to 10.} +\item{cores}{Number of CPU cores to use for parallel +computation. Defaults to the number of cores available.} + \item{\dots}{Additional options described in details of stm.} } \value{ diff --git a/man/paretosingle.Rd b/man/paretosingle.Rd index c8189f53a..019fd3aa0 100644 --- a/man/paretosingle.Rd +++ b/man/paretosingle.Rd @@ -7,7 +7,7 @@ paretosingle(z) } \arguments{ -\item{z}{An stm model object from \code{\link{selectModels()}}.} +\item{z}{An stm model object from \code{selectModel()}.} } \description{ Function to choose the pareto dominant model, or randomly choose