From 6d3eb14b619ca51f64c10f7534c62c259798a202 Mon Sep 17 00:00:00 2001
From: Mikael Poul Johannesson <mikajoh@gmail.com>
Date: Mon, 28 May 2018 16:44:16 -0400
Subject: [PATCH 1/5] Made manyTopics run in parallel

---
 R/manyTopics.R      | 129 ++++++++++++++++++++++++++++----------------
 man/manyTopics.Rd   |   3 +-
 man/paretosingle.Rd |  16 ++++++
 3 files changed, 102 insertions(+), 46 deletions(-)
 create mode 100644 man/paretosingle.Rd

diff --git a/R/manyTopics.R b/R/manyTopics.R
index aff27459f..0acff3e65 100644
--- a/R/manyTopics.R
+++ b/R/manyTopics.R
@@ -1,36 +1,44 @@
-#function to choose the pareto dominant model, or randomly choose among model candidates that are not weakly dominanted by other models
-#utility function for manyTopics
-
+#' Function to choose the pareto dominant model
+#' 
+#' Function to choose the pareto dominant model, or randomly choose
+#' among model candidates that are not weakly dominanted by other
+#' models utility function for manyTopics.
+#' 
+#' @param z An stm model object from \code{\link{selectModels()}}.
+#' 
 paretosingle <- function(z) {
   
-  m<-matrix(NA,nrow=length(z$semcoh),ncol=2)
+  m <- matrix(NA, nrow = length(z$semcoh), ncol = 2)
   
-  for(i in 1:nrow(m)){
-    
-    m[i,1]<-as.numeric(mean(unlist(z$semcoh[i])))
+  for (i in 1:nrow(m)) {
     
-    if(!z$exclusivity[[1]][1]=="Exclusivity not calculated for models with content covariates"){
-      m[i,2]<-as.numeric(mean(unlist(z$exclusivity[i])))
-    } else {
-      stop("manyTopics function not yet designed for models with content variable.") 
+    m[i, 1] <- as.numeric(mean(unlist(z$semcoh[i])))
+
+    if (!z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") {
+      stop("manyTopics function not yet designed for models with content variable.")
     }
+    
+    m[i, 2] <- as.numeric(mean(unlist(z$exclusivity[i])))
+    
   }
- 
-  d1max <- max(m[,1])
-  d2max <- max(m[,2])
-  weakcandidates <- m[,1]==d1max | m[,2]==d2max
-  strongcandidates <- m[,1]==d1max & m[,2]==d2max
-  s = which(strongcandidates)
-  w = which(weakcandidates)
-  if (length(s)>0) {
+  
+  d1max <- max(m[, 1])
+  d2max <- max(m[, 2])
+  weakcandidates <- m[, 1] == d1max | m[, 2] == d2max
+  strongcandidates <- m[, 1] == d1max & m[, 2] == d2max
+  s <- which(strongcandidates)
+  w <- which(weakcandidates)
+  
+  if (length(s) > 0) {
     x = s
-  }
-  else {
+  } else {
     x = w
   }
-  if (length(x)==1) {return(x)}
-  else {
-    return(sample(x,size=1))
+  
+  if (length(x) == 1) {
+    return(x)
+  } else {
+    return(sample(x, size = 1))
   }
 }
 
@@ -124,28 +132,59 @@ paretosingle <- function(z) {
 #' #Please note that the way to extract a result for manyTopics is different from selectModel.
 #' }
 #' @export
-manyTopics <- function(documents, vocab, K, prevalence=NULL, content=NULL, 
-                       data = NULL,max.em.its = 100, verbose = TRUE, 
-                       init.type = "LDA", 
-                       emtol = 1e-05, seed = NULL, runs = 50, 
-                       frexw = 0.7, net.max.em.its = 2, 
-                       netverbose = FALSE, M = 10,...) {
-  out<-list()
-  semcoh<-exclusivity<-list()
-  for(i in 1:length(K)) {
-    
-    models<- selectModel(documents, vocab, K[i], prevalence, content, data = data, 
-                         max.em.its = max.em.its, verbose = verbose, init.type = init.type, emtol = emtol, seed = seed, runs = runs, 
-                         frexw = frexw, net.max.em.its = net.max.em.its, netverbose = netverbose, M = M, 
-                         ...)
-    j<-paretosingle(models)
+manyTopics <- function(documents, vocab, K, prevalence = NULL,
+                       content = NULL,  data = NULL, max.em.its = 100,
+                       verbose = TRUE,  init.type = "LDA",
+                       emtol = 1e-05, seed = NULL, runs = 50,
+                       frexw = 0.7, net.max.em.its = 2,
+                       netverbose = FALSE, M = 10, N = 1,
+                       cores = parallel::detectCores(),
+                       ...) {
+
+  args <- list(
+    documents = documents,
+    vocab = vocab,
+    prevalence = prevalence,
+    content = content,
+    data = data, 
+    max.em.its = max.em.its,
+    verbose = verbose,
+    init.type = init.type,
+    emtol = emtol,
+    seed = seed,
+    runs = runs,
+    frexw = frexw,
+    net.max.em.its = net.max.em.its,
+    netverbose = netverbose,
+    M = M
+  )
+
+  if (verbose & cores == 1) {
+    message("Progress will not be shown when using multiple cores.")
+    args$verbose <- FALSE
+  }
 
-    out[[i]]<-models$runout[[j]]
-    exclusivity[[i]]<-models$exclusivity[[j]]
-    semcoh[[i]]<-models$semcoh[[j]]
-    j<-NULL
+  selectModel2 <- function(K, args, ...) {
+    do.call("selectModel", c(K, args, ...))  
   }
+
+  if (cores == 1) {
+    models <- lapply(K, selectModel2, args)
+  } else {
+    models <- parallel::mclapply(K, selectModel2, args, mc.cores = cores)
+  }
+
+  j <- sapply(models, paretosingle)
+  out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]])
+  exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]])
+  semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]])
+  
+  toreturn <- list(
+    out = out,
+    exclusivity = exclusivity,
+    semcoh = semcoh
+  )
+  class(toreturn) <- "manyTopics"
   
-  toreturn<-list(out=out,exclusivity=exclusivity,semcoh=semcoh)
   return(toreturn)
-}
\ No newline at end of file
+}
diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd
index 50c651c02..7a3ecd9d0 100644
--- a/man/manyTopics.Rd
+++ b/man/manyTopics.Rd
@@ -8,7 +8,8 @@ numbers of topics.}
 manyTopics(documents, vocab, K, prevalence = NULL, content = NULL,
   data = NULL, max.em.its = 100, verbose = TRUE, init.type = "LDA",
   emtol = 1e-05, seed = NULL, runs = 50, frexw = 0.7,
-  net.max.em.its = 2, netverbose = FALSE, M = 10, ...)
+  net.max.em.its = 2, netverbose = FALSE, M = 10, N = 1,
+  cores = parallel::detectCores(), ...)
 }
 \arguments{
 \item{documents}{The documents to be modeled.  Object must be a list of with
diff --git a/man/paretosingle.Rd b/man/paretosingle.Rd
new file mode 100644
index 000000000..c8189f53a
--- /dev/null
+++ b/man/paretosingle.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/manyTopics.R
+\name{paretosingle}
+\alias{paretosingle}
+\title{Function to choose the pareto dominant model}
+\usage{
+paretosingle(z)
+}
+\arguments{
+\item{z}{An stm model object from \code{\link{selectModels()}}.}
+}
+\description{
+Function to choose the pareto dominant model, or randomly choose
+among model candidates that are not weakly dominanted by other
+models utility function for manyTopics.
+}

From 679c8bb85faf0033edbed2316178a22e85f84f36 Mon Sep 17 00:00:00 2001
From: Mikael Poul Johannesson <mikajoh@gmail.com>
Date: Tue, 29 May 2018 13:36:07 -0400
Subject: [PATCH 2/5] made it non-verbose when run in parallel

---
 R/manyTopics.R | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/R/manyTopics.R b/R/manyTopics.R
index 0acff3e65..c042da2aa 100644
--- a/R/manyTopics.R
+++ b/R/manyTopics.R
@@ -14,7 +14,7 @@ paretosingle <- function(z) {
     
     m[i, 1] <- as.numeric(mean(unlist(z$semcoh[i])))
 
-    if (!z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") {
+    if (z$exclusivity[[1]][1] == "Exclusivity not calculated for models with content covariates") {
       stop("manyTopics function not yet designed for models with content variable.")
     }
     
@@ -158,23 +158,26 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL,
     netverbose = netverbose,
     M = M
   )
-
-  if (verbose & cores == 1) {
+  
+  if (verbose & cores > 1) {
     message("Progress will not be shown when using multiple cores.")
-    args$verbose <- FALSE
   }
 
   selectModel2 <- function(K, args, ...) {
-    do.call("selectModel", c(K, args, ...))  
+    do.call("selectModel", c(K, args, ...))
   }
 
   if (cores == 1) {
     models <- lapply(K, selectModel2, args)
   } else {
-    models <- parallel::mclapply(K, selectModel2, args, mc.cores = cores)
+    models <- parallel::mclapply(
+      K, selectModel2, args,
+      mc.cores = cores, mc.silent = TRUE
+    )
   }
 
   j <- sapply(models, paretosingle)
+
   out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]])
   exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]])
   semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]])
@@ -188,3 +191,4 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL,
   
   return(toreturn)
 }
+

From 8a74ae65ef7f74d6c8c1f6b24eb5c564ab8daa25 Mon Sep 17 00:00:00 2001
From: Mikael Poul Johannesson <mikajoh@gmail.com>
Date: Tue, 29 May 2018 13:49:45 -0400
Subject: [PATCH 3/5] updated the manyTopics example

---
 R/manyTopics.R    | 51 ++++++++++++++++++++++++++++++-----------------
 man/manyTopics.Rd | 50 ++++++++++++++++++++++++++++++----------------
 2 files changed, 66 insertions(+), 35 deletions(-)

diff --git a/R/manyTopics.R b/R/manyTopics.R
index c042da2aa..c36754914 100644
--- a/R/manyTopics.R
+++ b/R/manyTopics.R
@@ -111,25 +111,41 @@ paretosingle <- function(z) {
 #' @examples
 #' 
 #' \dontrun{
-#' 
-#' temp<-textProcessor(documents=gadarian$open.ended.response,metadata=gadarian)
-#' meta<-temp$meta
-#' vocab<-temp$vocab
-#' docs<-temp$documents
-#' out <- prepDocuments(docs, vocab, meta)
-#' docs<-out$documents
-#' vocab<-out$vocab
-#' meta <-out$meta
+#' processed <- textProcessor(
+#'   documents = gadarian$open.ended.response,
+#'   metadata = gadarian
+#' )
+#' out <- prepDocuments(
+#'   documents = processed$documents,
+#'   vocab = processed$vocab,
+#'   meta = processed$meta
+#' )
 #' 
 #' set.seed(02138)
-#' storage<-manyTopics(docs,vocab,K=3:4, prevalence=~treatment + s(pid_rep),data=meta, runs=10)
-#' #This chooses the output, a single run of STM that was selected,
-#' #from the runs of the 3 topic model
-#' t<-storage$out[[1]]
-#' #This chooses the output, a single run of STM that was selected,
-#' #from the runs of the 4 topic model
-#' t<-storage$out[[2]]
-#' #Please note that the way to extract a result for manyTopics is different from selectModel.
+#' 
+#' model_storage <- manyTopics(
+#'   documents = out$documents,
+#'   vocab= out$vocab,
+#'   K = 3:4,
+#'   prevalence = ~ treatment + s(pid_rep), 
+#'   data = out$meta,
+#'   runs = 5,
+#'   cores = 3
+#' )
+#' 
+#' # Please note that the way to extract a result for manyTopics is
+#' # different from selectModel. All the K fitted models are listed in
+#' # `model_storage$out` and can be extracted with `]]`. For instance, to
+#' # extract the first model with three topics, you can run:
+#' fit_k3 <- model_storage$out[[1]]
+#' 
+#' # Similarly, to extract the second model object with four topics, you can
+#' # run:
+#' fit_k4 <- model_storage$out[[2]]
+#' 
+#' # These extracted models are stm model objects
+#' class(fit_k3)
+#' plot(fit_k3)
 #' }
 #' @export
 manyTopics <- function(documents, vocab, K, prevalence = NULL,
@@ -177,7 +193,6 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL,
   }
 
   j <- sapply(models, paretosingle)
-
   out <- lapply(seq_along(models), function(x) models[[x]]$runout[[j[x]]])
   exclusivity <- lapply(seq_along(models), function(x) models[[x]]$exclusivity[[j[x]]])
   semcoh <- lapply(seq_along(models), function(x) models[[x]]$semcoh[[j[x]]])
diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd
index 7a3ecd9d0..775545686 100644
--- a/man/manyTopics.Rd
+++ b/man/manyTopics.Rd
@@ -97,24 +97,40 @@ Does not work with models that have a content variable (at this point).
 \examples{
 
 \dontrun{
-
-temp<-textProcessor(documents=gadarian$open.ended.response,metadata=gadarian)
-meta<-temp$meta
-vocab<-temp$vocab
-docs<-temp$documents
-out <- prepDocuments(docs, vocab, meta)
-docs<-out$documents
-vocab<-out$vocab
-meta <-out$meta
+processed <- textProcessor(
+  documents = gadarian$open.ended.response,
+  metadata = gadarian
+)
+out <- prepDocuments(
+  documents = processed$documents,
+  vocab = processed$vocab,
+  meta = processed$meta
+)
 
 set.seed(02138)
-storage<-manyTopics(docs,vocab,K=3:4, prevalence=~treatment + s(pid_rep),data=meta, runs=10)
-#This chooses the output, a single run of STM that was selected,
-#from the runs of the 3 topic model
-t<-storage$out[[1]]
-#This chooses the output, a single run of STM that was selected,
-#from the runs of the 4 topic model
-t<-storage$out[[2]]
-#Please note that the way to extract a result for manyTopics is different from selectModel.
+
+model_storage <- manyTopics(
+  documents = out$documents,
+  vocab= out$vocab,
+  K = 3:4,
+  prevalence = ~ treatment + s(pid_rep), 
+  data = out$meta,
+  runs = 5,
+  cores = 3
+)
+
+# Please note that the way to extract a result for manyTopics is
+# different from selectModel. All the K fitted models are listed in
+# `model_storage$out` and can be extracted with `]]`. For instance, to
+# extract the first model with three topics, you can run:
+fit_k3 <- model_storage$out[[1]]
+
+# Similarly, to extract the second model object with four topics, you can
+# run:
+fit_k4 <- model_storage$out[[2]]
+
+# These extracted models are stm model objects
+class(fit_k3)
+plot(fit_k3)
 }
 }

From 2b14ef355d1a6c31bd4aafee91bce075998a2e98 Mon Sep 17 00:00:00 2001
From: Mikael Poul Johannesson <mikajoh@gmail.com>
Date: Tue, 29 May 2018 14:03:51 -0400
Subject: [PATCH 4/5] removed cores=2 from example

---
 R/manyTopics.R    | 3 +--
 man/manyTopics.Rd | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/manyTopics.R b/R/manyTopics.R
index c36754914..e918d4734 100644
--- a/R/manyTopics.R
+++ b/R/manyTopics.R
@@ -129,8 +129,7 @@ paretosingle <- function(z) {
 #'   K = 3:4,
 #'   prevalence = ~ treatment + s(pid_rep), 
 #'   data = out$meta,
-#'   runs = 5,
-#'   cores = 3
+#'   runs = 5
 #' )
 #' 
 #' # Please note that the way to extract a result for manyTopics is
diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd
index 775545686..7f053032a 100644
--- a/man/manyTopics.Rd
+++ b/man/manyTopics.Rd
@@ -115,8 +115,7 @@ model_storage <- manyTopics(
   K = 3:4,
   prevalence = ~ treatment + s(pid_rep), 
   data = out$meta,
-  runs = 5,
-  cores = 3
+  runs = 5
 )
 
 # Please note that the way to extract a result for manyTopics is

From 2217218ed7993140568cb42e7a347fb5b8dd38b7 Mon Sep 17 00:00:00 2001
From: Mikael Poul Johannesson <mikajoh@gmail.com>
Date: Tue, 29 May 2018 21:33:23 -0400
Subject: [PATCH 5/5] added param cores to manyTopics docs

---
 R/manyTopics.R      | 6 ++++--
 man/manyTopics.Rd   | 5 ++++-
 man/paretosingle.Rd | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/R/manyTopics.R b/R/manyTopics.R
index e918d4734..26317916d 100644
--- a/R/manyTopics.R
+++ b/R/manyTopics.R
@@ -4,7 +4,7 @@
 #' among model candidates that are not weakly dominanted by other
 #' models utility function for manyTopics.
 #' 
-#' @param z An stm model object from \code{\link{selectModels()}}.
+#' @param z An stm model object from \code{selectModel()}.
 #' 
 paretosingle <- function(z) {
   
@@ -98,6 +98,8 @@ paretosingle <- function(z) {
 #' models.
 #' @param M Number of words used to calculate semantic coherence and
 #' exclusivity.  Defaults to 10.
+#' @param cores Number of CPU cores to use for parallel
+#'   computation. Defaults to the number of cores available.
 #' @param \dots Additional options described in details of stm.
 #' @return 
 #' 
@@ -152,7 +154,7 @@ manyTopics <- function(documents, vocab, K, prevalence = NULL,
                        verbose = TRUE,  init.type = "LDA",
                        emtol = 1e-05, seed = NULL, runs = 50,
                        frexw = 0.7, net.max.em.its = 2,
-                       netverbose = FALSE, M = 10, N = 1,
+                       netverbose = FALSE, M = 10,
                        cores = parallel::detectCores(),
                        ...) {
 
diff --git a/man/manyTopics.Rd b/man/manyTopics.Rd
index 7f053032a..432ab52e8 100644
--- a/man/manyTopics.Rd
+++ b/man/manyTopics.Rd
@@ -8,7 +8,7 @@ numbers of topics.}
 manyTopics(documents, vocab, K, prevalence = NULL, content = NULL,
   data = NULL, max.em.its = 100, verbose = TRUE, init.type = "LDA",
   emtol = 1e-05, seed = NULL, runs = 50, frexw = 0.7,
-  net.max.em.its = 2, netverbose = FALSE, M = 10, N = 1,
+  net.max.em.its = 2, netverbose = FALSE, M = 10,
   cores = parallel::detectCores(), ...)
 }
 \arguments{
@@ -71,6 +71,9 @@ models.}
 \item{M}{Number of words used to calculate semantic coherence and
 exclusivity.  Defaults to 10.}
 
+\item{cores}{Number of CPU cores to use for parallel
+computation. Defaults to the number of cores available.}
+
 \item{\dots}{Additional options described in details of stm.}
 }
 \value{
diff --git a/man/paretosingle.Rd b/man/paretosingle.Rd
index c8189f53a..019fd3aa0 100644
--- a/man/paretosingle.Rd
+++ b/man/paretosingle.Rd
@@ -7,7 +7,7 @@
 paretosingle(z)
 }
 \arguments{
-\item{z}{An stm model object from \code{\link{selectModels()}}.}
+\item{z}{An stm model object from \code{selectModel()}.}
 }
 \description{
 Function to choose the pareto dominant model, or randomly choose