-
Notifications
You must be signed in to change notification settings - Fork 28
/
utils.R
210 lines (183 loc) · 6.63 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#' ---
#' title: Utilities shared between R code
#' author: G.J.J. van den Burg
#' date: 2019-09-29
#' license: See the LICENSE file.
#' copyright: 2019, The Alan Turing Institute
#' ---
library(RJSONIO)
printf <- function(...) invisible(cat(sprintf(...)));
#' Load a TCPDBench dataset
#'
#' This function reads in a JSON dataset in TCPDBench format (see TCPD
#' repository for schema) and creates a matrix representation of the dataset.
#' The dataset is scaled in the process.
#'
#' @param filename Path to the JSON file
#' @return List object with the raw data in the \code{original} field, the time
#' index in the \code{time} field, and the data matrix in the \code{mat} field.
#'
load.dataset <- function(filename)
{
data <- fromJSON(filename)
# reformat the data to a data frame with a time index and the data values
tidx <- data$time$index
exp <- 0:(data$n_obs - 1)
if (all(tidx == exp) && length(tidx) == length(exp)) {
tidx <- NULL
} else {
tidx <- data$time$index
}
mat <- NULL
for (j in 1:data$n_dim) {
s <- data$series[[j]]
v <- NULL
for (i in 1:data$n_obs) {
val <- s$raw[[i]]
if (is.null(val)) {
v <- c(v, NA)
} else {
v <- c(v, val)
}
}
mat <- cbind(mat, v)
}
# We normalize to avoid issues with numerical precision.
mat <- scale(mat)
out <- list(original=data,
time=tidx,
mat=mat)
return(out)
}
#' Prepare the experiment output
#'
#' This function creates a list of the necessary output data. This includes the
#' exact command that was run, dataset and script information, the hostname,
#' output status, any errors if present, and the detected change point location
#' and runtime.
#'
#' @param data the raw data loaded from the JSON file
#' @param data.filename the path to the dataset filename
#' @param status the output status code of the experiment. Currently in use are
#' 'SUCCESS' for when an experiment exited successfully, 'TIMEOUT' if the
#' experiment exceeded a limit on runtime, 'SKIP' if the method was supplied
#' with improper hyperparameters, and 'FAIL' if an error occurred.
#' @param error a description of the error, if one occurred
#' @param params input parameters (including defaults) to the method
#' @param locations detected change point locations (important: these locations
#' are 0-based, whereas R array indices are 1-based. It is important to convert
#' them accordingly. Change point locations should be integers on the interval
#' [0, T-1], including both endpoints).
#' @param runtime the runtime of the method.
#'
#' @return list with all the necessary output fields.
prepare.result <- function(data, data.filename, status, error,
params, locations, runtime) {
out <- list(error=NULL)
cmd.args <- commandArgs(trailingOnly=F)
# the full command used
out$command <- paste(cmd.args, collapse=' ')
# get the name of the current script
file.arg <- "--file="
out$script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
# hash of the script
script.hash <- tools::md5sum(out$script)
names(script.hash) <- NULL
out$script_md5 <- script.hash
# hostname of the machine
hostname <- Sys.info()['nodename']
names(hostname) <- NULL
out$hostname <- hostname
# dataset name
out$dataset <- data$name
# dataset hash
data.hash <- tools::md5sum(data.filename)
names(data.hash) <- NULL
out$dataset_md5 <- data.hash
# status of running the script
out$status <- status
# error (if any)
if (!is.null(error))
out$error <- error
# parameters used
out$parameters <- params
# result
out$result <- list(cplocations=locations, runtime=runtime)
return(out)
}
#' Combine default parameters and command line arguments
#'
#' @param args the command line arguments
#' @param defaults default algorithm parameters
#' @return a combined list with both the default parameter settings and those
#' provided on the command line. If a parameter is in the default list that is
#' specified on the command line the command line parameter takes precedence.
make.param.list <- function(args, defaults)
{
params <- defaults
args.copy <- args
args.copy["input"] <- NULL
args.copy["output"] <- NULL
params <- modifyList(params, args.copy, keep.null=T)
return(params)
}
#' Write output to a file or stdout
#'
#' This function takes an output list generated by \code{\link{prepare.result}}
#' and writes it out as JSON to a file if provided or stdout otherwise.
#'
#' @param out experimental results as a list
#' @param filename (optional) output file to write to
#'
dump.output <- function(out, filename) {
json.out <- toJSON(out, pretty=T)
if (!is.null(filename))
write(json.out, filename)
else
cat(json.out, '\n')
}
#' Exit with SKIP status due to multidimensional data
#'
#' This is a shorthand for \code{\link{exit.with.error}} where the error is
#' already set for methods that don't handle multidimensional data. Writes out
#' the data and exits.
#'
#' @param data original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by
#' \code{\link{make.param.list}}
exit.error.multidim <- function(data, args, params) {
status = 'SKIP'
error = 'This method has no support for multidimensional data.'
out <- prepare.result(data, args$input, status, error, params, NULL, NA)
dump.output(out, args$output)
quit(save='no')
}
#' Exit with FAIL status and a custom error message
#'
#' @param data original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by
#' \code{\link{make.param.list}}
#' @param error custom error message
exit.with.error <- function(data, args, params, error) {
status = 'FAIL'
out <- prepare.result(data, args$input, status, error, params, NULL, NULL)
dump.output(out, args$output)
quit(save='no')
}
#' Exit with SUCCESS status
#'
#' @param data original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by
#' \code{\link{make.param.list}}
#' @param locations detected change point locations (0-based!)
#' @param runtime runtime in seconds
exit.success <- function(data, args, params, locations, runtime) {
status = 'SUCCESS'
error = NULL
out <- prepare.result(data, args$input, status, error, params, locations,
runtime)
dump.output(out, args$output)
}