execs/R/utils.R

#' ---
#' title: Utilities shared between R code
#' author: G.J.J. van den Burg
#' date: 2019-09-29
#' license: See the LICENSE file.
#' copyright: 2019, The Alan Turing Institute
#' ---

library(RJSONIO)

printf <- function(...) invisible(cat(sprintf(...)));

#' Load a TCPDBench dataset
#'
#' This function reads in a JSON dataset in TCPDBench format (see TCPD 
#' repository for schema) and creates a matrix representation of the dataset.  
#' The dataset is scaled in the process.
#'
#' @param filename Path to the JSON file
#' @return List object with the raw data in the \code{original} field, the time 
#' index in the \code{time} field, and the data matrix in the \code{mat} field.
#'
load.dataset <- function(filename)
{
    data <- fromJSON(filename)

    # reformat the data to a data frame with a time index and the data values
    tidx <- data$time$index
    exp <- 0:(data$n_obs - 1)
    if (all(tidx == exp) && length(tidx) == length(exp)) {
        tidx <- NULL
    } else {
        tidx <- data$time$index
    }

    mat <- NULL

    for (j in 1:data$n_dim) {
        s <- data$series[[j]]
        v <- NULL
        for (i in 1:data$n_obs) {
            val <- s$raw[[i]]
            if (is.null(val)) {
                v <- c(v, NA)
            } else {
                v <- c(v, val)
            }
        }
        mat <- cbind(mat, v)
    }

    # We normalize to avoid issues with numerical precision.
    mat <- scale(mat)

    out <- list(original=data,
                time=tidx,
                mat=mat)
    return(out)
}

#' Prepare the experiment output
#'
#' This function creates a list of the necessary output data. This includes the 
#' exact command that was run, dataset and script information, the hostname, 
#' output status, any errors if present, and the detected change point location 
#' and runtime.
#'
#' @param data the raw data loaded from the JSON file
#' @param data.filename the path to the dataset filename
#' @param status the output status code of the experiment. Currently in use are 
#' 'SUCCESS' for when an experiment exited successfully, 'TIMEOUT' if the 
#' experiment exceeded a limit on runtime, 'SKIP' if the method was supplied 
#' with improper hyperparameters, and 'FAIL' if an error occurred.
#' @param error a description of the error, if one occurred
#' @param params input parameters (including defaults) to the method
#' @param locations detected change point locations (important: these locations 
#' are 0-based, whereas R array indices are 1-based. It is important to convert 
#' them accordingly. Change point locations should be integers on the interval 
#' [0, T-1], including both endpoints).
#' @param runtime the runtime of the method.
#'
#' @return list with all the necessary output fields.
prepare.result <- function(data, data.filename, status, error,
                           params, locations, runtime) {
    out <- list(error=NULL)
    cmd.args <- commandArgs(trailingOnly=F)

    # the full command used
    out$command <- paste(cmd.args, collapse=' ')

    # get the name of the current script
    file.arg <- "--file="
    out$script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])

    # hash of the script
    script.hash <- tools::md5sum(out$script)
    names(script.hash) <- NULL
    out$script_md5 <- script.hash

    # hostname of the machine
    hostname <- Sys.info()['nodename']
    names(hostname) <- NULL
    out$hostname <- hostname

    # dataset name
    out$dataset <- data$name

    # dataset hash
    data.hash <- tools::md5sum(data.filename)
    names(data.hash) <- NULL
    out$dataset_md5 <- data.hash

    # status of running the script
    out$status <- status

    # error (if any)
    if (!is.null(error))
        out$error <- error

    # parameters used
    out$parameters <- params

    # result
    out$result <- list(cplocations=locations, runtime=runtime)

    return(out)
}

#' Combine default parameters and command line arguments
#'
#' @param args the command line arguments
#' @param defaults default algorithm parameters
#' @return a combined list with both the default parameter settings and those 
#' provided on the command line. If a parameter is in the default list that is 
#' specified on the command line the command line parameter takes precedence.
make.param.list <- function(args, defaults)
{
    params <- defaults

    args.copy <- args
    args.copy["input"] <- NULL
    args.copy["output"] <- NULL

    params <- modifyList(params, args.copy, keep.null=T)
    return(params)
}

#' Write output to a file or stdout
#'
#' This function takes an output list generated by \code{\link{prepare.result}} 
#' and writes it out as JSON to a file if provided or stdout otherwise.
#'
#' @param out experimental results as a list
#' @param filename (optional) output file to write to
#'
dump.output <- function(out, filename) {
    json.out <- toJSON(out, pretty=T)
    if (!is.null(filename))
        write(json.out, filename)
    else
        cat(json.out, '\n')
}

#' Exit with SKIP status due to multidimensional data
#'
#' This is a shorthand for \code{\link{exit.with.error}} where the error is 
#' already set for methods that don't handle multidimensional data. Writes out 
#' the data and exits.
#'
#' @param data  original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by 
#' \code{\link{make.param.list}}
exit.error.multidim <- function(data, args, params) {
    status = 'SKIP'
    error = 'This method has no support for multidimensional data.'
    out <- prepare.result(data, args$input, status, error, params, NULL, NA)
    dump.output(out, args$output)
    quit(save='no')
}

#' Exit with FAIL status and a custom error message
#'
#' @param data  original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by 
#' \code{\link{make.param.list}}
#' @param error custom error message
exit.with.error <- function(data, args, params, error) {
    status = 'FAIL'
    out <- prepare.result(data, args$input, status, error, params, NULL, NULL)
    dump.output(out, args$output)
    quit(save='no')
}

#' Exit with SUCCESS status
#'
#' @param data  original data loaded by \code{\link{load.dataset}}
#' @param args command line arguments
#' @param params combined hyperparameters generated by 
#' \code{\link{make.param.list}}
#' @param locations detected change point locations (0-based!)
#' @param runtime runtime in seconds
exit.success <- function(data, args, params, locations, runtime) {
    status = 'SUCCESS'
    error = NULL
    out <- prepare.result(data, args$input, status, error, params, locations,
                          runtime)
    dump.output(out, args$output)
}