Skip to content

run and summarize function #178

@andersone1

Description

@andersone1
#' Run an R script with provenance collection and summarize inputs/outputs to YAML
#'
#' Executes the given R script through `rdtLite::prov.run()`, reads the
#' PROV-JSON file that run produces, collects the locations of the file
#' entities the script used (inputs) and generated (outputs), and writes them
#' to a YAML file together with the absolute script path.
#'
#' The provenance directory (`prov_<script name>`, created in the current
#' working directory) is removed when the function exits, whether the run
#' succeeded or failed.
#'
#' @param script_path Character string. The relative or absolute path to the
#'   R script to be executed. Must point to an existing file.
#' @param output_yaml_path Character string. The path where the output YAML
#'   summary should be saved.
#'
#' @return `NULL`, invisibly. The function is called for its side effects
#'   (running the script and creating a YAML file).
#' @importFrom rdtLite prov.run
#' @importFrom jsonlite fromJSON
#' @importFrom yaml write_yaml
#' @importFrom tools file_path_sans_ext
#' @importFrom fs path_abs
#' @export
run_and_summarize <- function(script_path, output_yaml_path) {

  # 0. Validate input ----
  # Fail early with a clear message instead of an opaque error from rdtLite.
  stopifnot(
    is.character(script_path),
    length(script_path) == 1L,
    !is.na(script_path)
  )
  if (!file.exists(script_path)) {
    stop("Script not found: ", script_path, call. = FALSE)
  }

  # 1. Setup paths ----
  # Absolute path of the script, recorded in the YAML report.
  # as.character() drops the fs_path class so YAML receives a plain string.
  full_script_path <- as.character(fs::path_abs(script_path))

  # rdtLite creates a folder named "prov_" + script name (without extension)
  # inside prov.dir, which is the current working directory here.
  script_name <- tools::file_path_sans_ext(basename(script_path))
  prov_dir_name <- paste0("prov_", script_name)
  prov_json_path <- file.path(prov_dir_name, "prov.json")

  # 5. Cleanup (registered up front) ----
  # on.exit() guarantees the provenance directory is removed even when the
  # script run, the JSON parsing, or the YAML write fails part-way through.
  on.exit(
    if (dir.exists(prov_dir_name)) {
      cat(sprintf("Cleaning up provenance directory: %s\n", prov_dir_name))
      unlink(prov_dir_name, recursive = TRUE)
    },
    add = TRUE
  )

  # 2. Run the script using rdtLite ----
  cat(sprintf("Running %s with provenance collection...\n", script_path))
  rdtLite::prov.run(script_path, prov.dir = ".", overwrite = TRUE)

  # 3. Parse the generated JSON ----
  if (!file.exists(prov_json_path)) {
    stop("Error: prov.json was not found. The script may have failed to run.")
  }

  prov_data <- jsonlite::fromJSON(prov_json_path)

  entities <- prov_data$entity
  used_block <- prov_data$used
  generated_block <- prov_data$wasGeneratedBy

  # Internal helper: for every record in a relationship block, look up the
  # entity it refers to and keep the location of those typed as "File".
  extract_files <- function(relationship_block, entity_key_label) {
    found <- lapply(seq_along(relationship_block), function(i) {
      entity_id <- relationship_block[[i]][[entity_key_label]]

      # A record without a usable entity reference cannot be resolved;
      # indexing with NULL would raise "subscript of length 0".
      if (!is.character(entity_id) || length(entity_id) != 1L) {
        return(NULL)
      }

      entity_details <- entities[[entity_id]]
      if (is.null(entity_details)) {
        return(NULL)
      }
      if (!identical(entity_details[["rdt:type"]], "File")) {
        return(NULL)
      }

      # Only the location is reported (the entity name is deliberately omitted).
      list(location = entity_details[["rdt:location"]])
    })

    # Drop the records that did not resolve to a file entity.
    Filter(Negate(is.null), found)
  }

  # "used" relations are the script's inputs, "wasGeneratedBy" its outputs.
  inputs <- extract_files(used_block, "prov:entity")
  outputs <- extract_files(generated_block, "prov:entity")

  # 4. Save to YAML ----
  yaml_data <- list(
    script = full_script_path,
    inputs = inputs,
    outputs = outputs
  )

  yaml::write_yaml(yaml_data, output_yaml_path)
  cat(sprintf("Summary saved to %s\n", output_yaml_path))

  # Match the documented contract instead of leaking unlink()'s status code.
  invisible(NULL)
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions