diff --git a/.github/workflows/dockerhub-on-version.yml b/.github/workflows/dockerhub-on-version.yml new file mode 100644 index 0000000..5e89d89 --- /dev/null +++ b/.github/workflows/dockerhub-on-version.yml @@ -0,0 +1,139 @@ +name: Build & Push Docker image on package version change + +on: + push: + branches: [ main, vcf_sanity_check ] + paths: + - .github/workflows/dockerhub-on-version.yml + - Dockerfile + - DESCRIPTION + workflow_dispatch: + +concurrency: + group: docker-${{ github.ref }} + cancel-in-progress: true + +env: + IMAGE: docker.io/breedinginsight/bigapp + IMAGE_DEPS: docker.io/breedinginsight/bigapp-deps + DEPS_TAG: r4.5-bioc3.21-2025-08 + +jobs: + check-version: + runs-on: ubuntu-latest + outputs: + changed: ${{ steps.ver.outputs.changed }} + version: ${{ steps.ver.outputs.version }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - id: ver + shell: bash + run: | + set -euo pipefail + cur_ver=$(sed -n 's/^Version:[[:space:]]*//p' DESCRIPTION | tr -d '[:space:]') + prev_sha="${{ github.event.before }}" + if git show "${prev_sha}:DESCRIPTION" >/dev/null 2>&1; then + prev_ver=$(git show "${prev_sha}:DESCRIPTION" | sed -n 's/^Version:[[:space:]]*//p' | tr -d '[:space:]') + else + prev_ver="" + fi + changed=false + if [[ -z "${prev_ver}" || "${cur_ver}" != "${prev_ver}" ]]; then changed=true; fi + echo "version=${cur_ver}" >> "$GITHUB_OUTPUT" + echo "changed=${changed}" >> "$GITHUB_OUTPUT" + + build-amd64: + needs: check-version + if: needs['check-version'].outputs.changed == 'true' + runs-on: ubuntu-latest + timeout-minutes: 120 + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE }} + tags: | + type=raw,value=v${{ needs['check-version'].outputs.version }}-amd64 + 
type=raw,value=sha-${{ github.sha }}-amd64 + - name: Build & push app (amd64) + uses: docker/build-push-action@v6 + with: + context: . + file: Dockerfile + platforms: linux/amd64 + push: true + build-args: | + BASE_IMAGE=${{ env.IMAGE_DEPS }}:${{ env.DEPS_TAG }} + tags: ${{ steps.meta.outputs.tags }} + cache-from: | + type=registry,ref=${{ env.IMAGE }}:buildcache-amd64 + cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache-amd64,mode=max,compression=zstd + + build-arm64: + needs: check-version + if: needs['check-version'].outputs.changed == 'true' + continue-on-error: false + runs-on: ubuntu-latest + timeout-minutes: 350 + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE }} + tags: | + type=raw,value=v${{ needs['check-version'].outputs.version }}-arm64 + type=raw,value=sha-${{ github.sha }}-arm64 + - name: Build & push app (arm64) + uses: docker/build-push-action@v6 + with: + context: . 
+ file: Dockerfile + platforms: linux/arm64 + push: true + build-args: | + BASE_IMAGE=${{ env.IMAGE_DEPS }}:${{ env.DEPS_TAG }} + tags: ${{ steps.meta.outputs.tags }} + cache-from: | + type=registry,ref=${{ env.IMAGE }}:buildcache-arm64 + cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache-arm64,mode=max,compression=zstd + + manifest: + needs: [check-version, build-amd64, build-arm64] + if: > + needs['check-version'].outputs.changed == 'true' && + needs['build-amd64'].result == 'success' && + needs['build-arm64'].result == 'success' + runs-on: ubuntu-latest + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Create multi-arch manifest + env: + IMAGE: ${{ env.IMAGE }} + PKG_VERSION: ${{ needs['check-version'].outputs.version }} + run: | + set -euo pipefail + docker buildx imagetools create \ + -t "$IMAGE:v${PKG_VERSION}" \ + -t "$IMAGE:latest" \ + "$IMAGE:v${PKG_VERSION}-amd64" \ + "$IMAGE:v${PKG_VERSION}-arm64" \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 47cd469..6fccfde 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGapp Title: Breeding Insight Genomics Shiny Application -Version: 1.3.0 +Version: 1.5.1 Authors@R: c( person(c("Alexander", "M."), "Sandercock", @@ -73,6 +73,6 @@ Imports: methods Remotes: github::jendelman/GWASpoly, - github::Breeding-Insight/BIGr, + bioc::Rsamtools Suggests: testthat diff --git a/Dockerfile b/Dockerfile index b09a0bc..d783a35 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,47 +1,25 @@ -FROM rocker/r-ver:4.4.2 -RUN apt-get update && apt-get install -y cmake libz-dev libcurl4-openssl-dev libssl-dev -RUN R -e 'install.packages("remotes")' -RUN Rscript -e 'remotes::install_version("adegenet",upgrade="never", version = "2.1.10")' -RUN Rscript -e 'remotes::install_version("curl",upgrade="never", version = "6.0.1")' -RUN Rscript -e 'remotes::install_version("DT",upgrade="never", version 
= "0.33")' -RUN Rscript -e 'remotes::install_version("dplyr",upgrade="never", version = "1.1.4")' -RUN Rscript -e 'remotes::install_version("vcfR",upgrade="never", version = "1.15.0")' -RUN Rscript -e 'remotes::install_version("ggplot2",upgrade="never", version = "3.5.1")' -RUN Rscript -e 'remotes::install_version("tidyr",upgrade="never", version = "1.3.1")' -RUN Rscript -e 'remotes::install_version("curl",upgrade="never", version = "6.0.1")' -RUN Rscript -e 'remotes::install_version("shiny",upgrade="never", version = "1.9.1")' -RUN Rscript -e 'remotes::install_version("config",upgrade="never", version = "0.3.2")' -RUN Rscript -e 'remotes::install_version("bs4Dash",upgrade="never", version = "2.3.4")' -RUN Rscript -e 'remotes::install_version("golem",upgrade="never", version = "0.5.1")' -RUN Rscript -e 'remotes::install_version("purrr",upgrade="never", version = "1.0.2")' -RUN Rscript -e 'remotes::install_version("markdown",upgrade="never", version = "1.13")' -RUN Rscript -e 'remotes::install_version("scales",upgrade="never", version = "1.3.0")' -RUN Rscript -e 'remotes::install_version("plotly",upgrade="never", version = "4.10.4")' -RUN Rscript -e 'remotes::install_version("shinyWidgets",upgrade="never", version = "0.8.7")' -RUN Rscript -e 'remotes::install_version("shinyjs",upgrade="never", version = "2.1.0")' -RUN Rscript -e 'remotes::install_version("shinydisconnect",upgrade="never", version = "0.1.1")' -RUN Rscript -e 'remotes::install_version("shinyalert",upgrade="never", version = "3.1.0")' -RUN Rscript -e 'remotes::install_version("stringr",upgrade="never", version = "1.5.1")' -RUN Rscript -e 'remotes::install_version("updog",upgrade="never", version = "2.1.5")' -RUN Rscript -e 'remotes::install_version("AGHmatrix",upgrade="never", version = "2.1.4")' -RUN Rscript -e 'remotes::install_version("factoextra",upgrade="never", version = "1.0.7")' -RUN Rscript -e 'remotes::install_version("httr",upgrade="never", version = "1.4.7")' -RUN Rscript -e 
'remotes::install_version("future",upgrade="never", version = "1.34.0")' -RUN Rscript -e 'remotes::install_version("shinycssloaders",upgrade="never", version = "1.1.0")' -RUN Rscript -e 'remotes::install_version("RColorBrewer",upgrade="never", version = "1.1.3")' -RUN Rscript -e 'remotes::install_version("tibble",upgrade="never", version = "3.2.1")' -RUN Rscript -e 'remotes::install_version("rrBLUP",upgrade="never", version = "4.6.3")' -RUN Rscript -e 'remotes::install_version("MASS",upgrade="never", version = "7.3.60.2")' -RUN Rscript -e 'remotes::install_version("Matrix",upgrade="never", version = "1.7.0")' -RUN Rscript -e 'remotes::install_version("matrixcalc",upgrade="never", version = "1.0.6")' -RUN Rscript -e 'remotes::install_github("Breeding-Insight/BIGr",upgrade="never")' -RUN Rscript -e 'remotes::install_github("jendelman/GWASpoly",upgrade="never")' +# syntax=docker/dockerfile:1.7 +# Buildx/Actions will pass BASE_IMAGE as a manifest tag that covers both arches +ARG BASE_IMAGE=docker.io/breedinginsight/bigapp-deps:r4.5-bioc3.21-2025-08 +FROM ${BASE_IMAGE} -RUN mkdir /build_zone -ADD . /build_zone -WORKDIR /build_zone -RUN R -e 'remotes::install_local(upgrade="never")' -RUN rm -rf /build_zone -EXPOSE 80 -CMD R -e "options('shiny.port'=80,shiny.host='0.0.0.0');BIGapp::run_app()" +SHELL ["/bin/bash","-eo","pipefail","-c"] +ENV DEBIAN_FRONTEND=noninteractive TZ=UTC +ENV MAKEFLAGS="-j2" R_PKG_INSTALL_ARGS="--no-build-vignettes --no-manual" + +# App install (only your code changes should rebuild this layer) +WORKDIR /app +COPY DESCRIPTION /app/ +# COPY NAMESPACE /app/ # if present, include for better cache hits +COPY . 
/app +RUN R -q -e "remotes::install_local('.', upgrade='never', dependencies=TRUE, \ + INSTALL_opts=c('--no-build-vignettes','--no-manual'))" +# Runtime +RUN useradd -m appuser +USER appuser +WORKDIR /app +EXPOSE 80 +HEALTHCHECK --interval=30s --timeout=5s --retries=5 \ + CMD wget -qO- http://localhost:${PORT:-80}/ || exit 1 +CMD ["R","-q","-e","options(shiny.port=as.integer(Sys.getenv('PORT','80')), shiny.host='0.0.0.0'); BIGapp::run_app()"] diff --git a/NAMESPACE b/NAMESPACE index 703183d..9399374 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,9 @@ # Generated by roxygen2: do not edit by hand +S3method(print,vcf_sanity_check) export(run_app) +export(vcf_sanity_check) +export(vcf_sanity_messages) import(BIGr) import(GWASpoly) import(dplyr) diff --git a/R/DosageCall_functions.R b/R/DosageCall_functions.R index 88e0b71..5cd18bf 100644 --- a/R/DosageCall_functions.R +++ b/R/DosageCall_functions.R @@ -22,77 +22,75 @@ polyRAD_dosage_call <- function(vcf, ploidy, model, p1 = NULL, p2 = NULL, backcross.gen = 0, intercross.gen = 0, selfing.gen = 0, contamRate = 0.001, min_ind_read = 1, min_ind_maf = 0, tol = 1e-05, session) { - + # Variables vcf_path <- vcf - ploidy <- ploidy - model <- model n.bx <- backcross.gen n.inter <- intercross.gen n.self <- selfing.gen - + # Having some issues formatting the output when multiallelic SNPs is input, so excluding for now temp_vcf <- vcfR::read.vcfR(vcf_path, verbose = FALSE) temp_vcf <- temp_vcf[is.biallelic(temp_vcf),] - + # Function to randomly assign non-matching bases for polyRAD assign_random_bases <- function(vcf) { # Get the number of rows in the VCF num_rows <- nrow(vcf@fix) - + # Generate random bases for REF and ALT bases <- c("A", "C", "G", "T") ref_bases <- sample(bases, num_rows, replace = TRUE) alt_bases <- character(num_rows) # Initialize an empty character vector - + # Ensure REF and ALT are different for each row for (i in 1:num_rows) { alt_bases[i] <- sample(bases[bases != ref_bases[i]], 1) } - + # Assign the 
new bases to the vcf object vcf@fix[, "REF"] <- ref_bases vcf@fix[, "ALT"] <- alt_bases - + return(vcf) } - + #Editing REF and ALT fields randomly temporarily if blank or AB ref_base <- temp_vcf@fix[1, "REF"] alt_base <- temp_vcf@fix[1, "ALT"] if (is.na(ref_base) || is.na(alt_base) || alt_base == "B") { - temp_vcf <- assign_random_bases(temp_vcf) + temp_vcf <- assign_random_bases(temp_vcf) } - + # Adding filtered VCF as a temp object temp_vcf_path <- tempfile(fileext = ".vcf.gz") vcfR::write.vcf(temp_vcf, file = temp_vcf_path) rm(temp_vcf) - + # vcfR gzipped the VCF, but polyRAD cannot accept gzipped, only unzipped or bgzipped. # Unzipping the VCF (see if there is a workaround because this is wasteful for memory) temp_unzipped_path <- tempfile(fileext = ".vcf") R.utils::gunzip(temp_vcf_path, destname = temp_unzipped_path, overwrite = TRUE) rm(temp_vcf_path) - + # Load the VCF file as RADobject # Retaining all markers with at least 1 individual with reads (minimal filtering at this stage) # Need to determine the best contamRate to use; currently using the default - polyRAD_obj <- polyRAD::VCF2RADdata(file = temp_unzipped_path, - min.ind.with.reads = min_ind_read, - min.ind.with.minor.allele = min_ind_maf, - taxaPloidy = ploidy, - contamRate = contamRate, - tol = tol, - phaseSNPs = FALSE - + polyRAD_obj <- VCF2RADdata(file = temp_unzipped_path, + min.ind.with.reads = min_ind_read, + min.ind.with.minor.allele = min_ind_maf, + taxaPloidy = ploidy, + contamRate = contamRate, + tol = tol, + phaseSNPs = FALSE + ) - + # Empty vcf from memory rm(temp_unzipped_path) - + # Perform QC - + # Test overdispersion # Need to see if we can adapt the to_test values based on the input VCF or RAD object values # Currently using a range of 2:20, but an error will be provbided if the optimal value is the max or min of this range @@ -106,10 +104,10 @@ polyRAD_dosage_call <- function(vcf, ploidy, model, p1 = NULL, p2 = NULL, to_test_max <- to_test_max + 20 OD <- 
TestOverdispersion(polyRAD_obj, to_test = to_test_min:to_test_max)$optimal } - + # Test HindHe myhindhe <- HindHe(polyRAD_obj) - + # Perform dosage calling # "If you expect that your species has high linkage disequilibrium, the functions IterateHWE_LD and IteratePopStructLD behave like IterateHWE and IteratePopStruct, respectively, but also update priors based on genotypes at linked loci." if (model == "IterateHWE") { @@ -126,7 +124,7 @@ polyRAD_dosage_call <- function(vcf, ploidy, model, p1 = NULL, p2 = NULL, if (!p1 %in% rownames(data.frame(polyRAD_obj$taxaPloidy))) { stop("Parent 1 is not a valid parent") } - + polyRAD_obj <- SetDonorParent(polyRAD_obj, p1) } if (!is.null(p2)) { @@ -134,27 +132,27 @@ polyRAD_dosage_call <- function(vcf, ploidy, model, p1 = NULL, p2 = NULL, if (!p2 %in% rownames(data.frame(polyRAD_obj$taxaPloidy))) { stop("Parent 2 is not a valid parent") } - + polyRAD_obj <- SetRecurrentParent(polyRAD_obj, p2) } - + rad <- PipelineMapping2Parents(polyRAD_obj, overdispersion = OD, n.gen.backcrossing = n.bx, n.gen.intermating = n.inter, n.gen.selfing = n.self) } - - + + # Get discrete genotypes pg <- GetProbableGenotypes(rad, naIfZeroReads = TRUE)$genotypes - + # Free up memory rm(rad) rm(polyRAD_obj) - + return(list(Genos = t(pg), RADHindHe = myhindhe)) - + } @@ -176,19 +174,20 @@ polyRAD_dosage_call <- function(vcf, ploidy, model, p1 = NULL, p2 = NULL, polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, session) { # Making the VCF # Appending the polyRAD info to the original VCF file and exporting - + #Format dosages gt <- data.frame(flip_dosage(geno, ploidy = ploidy), check.names = FALSE) - row.names(gt) <- sub("_[A,T,C,G,B]$", "", row.names(gt)) #Removing the appended allele to the SNP IDs - + #row.names(gt) <- sub("_[A,T,C,G,B]$", "", row.names(gt)) #Removing the appended allele to the SNP IDs + row.names(gt) <- sub("_[^_]+$", "", row.names(gt)) + #Format HindHe - colnames(hindhe.obj) <- sub("_[A,T,C,G,B]$", "", 
colnames(hindhe.obj)) #Removing the appended allele to the column + #colnames(hindhe.obj) <- sub("_[A,T,C,G,B]$", "", colnames(hindhe.obj)) #Removing the appended allele to the column hh <- data.frame(t(colMeans(hindhe.obj, na.rm = TRUE)), check.names = FALSE) hh <- hh[, row.names(gt)] - + #Getting information from VCF og_vcf <- read.vcfR(vcf_path, verbose = FALSE) - + # Updating Header # Extracting the header (meta) meta <- og_vcf@meta @@ -200,14 +199,14 @@ polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, ) # Custom FORMAT lines to add custom_format_lines <- c( - '##FORMAT=', + '##FORMAT=', '##FORMAT=' ) custom_version_lines <- c( paste0('##polyRAD','_',model,'=', packageVersion("polyRAD")), paste0('##BIGapp_polyRAD2vcf=', packageVersion("BIGapp")) ) - + #polyrad_meta <- paste0('##PolyRADCommandLine.multidog=') @@ -216,18 +215,18 @@ polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, # Sys.time(),'", CommandLine="> updog2vcf(',deparse(substitute(multidog.object)),',', # output.file, ',', # updog_version,')">') - + # Inserting the new header lines into the VCF header meta <- append(meta, custom_format_lines, after = FORMAT_index-1) #FORMAT INFO_index <- max(grep("^##INFO=", meta)) meta <- append(meta, custom_info_lines, after = INFO_index) #INFO FORMAT_index <- max(grep("^##FORMAT=", meta)) meta <- append(meta, custom_version_lines, after = FORMAT_index) - - + + #Make a header separate from the dataframe vcf_header <- meta - + #Make the VCF df vcf_df <- data.frame( CHROM = data.frame(og_vcf@fix)$CHROM, @@ -240,26 +239,26 @@ polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, INFO = data.frame(og_vcf@fix)$INFO, FORMAT = data.frame(og_vcf@gt)$FORMAT ) - + # Replace NA values in the data frame with "." vcf_df[is.na(vcf_df)] <- "." 
- + #Get FORMAT info format_df <- og_vcf@gt[,-1] - + # Add the IDs as row names for subsetting row.names(vcf_df) <- vcf_df$ID row.names(format_df) <- vcf_df$ID vcf_df <- vcf_df[row.names(gt),] format_df <- format_df[row.names(gt),] - + #Add the INFO column for each SNP vcf_df$INFO <- paste0(vcf_df$INFO,";", "HH=",t(hh)) - + #Add the FORMAT label for each SNP - vcf_df$FORMAT <- paste("GT","UD",og_vcf@gt[1, "FORMAT"],sep=":") - + vcf_df$FORMAT <- paste("GTP","UD",og_vcf@gt[1, "FORMAT"],sep=":") + #Convert genotypes from dosage to gt precompute_genotype_strings <- function(ploidy) { genotype_strings <- character(ploidy + 1) @@ -272,31 +271,31 @@ polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, } return(genotype_strings) } - + # Apply the precomputed genotype strings to the matrix convert_dosage2gt <- function(dosage_matrix, ploidy) { dosage_matrix <- as.matrix(dosage_matrix) genotype_strings <- precompute_genotype_strings(ploidy) - + # Create missing GT missing_gt_string <- paste(rep(".", ploidy), collapse = "/") - + # Handle missing values separately genotype_matrix <- matrix(genotype_strings[dosage_matrix + 1], nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix)) genotype_matrix[is.na(dosage_matrix)] <- missing_gt_string # Handle missing values - + # Retain row and column names rownames(genotype_matrix) <- rownames(dosage_matrix) colnames(genotype_matrix) <- colnames(dosage_matrix) - + return(genotype_matrix) } - + # Convert the dosage matrix to genotypes gt_df <- convert_dosage2gt(gt, ploidy) # Replace NA values in the data frame with "." gt[is.na(gt)] <- "." 
- + #Combine info from the matrices to form the VCF information for each sample # Combine the matrices into a single matrix with elements separated by ":" make_vcf_format <- function(matrix1, matrix2, matrix3) { @@ -304,66 +303,66 @@ polyRAD2vcf <- function(geno, model, vcf_path, hindhe.obj, ploidy, output.file, if (!is.matrix(matrix1) || !is.matrix(matrix2) || !is.matrix(matrix3)) { stop("All inputs must be matrices.") } - + # Check if matrices have the same dimensions (optional, but recommended) if (!all(dim(matrix1) == dim(matrix2), dim(matrix1) == dim(matrix3))) { stop("Input matrices must have the same dimensions.") } - + # Check if matrices have row and column names (optional, based on problem description) if (is.null(rownames(matrix1)) || is.null(colnames(matrix1)) || is.null(rownames(matrix2)) || is.null(colnames(matrix2)) || is.null(rownames(matrix3)) || is.null(colnames(matrix3))) { stop("Input matrices must have row and column names.") } - + # 1. Convert matrices to vectors vector1 <- as.vector(matrix1) vector2 <- as.vector(matrix2) vector3 <- as.vector(matrix3) - + # 2. Combine the vectors element-wise with ":" as separator combined_vector <- paste(vector1, vector2, vector3, sep = ":") - + # 3. Reshape the combined vector back into a matrix with the original dimensions combined_matrix <- matrix(combined_vector, nrow = nrow(matrix1), ncol = ncol(matrix1)) - + # 4. Convert the combined matrix to a dataframe combined_dataframe <- as.data.frame(combined_matrix) - + # 5. 
Set the row and column names for the dataframe rownames(combined_dataframe) <- rownames(matrix1) colnames(combined_dataframe) <- colnames(matrix1) - + return(combined_dataframe) } - + # Combine the matrices # First get each item that exists in the VCF FORMAT #format_fields <- strsplit(og_vcf@gt[1, "FORMAT"], ":")[[1]] geno_df <- make_vcf_format(gt_df, as.matrix(gt), format_df - ) - + ) + #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) - + # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" - + # Sort vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] - + # Write the header to the file # Make sure that .vcf is at the end of the file name output.file <- paste0(output.file, ".vcf") writeLines(vcf_header, con = output.file) - + # Append the dataframe to the file in tab-separated format suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - - + + } diff --git a/R/GS_functions.R b/R/GS_functions.R index 9893a24..ddd2340 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -200,8 +200,12 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat if (type == "Gmatrix") { #Convert normalized genotypes to relationship matrix #By default, it removes SNPs with more than 50% missing data and imputes using the mean - Geno.mat <- A.mat(t(geno_input)) # only diploids - #Geno.mat <- Gmatrix(t(geno_input), method = "VanRaden", ploidy = ploidy) + Geno.mat <- Gmatrix(t(geno_input), + method = "VanRaden", + ploidy = ploidy, + ploidy.correction=TRUE, + ratio = FALSE, + missingValue = "NA") return(Geno.mat) }else if (type == "Amatrix") { @@ -230,7 +234,11 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat #Using Gmatrix to get the Gmatrix instead of A.mat for consistency #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
- G.mat <- Gmatrix(t(geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + G.mat <- Gmatrix(t(geno_input[ ,valid_ids]), + method = "VanRaden", + ploidy = as.numeric(ploidy), + missingValue = "NA", + ploidy.correction = TRUE) G.mat <- round(G.mat,3) #to be easy to invert #Computing H matrix (Martini) - Using the name Geno.mat for consistency @@ -374,7 +382,7 @@ assign_colors <- function(color){ format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy){ if(is.null(pred_matrix)) pred_matrix <- "none_selected" - if(model == "rrBLUP" | (model == "GBLUP" & pred_matrix == "Gmatrix")) { + if(model == "rrBLUP" & ploidy == 2 & pred_matrix == "Gmatrix") { #if(model == "rrBLUP") { geno_formated <- 2 * (geno / as.numeric(ploidy)) - 1 # codification -1 0 1 } else { diff --git a/R/app_ui.R b/R/app_ui.R index e32c5c6..55d43cb 100644 --- a/R/app_ui.R +++ b/R/app_ui.R @@ -103,8 +103,9 @@ app_ui <- function(request) { ) ), left = div( - style = "display: flex; align-items: center; height: 100%;", # Center the version text vertically - "v1.3.0") + style = "display: flex; align-items: center; height: 100%;", + sprintf("v%s", as.character(utils::packageVersion("BIGapp"))) + ) ), dashboardBody( disconnectMessage(), #Adds generic error message for any error if not already accounted for diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index ac107ea..67a1af4 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -265,6 +265,11 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ tol = 1e-05 ) + #VCF items + vcf_items <- reactiveValues( + RefAlt_df = NULL + ) + #UI popup window for input observeEvent(input$advanced_options, { showModal(modalDialog( @@ -392,12 +397,40 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ matrices <- list() #Import genotype information if in VCF format + #### VCF sanity check + checks <- vcf_sanity_check(madc_file, depth_support_fields = 
c("AD","RA"), max_markers = 10000) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", + "samples", "chrom_info", "pos_info", "allele_counts", "VCF_compressed" + ) + + error_if_true <- c( + "duplicated_samples", "duplicated_markers" + ) + warning_if_false <- c("ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = NULL) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + vcf <- read.vcfR(madc_file, verbose = FALSE) #Get items in FORMAT column info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT chrom <- vcf@fix[,1] pos <- vcf@fix[,2] + #Retain RefAlt df for later use + RefAlt_df <- data.frame(vcf@fix) + RefAlt_df <- RefAlt_df[,c(1,2,4,5)] + colnames(RefAlt_df) <- c("chr","pos","ref","alt") + vcf_items$RefAlt_df <- RefAlt_df info_ids <- extract_info_ids(info[1]) @@ -600,7 +633,7 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ } updateProgressBar(session = session, id = "pb_madc", value = 35, title = "Performing Dosage Calling") - polyrad_items <- polyRAD_dosage_call(vcf = polyrad_items$vcf_path, + polyrad_results <- polyRAD_dosage_call(vcf = polyrad_items$vcf_path, ploidy = input$ploidy, model = input$polyRAD_model, p1 = input$parent1, @@ -613,7 +646,8 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ min_ind_maf = advanced_options$min_ind_maf, tol=advanced_options$tol) updateProgressBar(session = session, id = "pb_madc", value = 100, title = "Finished") - polyrad_items + polyrad_results$vcf_path <- polyrad_items$vcf_path + polyrad_results } }) @@ -637,6 +671,7 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ multidog.object = updog_out(), output.file = temp, updog_version = packageVersion("updog"), + RefAlt = vcf_items$RefAlt_df, compress = FALSE ) diff --git a/R/mod_Filtering.R 
b/R/mod_Filtering.R index 373ff2d..b040996 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -102,7 +102,9 @@ mod_Filtering_ui <- function(id){ ), box(title = "Status", width =12, collapsible = TRUE, status = "info", progressBar(id = ns("pb_filter"), value = 0, status = "info", display_pct = TRUE, striped = TRUE, title = " ") - ) + ), + # A placeholder for the download button. It will be rendered in the shinyalert modal. + uiOutput(ns("download_ui_placeholder")) ) ) ) @@ -178,6 +180,35 @@ mod_Filtering_server <- function(input, output, session, parent_session){ #Get list of sample names from VCF file observeEvent(input$updog_rdata, { + #### VCF sanity check + checks <- vcf_sanity_check(input$updog_rdata$datapath, + max_markers = 16000, + depth_support_fields = c("DP", "AD", "RA")) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "chrom_info", "pos_info", "VCF_compressed", "allele_counts" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL) + + print(checks) + print(checks_result) + if(checks_result) return() # Stop the analysis if checks fail + ######### + + #populate preview_data preview_vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE, nrows = 1) @@ -262,7 +293,8 @@ mod_Filtering_server <- function(input, output, session, parent_session){ raw_sample_miss_df = NULL, maf_df = NULL, raw_maf_df = NULL, - format_fields = NULL + format_fields = NULL, + removed_names = NULL ) @@ -343,6 +375,23 @@ mod_Filtering_server <- function(input, output, session, parent_session){ updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) filtering_files$format_fields <- updog_par + if(length(updog_par) 
> 1) { + shinyalert( + title = "Malformed VCF", + text = "Make sure all your markers have the same information in the FORMAT field.", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + if (input$use_updog & updog_par) { # Use Updog filtering parameters OD_filter <- as.numeric(input$OD_filter) @@ -390,7 +439,7 @@ mod_Filtering_server <- function(input, output, session, parent_session){ #Filtering #Raw VCF info gt_matrix <- extract.gt(filterVCF(vcf, ploidy = ploidy,filter.DP = as.numeric(size_depth), output.file = NULL), element = "GT", as.numeric = FALSE) - starting_samples <- ncol(gt_matrix) + starting_samples <- colnames(gt_matrix) filtering_files$raw_snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values filtering_files$raw_sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values @@ -454,7 +503,7 @@ mod_Filtering_server <- function(input, output, session, parent_session){ gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values - final_samples <- ncol(gt_matrix) + final_samples <- colnames(gt_matrix) rm(gt_matrix) #Remove gt matrix #Pb @@ -473,23 +522,32 @@ mod_Filtering_server <- function(input, output, session, parent_session){ }) #User warning if samples were removed during filtering - sample_removed <- starting_samples - final_samples + sample_removed <- length(starting_samples) - length(final_samples) + removed_names <- setdiff(starting_samples, final_samples) + filtering_files$removed_names <- removed_names + + # Define the download handler + output$download_removed_samples <- downloadHandler( + filename = function() { + "removed_samples.txt" + }, + content = 
function(file) { + if (!is.null(filtering_files$removed_names)) { + writeLines(filtering_files$removed_names, file) + } + } + ) if (sample_removed > 0 && removed_samples == 0) { - shinyalert( + showModal(modalDialog( title = "Samples Filtered", - text = paste(sample_removed, "samples were removed during filtering."), - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) + footer = tagList( + modalButton("OK"), + # Use a downloadButton instead of a manual tag + downloadButton(ns("download_removed_samples"), "Save Sample List") + ), + paste(sample_removed, "samples were removed during filtering.") + )) } else if (sample_removed > 0 && removed_samples > 0) { shinyalert( title = "Samples Filtered", @@ -523,8 +581,7 @@ mod_Filtering_server <- function(input, output, session, parent_session){ tabPanel("Results", p("Upload VCF file to access results in this section.")) ) } else { - - if (!is.null(filtering_files$format_fields) && filtering_files$format_fields == TRUE && input$vcf_type == "Unfiltered VCF") { + if (!any(is.null(filtering_files$format_fields)) && any(filtering_files$format_fields == TRUE) && input$vcf_type == "Unfiltered VCF") { # Include "Bias Histogram", "OD Histogram", and "Prop_mis Histogram" for Pre-Filtered VCF tabBox( width = 12, collapsible = FALSE, status = "info", @@ -944,6 +1001,7 @@ mod_Filtering_server <- function(input, output, session, parent_session){ writeLines(paste(capture.output(filtering_summary_info()), collapse = "\n"), file) } ) + } ## To be copied in the UI diff --git a/R/mod_GS.R b/R/mod_GS.R index c0b309c..7367186 100644 --- a/R/mod_GS.R +++ b/R/mod_GS.R @@ -15,7 +15,7 @@ #' @import shinydisconnect #' #' -mod_GS_ui <- function(id){ +mod_GS_ui <- function(id) { ns <- NS(id) tagList( fluidRow( @@ -28,106 +28,109 @@ mod_GS_ui <- function(id){ 
overlayOpacity = 0.3, refreshColour = "purple" ), - column(width = 3, - box(title="Inputs", width = 12, collapsible = TRUE, collapsed = FALSE, status = "info", solidHeader = TRUE, - "* Required", - fileInput(ns("pred_known_file"), "Choose VCF File*", accept = c(".csv",".vcf",".gz")), - fileInput(ns("pred_trait_file"), "Choose Trait File*", accept = ".csv"), - numericInput(ns("pred_est_ploidy"), "Species Ploidy*", min = 1, value = NULL), - virtualSelectInput( - inputId = ns("pred_trait_info2"), - label = "Select Trait*", - choices = NULL, - showValueAsTags = TRUE, - search = TRUE, - multiple = TRUE - ), - virtualSelectInput( - inputId = ns("pred_fixed_info2"), - label = span("Select Fixed Effects", bs4Badge("beta", position = "right", color = "success")), - choices = NULL, - showValueAsTags = TRUE, - search = TRUE, - multiple = TRUE - ), - conditionalPanel( - condition = "input.pred_fixed_info2.length > 0", ns = ns, - div( - "(unselected will be considered covariates)", - virtualSelectInput( - inputId = ns("pred_fixed_cat2"), - label = "Select Categorical Fixed Effects", - choices = NULL, - showValueAsTags = TRUE, - search = TRUE, - multiple = TRUE - ) - ) - ), - actionButton(ns("prediction_est_start"), "Run Analysis"), - div(style="display:inline-block; float:right",dropdownButton( - HTML("Input files"), - p(downloadButton(ns('download_vcf'),""), "VCF Example File"), - p(downloadButton(ns('download_pheno'),""), "Trait Example File"), - p(downloadButton(ns('download_vcfp'), ""), "Download Prediction VCF Example File"),hr(), - p(HTML("Parameters description:"), actionButton(ns("goPar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), - p(HTML("Results description:"), actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), - p(HTML("How to cite:"), actionButton(ns("goCite"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), - actionButton(ns("pred_summary"), "Summary"), - circle = FALSE, - status = 
"warning", - icon = icon("info"), width = "300px", - tooltip = tooltipOptions(title = "Click to see info!") - )), - tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line - div(style="text-align: left; margin-top: 10px;", - actionButton(ns("advanced_options_pred"), - label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), - style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" - ) - ) - - ) + column( + width = 3, + box( + title = "Inputs", width = 12, collapsible = TRUE, collapsed = FALSE, status = "info", solidHeader = TRUE, + "* Required", + fileInput(ns("pred_known_file"), "Choose VCF File*", accept = c(".csv", ".vcf", ".gz")), + fileInput(ns("pred_trait_file"), "Choose Trait File*", accept = ".csv"), + numericInput(ns("pred_est_ploidy"), "Species Ploidy*", min = 1, value = NULL), + virtualSelectInput( + inputId = ns("pred_trait_info2"), + label = "Select Trait*", + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = TRUE + ), + virtualSelectInput( + inputId = ns("pred_fixed_info2"), + label = span("Select Fixed Effects", bs4Badge("beta", position = "right", color = "success")), + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = TRUE + ), + conditionalPanel( + condition = "input.pred_fixed_info2.length > 0", ns = ns, + div( + "(unselected will be considered covariates)", + virtualSelectInput( + inputId = ns("pred_fixed_cat2"), + label = "Select Categorical Fixed Effects", + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = TRUE + ) + ) + ), + actionButton(ns("prediction_est_start"), "Run Analysis"), + div(style = "display:inline-block; float:right", dropdownButton( + HTML("Input files"), + p(downloadButton(ns("download_vcf"), ""), "VCF Example File"), + p(downloadButton(ns("download_pheno"), ""), "Trait Example File"), + 
p(downloadButton(ns("download_vcfp"), ""), "Download Prediction VCF Example File"), hr(), + p(HTML("Parameters description:"), actionButton(ns("goPar"), icon("arrow-up-right-from-square", verify_fa = FALSE))), hr(), + p(HTML("Results description:"), actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE))), hr(), + p(HTML("How to cite:"), actionButton(ns("goCite"), icon("arrow-up-right-from-square", verify_fa = FALSE))), hr(), + actionButton(ns("pred_summary"), "Summary"), + circle = FALSE, + status = "warning", + icon = icon("info"), width = "300px", + tooltip = tooltipOptions(title = "Click to see info!") + )), + tags$hr(style = "border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line + div( + style = "text-align: left; margin-top: 10px;", + actionButton(ns("advanced_options_pred"), + label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), + style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" + ) + ) + ) ), - - column(width = 6, - box(title = "Results", status = "info", solidHeader = FALSE, width = 12, height = 600, maximizable = T, - bs4Dash::tabsetPanel( - tabPanel("Predicted Pheno Table", DTOutput(ns("pred_trait_table")), style = "overflow-y: auto; height: 500px"), - tabPanel("EBVs Table", DTOutput(ns("pred_gebvs_table2")),style = "overflow-y: auto; height: 500px") - - ) - ) + column( + width = 6, + box( + title = "Results", status = "info", solidHeader = FALSE, width = 12, height = 600, maximizable = T, + bs4Dash::tabsetPanel( + tabPanel("Predicted Pheno Table", DTOutput(ns("pred_trait_table")), style = "overflow-y: auto; height: 500px"), + tabPanel("EBVs Table", DTOutput(ns("pred_gebvs_table2")), style = "overflow-y: auto; height: 500px") + ) + ) ), - - column(width = 3, - valueBoxOutput("shared_snps", width = NULL), - box(title = "Status", width = 12, collapsible = TRUE, status = "info", - 
progressBar(id = ns("pb_gp"), value = 0, status = "info", display_pct = TRUE, striped = TRUE, title = " ") - ), - box(title = "Plot Controls", status = "warning", solidHeader = TRUE, collapsible = TRUE, width = 12, - div(style="display:inline-block; float:left",dropdownButton( - tags$h3("Save Files"), - fluidRow( - downloadButton(ns("download_pred_results_file"), "Save Files")), - circle = FALSE, - status = "danger", - icon = icon("floppy-disk"), width = "300px", label = "Save", - tooltip = tooltipOptions(title = "Click to see inputs!") - )) - ) - + column( + width = 3, + valueBoxOutput(ns("shared_snps"), width = NULL), + box( + title = "Status", width = 12, collapsible = TRUE, status = "info", + progressBar(id = ns("pb_gp"), value = 0, status = "info", display_pct = TRUE, striped = TRUE, title = " ") + ), + box( + title = "Plot Controls", status = "warning", solidHeader = TRUE, collapsible = TRUE, width = 12, + div(style = "display:inline-block; float:left", dropdownButton( + tags$h3("Save Files"), + fluidRow( + downloadButton(ns("download_pred_results_file"), "Save Files") + ), + circle = FALSE, + status = "danger", + icon = icon("floppy-disk"), width = "300px", label = "Save", + tooltip = tooltipOptions(title = "Click to see inputs!") + )) + ) ) - ) - ) } #' GS Server Functions #' #' @importFrom vcfR read.vcfR extract.gt -#' @importFrom rrBLUP mixed.solve A.mat +#' @importFrom rrBLUP mixed.solve kin.blup +#' @importFrom AGHmatrix Gmatrix #' @importFrom stats cor #' @importFrom shinyalert shinyalert #' @import dplyr @@ -135,80 +138,93 @@ mod_GS_ui <- function(id){ #' @import tidyr #' @importFrom DT renderDT #' @noRd -mod_GS_server <- function(input, output, session, parent_session){ - +mod_GS_server <- function(input, output, session, parent_session) { ns <- session$ns - + # Help links observeEvent(input$goPar, { # change to help tab - updatebs4TabItems(session = parent_session, inputId = "MainMenu", - selected = "help") - + updatebs4TabItems( + session = parent_session, 
inputId = "MainMenu", + selected = "help" + ) + # select specific tab - updateTabsetPanel(session = parent_session, inputId = "Genomic_Prediction_tabset", - selected = "Genomic_Prediction_par") + updateTabsetPanel( + session = parent_session, inputId = "Genomic_Prediction_tabset", + selected = "Genomic_Prediction_par" + ) # expand specific box updateBox(id = "Genomic_Prediction_box", action = "toggle", session = parent_session) }) - + observeEvent(input$goRes, { # change to help tab - updatebs4TabItems(session = parent_session, inputId = "MainMenu", - selected = "help") - + updatebs4TabItems( + session = parent_session, inputId = "MainMenu", + selected = "help" + ) + # select specific tab - updateTabsetPanel(session = parent_session, inputId = "Genomic_Prediction_tabset", - selected = "Genomic_Prediction_results") + updateTabsetPanel( + session = parent_session, inputId = "Genomic_Prediction_tabset", + selected = "Genomic_Prediction_results" + ) # expand specific box updateBox(id = "Genomic_Prediction_box", action = "toggle", session = parent_session) }) - + observeEvent(input$goCite, { # change to help tab - updatebs4TabItems(session = parent_session, inputId = "MainMenu", - selected = "help") - + updatebs4TabItems( + session = parent_session, inputId = "MainMenu", + selected = "help" + ) + # select specific tab - updateTabsetPanel(session = parent_session, inputId = "Genomic_Prediction_tabset", - selected = "Genomic_Prediction_cite") + updateTabsetPanel( + session = parent_session, inputId = "Genomic_Prediction_tabset", + selected = "Genomic_Prediction_cite" + ) # expand specific box updateBox(id = "Genomic_Prediction_box", action = "toggle", session = parent_session) }) - - #Default model choices + + # Default model choices advanced_options_pred <- reactiveValues( pred_model = "GBLUP", pred_matrix = "Gmatrix", pred_est_file = NULL, ped_file = NULL ) - - pred_outputs <- reactiveValues(corr_output = NULL, - comb_output = NULL, - all_GEBVs = NULL, - avg_GEBVs = 
NULL) - - #List the ped file name if previously uploaded + + pred_outputs <- reactiveValues( + corr_output = NULL, + comb_output = NULL, + all_GEBVs = NULL, + avg_GEBVs = NULL + ) + + # List the ped file name if previously uploaded output$uploaded_file_name <- renderText({ if (!is.null(advanced_options_pred$ped_file)) { paste("Previously uploaded file:", advanced_options_pred$ped_file$name) } else { - "" # Return an empty string if no file has been uploaded + "" # Return an empty string if no file has been uploaded } }) - + output$uploaded_file_name_pred <- renderText({ if (!is.null(advanced_options_pred$pred_est_file)) { paste("Previously uploaded file:", advanced_options_pred$pred_est_file$name) } else { - "" # Return an empty string if no file has been uploaded + "" # Return an empty string if no file has been uploaded } }) ## Trait file modal window - #Default choices + # Default choices trait_options <- reactiveValues( missing_data = "NA", custom_missing = NULL, @@ -216,37 +232,39 @@ mod_GS_server <- function(input, output, session, parent_session){ file_type = NULL ) - #UI popup window for input + # UI popup window for input observeEvent(input$pred_trait_file, { req(input$pred_trait_file) - #Get the column names of the csv file - info_df <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, nrows=2) - info_df[,1] <- as.character(info_df[,1]) #Makes sure that the sample names are characters instead of numeric + # Get the column names of the csv file + info_df <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, nrows = 2) + info_df[, 1] <- as.character(info_df[, 1]) # Makes sure that the sample names are characters instead of numeric # Read first 5 rows for preview - preview_data <- tryCatch({ - head(read.csv(input$pred_trait_file$datapath, nrows = 5, na.strings=trait_options$missing_data),5) - }, error = function(e) { - NULL - }) + preview_data <- tryCatch( + { + 
head(read.csv(input$pred_trait_file$datapath, nrows = 5, na.strings = trait_options$missing_data), 5) + }, + error = function(e) { + NULL + } + ) showModal(modalDialog( title = "Trait File Options", - size= "l", - + size = "l", selectInput( - inputId = ns('missing_data'), - label = 'Missing Data Value', - choices = c("NA",".","-99","(blank)","Custom"), - selected = trait_options$missing_data # Initialize with stored value + inputId = ns("missing_data"), + label = "Missing Data Value", + choices = c("NA", ".", "-99", "(blank)", "Custom"), + selected = trait_options$missing_data # Initialize with stored value ), conditionalPanel( condition = "input.missing_data == 'Custom'", ns = ns, div( textInput( - inputId = ns('custom_missing'), - label = 'Custom Missing Value', - value = trait_options$custom_missing # Initialize with stored value + inputId = ns("custom_missing"), + label = "Custom Missing Value", + value = trait_options$custom_missing # Initialize with stored value ) ), div( @@ -256,11 +274,10 @@ mod_GS_server <- function(input, output, session, parent_session){ ) ), selectInput( - inputId = ns('sample_column'), - label = 'Sample ID Column', + inputId = ns("sample_column"), + label = "Sample ID Column", choices = colnames(info_df) ), - if (!is.null(preview_data)) { div( h4( @@ -280,7 +297,6 @@ mod_GS_server <- function(input, output, session, parent_session){ p("Could not load file preview.") ) }, - footer = tagList( actionButton(ns("save_trait_options"), "Save") ) @@ -291,7 +307,6 @@ mod_GS_server <- function(input, output, session, parent_session){ req(preview_data) preview_data }) - }) output$custom_missing_msg <- renderText({ @@ -303,12 +318,12 @@ mod_GS_server <- function(input, output, session, parent_session){ }) - #Close popup window when user "saves options" + # Close popup window when user "saves options" observeEvent(input$save_trait_options, { trait_options$missing_data <- input$missing_data trait_options$custom_missing <- input$custom_missing 
trait_options$sample_column <- input$sample_column - #trait_options$file_type + # trait_options$file_type # Save other inputs as needed if (input$missing_data == "Custom" && nchar(input$custom_missing) == 0) { @@ -321,38 +336,38 @@ mod_GS_server <- function(input, output, session, parent_session){ return() # Stop further execution and keep the modal open } - removeModal() # Close the modal after saving + removeModal() # Close the modal after saving }) - - #UI popup window for input + + # UI popup window for input observeEvent(input$advanced_options_pred, { showModal(modalDialog( title = "Advanced Options (beta)", selectInput( - inputId = ns('pred_model'), - label = 'Method Choice', + inputId = ns("pred_model"), + label = "Method Choice", choices = c("GBLUP"), - selected = advanced_options_pred$pred_model # Initialize with stored value + selected = advanced_options_pred$pred_model # Initialize with stored value ), conditionalPanel( condition = "input.pred_model == 'GBLUP'", ns = ns, div( selectInput( - inputId = ns('pred_matrix'), - label = 'Relationship Matrix Choice', - #choices = c("Gmatrix", "Amatrix", "Hmatrix"), + inputId = ns("pred_matrix"), + label = "Relationship Matrix Choice", + # choices = c("Gmatrix", "Amatrix", "Hmatrix"), choices = c("Gmatrix"), - selected = advanced_options_pred$pred_matrix # Initialize with stored value + selected = advanced_options_pred$pred_matrix # Initialize with stored value ) ) ), conditionalPanel( condition = "input.pred_matrix != 'Amatrix'", ns = ns, div( - fileInput(ns("pred_est_file"), "Choose Prediction VCF", accept = c(".vcf",".gz")), + fileInput(ns("pred_est_file"), "Choose Prediction VCF", accept = c(".vcf", ".gz")), conditionalPanel( condition = "output.uploaded_file_name_pred !== ''", # Show only if there's content - textOutput(ns("uploaded_file_name_pred")) # Display the uploaded file name + textOutput(ns("uploaded_file_name_pred")) # Display the uploaded file name ) ) ), @@ -362,7 +377,7 @@ mod_GS_server <- 
function(input, output, session, parent_session){ fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), conditionalPanel( condition = "output.uploaded_file_name !== ''", # Show only if there's content - textOutput(ns("uploaded_file_name")) # Display the uploaded file name + textOutput(ns("uploaded_file_name")) # Display the uploaded file name ) ) ), @@ -372,28 +387,28 @@ mod_GS_server <- function(input, output, session, parent_session){ ) )) }) - - - - #Close popup window when user "saves options" + + + + # Close popup window when user "saves options" observeEvent(input$save_advanced_options_pred, { advanced_options_pred$pred_model <- input$pred_model advanced_options_pred$pred_matrix <- input$pred_matrix advanced_options_pred$pred_est_file <- input$pred_est_file advanced_options_pred$ped_file <- input$ped_file # Save other inputs as needed - - removeModal() # Close the modal after saving + + removeModal() # Close the modal after saving }) - - ###Genomic Prediction - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction - - - #1) Get traits + + ### Genomic Prediction + # This tab involved 3 observeEvents + # 1) to get the traits listed in the phenotype file + # 2) to input and validate the input files + # 3) to perform the genomic prediction + + + # 1) Get traits observeEvent(input$save_trait_options, { info_df2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) sample_col_name <- input$sample_column @@ -401,12 +416,12 @@ mod_GS_server <- function(input, output, session, parent_session){ updateVirtualSelect("pred_fixed_info2", choices = trait_var2, session = session) updateVirtualSelect("pred_trait_info2", choices = trait_var2, session = session) }) - + observeEvent(input$pred_fixed_info2, { updateVirtualSelect("pred_fixed_cat2", choices = input$pred_fixed_info2, session = session) }) - - #2) 
Error check for prediction and save input files + + # 2) Error check for prediction and save input files continue_prediction2 <- reactiveVal(NULL) pred_inputs2 <- reactiveValues( pheno_input = NULL, @@ -417,7 +432,7 @@ mod_GS_server <- function(input, output, session, parent_session){ pred_geno_pheno = NULL, matrix = NULL ) - + pred_outputs2 <- reactiveValues( corr_output = NULL, box_plot = NULL, @@ -428,8 +443,8 @@ mod_GS_server <- function(input, output, session, parent_session){ colors = NULL, trait_output = NULL ) - - #Reactive boxes + + # Reactive boxes output$shared_snps <- renderValueBox({ valueBox( value = pred_inputs2$shared_snps, @@ -438,12 +453,12 @@ mod_GS_server <- function(input, output, session, parent_session){ color = "info" ) }) - + observeEvent(input$prediction_est_start, { - #req(pred_inputs$pheno_input, pred_inputs$geno_input) - + # req(pred_inputs$pheno_input, pred_inputs$geno_input) + toggleClass(id = "pred_est_ploidy", class = "borderred", condition = (is.na(input$pred_est_ploidy) | is.null(input$pred_est_ploidy))) - + if (is.null(input$pred_known_file$datapath) | is.null(input$pred_trait_file$datapath)) { shinyalert( title = "Missing input!", @@ -461,11 +476,11 @@ mod_GS_server <- function(input, output, session, parent_session){ ) } req(input$pred_known_file$datapath, input$pred_trait_file$datapath, input$pred_est_ploidy) - - #Status + + # Status updateProgressBar(session = session, id = "pb_gp", value = 5, title = "Checking input files") - - #Variables + + # Variables ploidy <- as.numeric(input$pred_est_ploidy) train_geno_path <- input$pred_known_file$datapath est_geno_path <- input$pred_est_file$datapath @@ -473,7 +488,7 @@ mod_GS_server <- function(input, output, session, parent_session){ # Phenotypic variables if (trait_options$missing_data == "(blank)") { - pheno2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, na.strings="") + pheno2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, 
check.names = FALSE, na.strings = "") } else if (trait_options$missing_data == "Custom") { pheno2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, na.strings = trait_options$custom_missing) } else { @@ -485,32 +500,35 @@ mod_GS_server <- function(input, output, session, parent_session){ pheno2 <- pheno2[, c(sample_col_name, setdiff(names(pheno2), sample_col_name))] # Add row names and catch errors - tryCatch({ - row.names(pheno2) <- pheno2[, 1] - }, warning = function(w) { - showNotification( - paste("Warning: Duplicate row names detected. Please ensure the sample ID column has unique values.", w$message), - type = "warning", - duration = NULL - ) - return(NULL) # Return NULL to prevent further processing - }, error = function(e) { - showNotification( - paste("Error: An error occurred while assigning row names. Please check your sample ID column.", e$message), - type = "error", - duration = NULL - ) - return(NULL) # Return NULL to prevent further processing - }) + tryCatch( + { + row.names(pheno2) <- pheno2[, 1] + }, + warning = function(w) { + showNotification( + paste("Warning: Duplicate row names detected. Please ensure the sample ID column has unique values.", w$message), + type = "warning", + duration = NULL + ) + return(NULL) # Return NULL to prevent further processing + }, + error = function(e) { + showNotification( + paste("Error: An error occurred while assigning row names. 
Please check your sample ID column.", e$message), + type = "error", + duration = NULL + ) + return(NULL) # Return NULL to prevent further processing + } + ) # Assigning the IDs based on user input for column 1 ids_pheno <- pheno2[, 1] - - - #Make sure at least one trait was input + + + # Make sure at least one trait was input if (length(traits) == 0) { - # If condition is met, show notification toast shinyalert( title = "Missing input!", @@ -527,30 +545,27 @@ mod_GS_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE, ) - - + + # Stop the observeEvent gracefully return() - } - - - #Getting genotype matrix - - #Geno file path + + + # Getting genotype matrix + + # Geno file path file_path <- train_geno_path - - #Geno.file conversion if needed + + # Geno.file conversion if needed if (grepl("\\.csv$", file_path)) { train_geno <- read.csv(train_geno_path, header = TRUE, row.names = 1, check.names = FALSE) est_geno <- read.csv(est_geno_path, header = TRUE, row.names = 1, check.names = FALSE) - - #Save number of SNPs - #pred_inputs$pred_snps <- nrow(geno) - + + # Save number of SNPs + # pred_inputs$pred_snps <- nrow(geno) } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - - #Function to convert GT to dosage calls (add to BIGr) + # Function to convert GT to dosage calls (add to BIGr) convert_to_dosage <- function(gt) { # Split the genotype string alleles <- strsplit(gt, "[|/]") @@ -563,19 +578,55 @@ mod_GS_server <- function(input, output, session, parent_session){ } }) } - - #Convert VCF file if submitted + + #### VCF sanity check + checks <- vcf_sanity_check(train_geno_path) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "VCF_compressed" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt","max_markers") + + checks_result <- 
vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(input$pred_est_ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + + # Convert VCF file if submitted train_vcf <- vcfR::read.vcfR(train_geno_path, verbose = FALSE) - if (is.null(est_geno_path) || is.na(est_geno_path)){ + if (is.null(est_geno_path) || is.na(est_geno_path)) { est_vcf <- NULL } else { + ## Check VCF + checks <- vcf_sanity_check(est_geno_path) + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(input$pred_est_ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + est_vcf <- vcfR::read.vcfR(est_geno_path, verbose = FALSE) } - - #Get number of SNPs - #pred_inputs$pred_snps <- nrow(vcf) - - #Extract GT + + # Get number of SNPs + # pred_inputs$pred_snps <- nrow(vcf) + + # Extract GT if (is.null(est_vcf)) { est_geno <- NULL } else { @@ -588,9 +639,7 @@ mod_GS_server <- function(input, output, session, parent_session){ class(train_geno) <- "numeric" rm(train_vcf) rm(est_vcf) - } else { - - # If condition is met, show notification toast shinyalert( title = "File Error", @@ -607,17 +656,17 @@ mod_GS_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE, ) - - #Stop the analysis + + # Stop the analysis return() } - - #Check that the ploidy entered is correct + + # Check that the ploidy entered is correct if (ploidy != max(train_geno, na.rm = TRUE)) { # If condition is met, show notification toast shinyalert( title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(train_geno, na.rm = TRUE),") does not equal the ploidy entered"), + text = paste0("The maximum value in the genotype file (", max(train_geno, na.rm = TRUE), ") does not equal the ploidy entered"), size = "xs", 
closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -627,45 +676,48 @@ mod_GS_server <- function(input, output, session, parent_session){ confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, + # closeOnConfirm = TRUE, + # closeOnCancel = TRUE, imageUrl = "", animation = TRUE ) - - + + # Stop the observeEvent gracefully - #return() + # return() } - - + + # Function to convert genotype matrix according to ploidy - convert_genotype <- function(genotype_matrix, ploidy) { - normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 - return(normalized_matrix) - } + + #convert_genotype <- function(genotype_matrix, ploidy) { + # normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 + # return(normalized_matrix) + #} #tranforming genotypes - train_geno_adj_init <- convert_genotype(train_geno, as.numeric(ploidy)) + #train_geno_adj_init <- convert_genotype(train_geno, as.numeric(ploidy)) + train_geno_adj_init <- train_geno + if (is.null(est_geno)) { est_geno_adj_init <- NULL } else { - est_geno_adj_init <- convert_genotype(est_geno, as.numeric(ploidy)) + #est_geno_adj_init <- convert_genotype(est_geno, as.numeric(ploidy)) + est_geno_adj_init <- est_geno } - - #Make sure the trait file and genotype file are in the same order + + # Make sure the trait file and genotype file are in the same order # Column names for geno (assuming these are the individual IDs) colnames_geno <- colnames(train_geno) # Assuming the first column in Pheno contains the matching IDs ids_pheno <- pheno2[, 1] # Find common identifiers common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id + # Get number of id pred_inputs2$pred_geno_pheno <- length(common_ids) - - #Throw an error if there are less matching samples in the phenotype file than the genotype file + + # Throw an error if there are less matching samples in the phenotype file than the genotype file if (length(common_ids) == 0) { - # If condition is met, 
show notification toast shinyalert( title = "Data Mismatch", @@ -682,16 +734,15 @@ mod_GS_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE, ) - - + + # Stop the observeEvent gracefully return() - } else if (length(common_ids) < length(colnames_geno)) { # If condition is met, show notification toast shinyalert( title = "Data Check", - text = paste0((length(common_ids))," samples in VCF File have trait information"), + text = paste0((length(common_ids)), " samples in VCF File have trait information"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -701,21 +752,21 @@ mod_GS_server <- function(input, output, session, parent_session){ confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, + # closeOnConfirm = TRUE, + # closeOnCancel = TRUE, imageUrl = "", animation = TRUE ) - - + + # Stop the observeEvent gracefully - #return() + # return() } - - - - - #Final check before performing analyses + + + + + # Final check before performing analyses shinyalert( title = "Ready?", text = "Inputs have been checked", @@ -728,8 +779,8 @@ mod_GS_server <- function(input, output, session, parent_session){ confirmButtonText = "Proceed", confirmButtonCol = "#004192", showCancelButton = TRUE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, + # closeOnConfirm = TRUE, + # closeOnCancel = TRUE, imageUrl = "", animation = TRUE, callbackR = function(value) { @@ -742,52 +793,63 @@ mod_GS_server <- function(input, output, session, parent_session){ } } ) - - #Status + + # Status updateProgressBar(session = session, id = "pb_gp", value = 40, title = "Generating Matrices") - - #Create relationship matrix depending on the input VCF files + + # Create relationship matrix depending on the input VCF files if (is.null(advanced_options_pred$pred_est_file)) { - #Subset phenotype file by common IDs + # Subset phenotype file by common IDs pheno2 <- pheno2[common_ids, 
] - + #Matrix - Kin_mat <- A.mat(t(train_geno_adj_init), max.missing=NULL,impute.method="mean",return.imputed=FALSE) + #Kin_mat <- A.mat(t(train_geno_adj_init), max.missing=NULL,impute.method="mean",return.imputed=FALSE) + Kin_mat <- Gmatrix(t(train_geno_adj_init), + method = "VanRaden", + ploidy = ploidy, + ploidy.correction=TRUE, + ratio = FALSE, + missingValue = NA) } else{ #Subset phenotype file and training file by common IDs pheno2 <- pheno2[common_ids, ] - - #Match training and testing genotype file SNPs + + # Match training and testing genotype file SNPs common_markers <- intersect(rownames(train_geno_adj_init), rownames(est_geno_adj_init)) - #first remove samples from training genotype matrix that are not in the phenotype file + # first remove samples from training genotype matrix that are not in the phenotype file train_geno_adj <- train_geno_adj_init[common_markers, common_ids] - #Merge the training and prediction genotype matrices + # Merge the training and prediction genotype matrices est_geno_adj_init <- est_geno_adj_init[common_markers, ] train_pred_geno <- cbind(train_geno_adj, est_geno_adj_init) #Matrix - Kin_mat <- A.mat(t(train_pred_geno), max.missing=NULL,impute.method="mean",return.imputed=FALSE) + #Kin_mat <- A.mat(t(train_pred_geno), max.missing=NULL,impute.method="mean",return.imputed=FALSE) + Kin_mat <- Gmatrix(t(train_pred_geno), + method = "VanRaden", + ploidy = ploidy, + ploidy.correction=TRUE, + ratio = FALSE, + missingValue = NA) } - - #Save to reactive values - #pred_inputs2$shared_snps <- length(common_markers) + + # Save to reactive values + # pred_inputs2$shared_snps <- length(common_markers) pred_inputs2$matrix <- Kin_mat pred_inputs2$pheno_input <- pheno2 }) - - #3) Analysis only proceeds once continue_prediction is converted to TRUE + + # 3) Analysis only proceeds once continue_prediction is converted to TRUE observe({ - - req(continue_prediction2(),pred_inputs2$pheno_input, pred_inputs2$matrix) - 
req(continue_prediction2(), pred_inputs2$pheno_input, pred_inputs2$matrix) + # Stop analysis if cancel was selected if (isFALSE(continue_prediction2())) { return() } - - #Variables + + # Variables ploidy <- as.numeric(input$pred_est_ploidy) gmat <- pred_inputs2$matrix pheno2 <- pred_inputs2$pheno_input @@ -806,89 +868,101 @@ mod_GS_server <- function(input, output, session, parent_session){ } else { fixed_cov <- setdiff(input$pred_fixed_info2, input$pred_fixed_cat2) } - #fixed_cov <- setdiff(input$pred_fixed_info2, input$pred_fixed_cat2) + # fixed_cov <- setdiff(input$pred_fixed_info2, input$pred_fixed_cat2) cores <- 1 total_population <- ncol(gmat) - #train_size <- floor(percentage / 100 * total_population) - - #Status + # train_size <- floor(percentage / 100 * total_population) + + # Status updateProgressBar(session = session, id = "pb_gp", value = 90, title = "Generating Results") - - #initialize dataframe + + # initialize dataframe results_GEBVs <- matrix(nrow = ncol(gmat), ncol = length(traits) + 1) results_TRAITs <- matrix(nrow = ncol(gmat), ncol = length(traits) + 1) - colnames(results_TRAITs) <- c("Sample",paste0(traits)) # Set the column names to be the traits - colnames(results_GEBVs) <- c("Sample",paste0(traits)) # Set the column names to be the traits - - #GBLUP for each trait + colnames(results_TRAITs) <- c("Sample", paste0(traits)) # Set the column names to be the traits + colnames(results_GEBVs) <- c("Sample", paste0(traits)) # Set the column names to be the traits + + # GBLUP for each trait for (trait_idx in 1:length(traits)) { - traitpred <- kin.blup(data = pheno2, - geno = colnames(pheno2)[1], - pheno = traits[trait_idx], - fixed = fixed_cat, - covariate = fixed_cov, - K=gmat, - n.core = cores) - - results_GEBVs[, (trait_idx+1)] <- traitpred$g - results_TRAITs[, (trait_idx+1)] <- traitpred$pred + traitpred <- kin.blup( + data = pheno2, + geno = colnames(pheno2)[1], + pheno = traits[trait_idx], + fixed = fixed_cat, + covariate = fixed_cov, + K = 
gmat, + n.core = cores + ) + + results_GEBVs[, (trait_idx + 1)] <- traitpred$g + results_TRAITs[, (trait_idx + 1)] <- traitpred$pred } - #Organize dataframes - results_GEBVs[,1] <- row.names(data.frame(traitpred$g)) - results_TRAITs[,1] <- row.names(data.frame(traitpred$pred)) - - #Label the samples that already had phenotype information + # Organize dataframes + results_GEBVs[, 1] <- row.names(data.frame(traitpred$g)) + results_TRAITs[, 1] <- row.names(data.frame(traitpred$pred)) + + # Label the samples that already had phenotype information results_GEBVs <- data.frame(results_GEBVs) results_TRAITs <- data.frame(results_TRAITs) exists_in_df <- results_GEBVs[[1]] %in% pheno2[[1]] results_GEBVs <- cbind(results_GEBVs[1], "w/Pheno" = exists_in_df, results_GEBVs[-1]) results_TRAITs <- cbind(results_TRAITs[1], "w/Pheno" = exists_in_df, results_TRAITs[-1]) - - #Status + + # Status updateProgressBar(session = session, id = "pb_gp", value = 100, title = "Finished!") - - ##Make output tables depending on 1 or 2 VCF/pedigree files used. - #GEBVs + + ## Make output tables depending on 1 or 2 VCF/pedigree files used. 
+ # GEBVs if (!is.null(advanced_options_pred$pred_est_file)) { # Subset rows where 'w/Pheno' is FALSE and drop the 'w/Pheno' column pred_outputs2$all_GEBVs <- results_GEBVs[results_GEBVs$`w/Pheno` == FALSE, !names(results_GEBVs) %in% "w/Pheno"] - } else{ + } else { pred_outputs2$all_GEBVs <- results_GEBVs } - - #BLUPs of genetic values + + # BLUPs of genetic values if (!is.null(advanced_options_pred$pred_est_file)) { # Subset rows where 'w/Pheno' is FALSE and drop the 'w/Pheno' column pred_outputs2$trait_output <- results_TRAITs[results_TRAITs$`w/Pheno` == FALSE, !names(results_TRAITs) %in% "w/Pheno"] - } else{ + } else { pred_outputs2$trait_output <- results_TRAITs } - - #End the event + + # End the event continue_prediction2(NULL) }) - - #Output the prediction tables + + # Output the prediction tables all_GEBVs <- reactive({ validate( need(!is.null(pred_outputs2$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") ) pred_outputs2$all_GEBVs }) - - #GEBVs from all iterations/folds - output$pred_gebvs_table2 <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - + + # GEBVs from all iterations/folds + output$pred_gebvs_table2 <- renderDT( + { + all_GEBVs() + }, + options = list(scrollX = TRUE, autoWidth = FALSE, pageLength = 5) + ) + trait_output <- reactive({ validate( need(!is.null(pred_outputs2$trait_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") ) pred_outputs2$trait_output }) - - #GEBVs from all iterations/folds - output$pred_trait_table <- renderDT({trait_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - + + # GEBVs from all iterations/folds + output$pred_trait_table <- renderDT( + { + trait_output() + }, + options = list(scrollX = TRUE, autoWidth = FALSE, pageLength = 5) + ) + output$download_vcft <- downloadHandler( filename = function() { 
paste0("BIGapp_Training_VCF_Example_file.vcf.gz") @@ -896,8 +970,9 @@ mod_GS_server <- function(input, output, session, parent_session){ content = function(file) { ex <- system.file("test-dose.vcf.gz", package = "BIGapp") file.copy(ex, file) - }) - + } + ) + output$download_vcfp <- downloadHandler( filename = function() { paste0("BIGapp_Predict_VCF_Example_file.vcf") @@ -905,8 +980,9 @@ mod_GS_server <- function(input, output, session, parent_session){ content = function(file) { ex <- system.file("test-dose-use-for-prediction.vcf", package = "BIGapp") file.copy(ex, file) - }) - + } + ) + output$download_pheno <- downloadHandler( filename = function() { paste0("BIGapp_passport_Example_file.csv") @@ -914,9 +990,10 @@ mod_GS_server <- function(input, output, session, parent_session){ content = function(file) { ex <- system.file("iris_passport_file.csv", package = "BIGapp") file.copy(ex, file) - }) - - #Download files for GP + } + ) + + # Download files for GP output$download_pred_results_file <- downloadHandler( filename = function() { paste0("Prediction-results-", Sys.Date(), ".zip") @@ -925,34 +1002,34 @@ mod_GS_server <- function(input, output, session, parent_session){ # Temporary files list temp_dir <- tempdir() temp_files <- c() - + # Create a temporary file for data frames ebv_file <- file.path(temp_dir, paste0("GS-EBV-predictions-", Sys.Date(), ".csv")) write.csv(pred_outputs2$all_GEBVs, ebv_file, row.names = FALSE) temp_files <- c(temp_files, ebv_file) - + trait_file <- file.path(temp_dir, paste0("GS-Phenotype-predictions-", Sys.Date(), ".csv")) write.csv(pred_outputs2$trait_output, trait_file, row.names = FALSE) temp_files <- c(temp_files, trait_file) - + # Zip files only if there's something to zip if (length(temp_files) > 0) { zip(file, files = temp_files, extras = "-j") # Using -j to junk paths } - + # Optionally clean up file.remove(temp_files) } ) - - ##Summary Info + + ## Summary Info pred_summary_info <- function() { # Handle possible NULL values for 
inputs dosage_file_name <- if (!is.null(input$pred_known_file$name)) input$pred_known_file$name else "No file selected" est_file_name <- if (!is.null(input$pred_est_file$name)) input$pred_est_file$name else "No file selected" passport_file_name <- if (!is.null(input$pred_trait_file$name)) input$pred_trait_file$name else "No file selected" selected_ploidy <- if (!is.null(input$pred_est_ploidy)) as.character(input$pred_est_ploidy) else "Not selected" - + # Print the summary information cat( "BIGapp Selection Summary\n", @@ -971,8 +1048,8 @@ mod_GS_server <- function(input, output, session, parent_session){ paste("Selected Ploidy:", selected_ploidy), "\n", paste("Selected Trait(s):", input$pred_trait_info2), "\n", paste("Selected Fixed Effects:", input$pred_fixed_info2), "\n", - #paste("Selected Model:", input$pred_fixed_info2), "\n", - #paste("Selected Matrix:", input$pred_fixed_info2), "\n", + # paste("Selected Model:", input$pred_fixed_info2), "\n", + # paste("Selected Matrix:", input$pred_fixed_info2), "\n", "\n", "### R Packages Used ###\n", "\n", @@ -986,7 +1063,7 @@ mod_GS_server <- function(input, output, session, parent_session){ sep = "" ) } - + # Popup for analysis summary observeEvent(input$pred_summary, { showModal(modalDialog( @@ -1002,8 +1079,8 @@ mod_GS_server <- function(input, output, session, parent_session){ ) )) }) - - + + # Download Summary Info output$download_pred_info <- downloadHandler( filename = function() { diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index f433ab2..8f5a788 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -429,6 +429,32 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) + #### VCF sanity check + checks <- vcf_sanity_check(input$pred_file$datapath) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "VCF_compressed" + ) + + error_if_true <- c( 
+ "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(input$pred_ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + + if(is.null(advanced_options$pred_matrix)) advanced_options$pred_matrix <- "none_selected" if (((is.null(input$pred_file$datapath) & advanced_options$pred_matrix != "Amatrix") | (is.null(advanced_options$ped_file$datapath) & advanced_options$pred_matrix == "Amatrix")) | @@ -521,35 +547,11 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ #Getting genotype matrix #Geno.file conversion if needed if(!is.null(input$pred_file$datapath)){ - geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") + geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT", ploidy = as.numeric(input$pred_ploidy)) + if(is.null(geno_snps)) return() geno <- geno_snps[[1]] values_boxes$pred_snps <- geno_snps[[2]] - #Check that the ploidy entered is correct - if (input$pred_ploidy != max(geno, na.rm = TRUE)) { - # If condition is met, show notification toast - shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - # Stop the observeEvent gracefully - #return() - } - #Make sure the trait file and genotype file are in the same order # Column names for geno (assuming these are the 
individual IDs) colnames_geno <- colnames(geno) diff --git a/R/mod_PCA.R b/R/mod_PCA.R index b315e76..cdacdbe 100644 --- a/R/mod_PCA.R +++ b/R/mod_PCA.R @@ -93,6 +93,7 @@ mod_PCA_ui <- function(id){ sliderInput(inputId = ns('pca_image_width'), label = 'Width', value = 10, min = 1, max = 20, step=0.5), sliderInput(inputId = ns('pca_image_height'), label = 'Height', value = 6, min = 1, max = 20, step = 0.5), downloadButton(ns("download_pca"), "Save Image"), + downloadButton(ns("download_pcs"), "Save PCs table"), circle = FALSE, status = "danger", icon = icon("floppy-disk"), width = "300px", label = "Save", @@ -363,6 +364,31 @@ mod_PCA_server <- function(input, output, session, parent_session){ genomat <- read.csv(geno, header = TRUE, row.names = 1, check.names = FALSE) } else{ + #### VCF sanity check + checks <- vcf_sanity_check(geno) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "VCF_compressed" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt", "max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + #Import genotype information if in VCF format vcf <- read.vcfR(geno, verbose = FALSE) @@ -447,25 +473,6 @@ mod_PCA_server <- function(input, output, session, parent_session){ # Print the modified dataframe row.names(info_df) <- info_df[,1] - # Check ploidy - if(input$pca_ploidy != max(genomat, na.rm = T)){ - shinyalert( - title = "Wrong ploidy", - text = "The input ploidy does not match the maximum dosage found in the genotype file", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - 
confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - req(input$pca_ploidy == max(genomat, na.rm = T)) - } - #Plotting #First build a relationship matrix using the genotype values G.mat.updog <- Gmatrix(t(genomat), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") @@ -909,6 +916,15 @@ mod_PCA_server <- function(input, output, session, parent_session){ ex <- system.file("iris_passport_file.csv", package = "BIGapp") file.copy(ex, file) }) + + output$download_pcs <- downloadHandler( + filename = function() { + paste0("PCs.csv") + }, + content = function(file) { + df <- pca_data$pc_df_pop + write.csv(df, file, row.names = FALSE, quote = FALSE) + }) } diff --git a/R/mod_dapc.R b/R/mod_dapc.R index 174ecc1..921bc39 100644 --- a/R/mod_dapc.R +++ b/R/mod_dapc.R @@ -203,6 +203,31 @@ mod_dapc_server <- function(input, output, session, parent_session){ ##Add in VCF with the vcfR package (input VCF, then convert to genlight using vcf2genlight function) #Import genotype information if in VCF format + #### VCF sanity check + checks <- vcf_sanity_check(geno) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + vcf <- read.vcfR(geno) #Get items in FORMAT column @@ -270,6 +295,31 @@ mod_dapc_server <- function(input, output, session, parent_session){ selected_K <- as.numeric(input$dapc_k) #Import genotype information if in VCF format + #### VCF sanity check + checks <- vcf_sanity_check(geno) + + error_if_false <- c( + 
"VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "VCF_compressed" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(ploidy)) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + vcf <- read.vcfR(geno) #Get items in FORMAT column @@ -349,14 +399,16 @@ mod_dapc_server <- function(input, output, session, parent_session){ col = my_palette, pch = 20, # shapes cstar = 1, # 0 or 1, arrows from center of cluster - cell = 2, # size of elipse + cellipse = 2, # size of elipse scree.da = T, # plot da scree.pca = T, # plot pca posi.da = "topright", posi.pca="bottomright", mstree = F, # lines connecting clusters - lwd = 1, lty = 2, - leg = F, clab = 1) # legend and label of legend clusters. clab 0 or 1 + lwd = 1, + lty = 2, + legeng = F, + clabel = 1) # legend and label of legend clusters. clab 0 or 1 }) output$DAPC_plot <- renderPlot({ @@ -424,14 +476,14 @@ mod_dapc_server <- function(input, output, session, parent_session){ col = my_palette, pch = 20, # shapes cstar = 1, # 0 or 1, arrows from center of cluster - cell = 2, # size of elipse + cellipse = 2, # size of elipse scree.da = T, # plot da scree.pca = T, # plot pca posi.da = "topright", posi.pca="bottomright", mstree = F, # lines connecting clusters lwd = 1, lty = 2, - leg = F, clab = 1) # legend and label of legend clusters. clab 0 or 1 + legend = F, clabel = 1) # legend and label of legend clusters. 
clab 0 or 1 } else if (input$dapc_figure == "BIC Plot") { req(dapc_items$BIC, dapc_items$bestK) diff --git a/R/mod_diversity.R b/R/mod_diversity.R index ddf97a8..a10c089 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -224,6 +224,31 @@ mod_diversity_server <- function(input, output, session, parent_session){ updateProgressBar(session = session, id = "pb_diversity", value = 20, title = "Importing VCF") #Import genotype information if in VCF format + #### VCF sanity check + checks <- vcf_sanity_check(geno) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "chrom_info", "pos_info", "VCF_compressed" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = ploidy) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + vcf <- read.vcfR(geno, verbose = FALSE) #Save position information diff --git a/R/mod_dosage2vcf.R b/R/mod_dosage2vcf.R index e7fa70c..b168d4f 100644 --- a/R/mod_dosage2vcf.R +++ b/R/mod_dosage2vcf.R @@ -326,8 +326,8 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ updateProgressBar(session = session, id = "dosage2vcf_pb", value = 30, title = "Writing VCF") madc2vcf_all(madc = read_madc, - botloci = botloci, - hap_seq = input$hapDB_file$datapath, + botloci_file = botloci, + hap_seq_file = input$hapDB_file$datapath, n.cores= input$cores, rm_multiallelic_SNP = TRUE, out_vcf = output_name, diff --git a/R/mod_gwas.R b/R/mod_gwas.R index b569273..d308a9c 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -126,7 +126,8 @@ mod_gwas_ui <- function(id){ sliderInput(inputId = ns('gwas_image_height'), label = 'Height', value = 5, min = 1, max = 20, step = 0.5), fluidRow( 
downloadButton(ns("download_gwas_figure"), "Save Image"), - downloadButton(ns("download_gwas_file"), "Save Files")), + downloadButton(ns("download_gwas_file"), "Save Files"), + downloadButton(ns("download_viewpoly"), "VIEWpoly input")), circle = FALSE, status = "danger", icon = icon("floppy-disk"), width = "300px", label = "Save", @@ -455,6 +456,31 @@ mod_gwas_server <- function(input, output, session, parent_session){ temp_geno_file <- tempfile(fileext = ".csv") #Convert VCF file if submitted + #### VCF sanity check + checks <- vcf_sanity_check(input$gwas_file$datapath, max_markers = 10000) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples", "chrom_info", "pos_info", "VCF_compressed" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("ref_alt","max_markers") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = ploidy) + + if(checks_result) return() # Stop the analysis if checks fail + ######### + vcf <- read.vcfR(input$gwas_file$datapath, verbose = FALSE) #Extract GT @@ -872,6 +898,18 @@ mod_gwas_server <- function(input, output, session, parent_session){ ) }) + + + output$download_viewpoly <- downloadHandler( + filename = function() { + paste0("BIGapp_Viewpoly_GWASpoly.RData") + }, + content = function(file) { + temp <- gwas_data$data2 + save(temp, file = file) + } + ) + #Download files for GWAS output$download_gwas_file <- downloadHandler( filename = function() { diff --git a/R/run_app.R b/R/run_app.R index 7c41142..47b1cb9 100644 --- a/R/run_app.R +++ b/R/run_app.R @@ -17,7 +17,7 @@ run_app <- function( ) { #Uncomment the sink command to allow console output sink(file = tempfile()) - options(warn = -1) + #options(warn = -1) with_golem_options( app = shinyApp( diff --git a/R/utils.R b/R/utils.R index 
f1f7385..0696bcf 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,7 +6,7 @@ get_counts <- function(madc_file, output_name) { # Read the madc file madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE, header = FALSE) header <- grep("AlleleID", madc_df[,1]) - if(header > 1) madc_df <- madc_df[-c(1:(grep("AlleleID", madc_df[,1]))-1),] + if(header > 1) madc_df <- madc_df[-c(1:(grep("AlleleID", madc_df[,1]))-1), ] colnames(madc_df) <- madc_df[1,] madc_df <- madc_df[-1,] @@ -35,7 +35,7 @@ get_matrices <- function(result_df) { alt_df <- subset(update_df, grepl("Alt$", AlleleID)) #Ensure that each has the same SNPs and that they are in the same order - same <- identical(alt_df$CloneID,ref_df$CloneID) + same <- identical(alt_df$CloneID, ref_df$CloneID) ###Convert the ref and alt counts into matrices with the CloneID as the index #Set SNP names as index @@ -52,10 +52,10 @@ get_matrices <- function(result_df) { alt_df <- alt_df[common_ids, ] } - #Remove unwanted columns and convert to matrix + # Remove unwanted columns and convert to matrix rm.col <- c("AlleleID", "CloneID", "AlleleSequence", "ClusterConsensusSequence", - "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", - "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp") + "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") ref_matrix <- as.matrix(ref_df[, -which(colnames(ref_df) %in% rm.col)]) alt_matrix <- as.matrix(alt_df[, -which(colnames(alt_df) %in% rm.col)]) @@ -64,7 +64,7 @@ get_matrices <- function(result_df) { class(ref_matrix) <- "numeric" class(alt_matrix) <- "numeric" - #Make the size matrix by combining the two matrices + # Make the size matrix by combining the two matrices size_matrix <- (ref_matrix + alt_matrix) #Count the number of cells with 0 count to estimate missing data @@ -115,27 +115,29 @@ 
convert_to_dosage <- function(gt) { findK <- function(genotypeMatrix, maxK, ploidy) { # Convert the genotype matrix to a genlight object genlight_new <- new("genlight", t(genotypeMatrix), - ind.names = row.names(t(genotypeMatrix)), - loc.names = colnames(t(genotypeMatrix)), - ploidy = ploidy, - NA.char = NA) + ind.names = row.names(t(genotypeMatrix)), + loc.names = colnames(t(genotypeMatrix)), + ploidy = ploidy, + NA.char = NA + ) #Assign the populations as the sample names since there is no assumed populations pop(genlight_new) <- genlight_new@ind.names - #Estimate number of clusters - #Retain all pca for the find.clusters step. Retain as few as possible while maximizing variance captured for DAPC step. - #Choose is the option to allow adegenet to select the best cluster number based on the BIC minimum - #The default criterion is "diffNgroup, which is not necessarily the minimum BIC, but based on the sharp decrease of the BIC value. - #Either way, this is a suggestion, and the number of clusters to use should be made with biology considerations. - graphics.off() #Prevent plot from automatically displaying - grp <- find.clusters(genlight_new, max.n.clust = maxK, + # Estimate number of clusters + # Retain all pca for the find.clusters step. Retain as few as possible while maximizing variance captured for DAPC step. + # Choose is the option to allow adegenet to select the best cluster number based on the BIC minimum + # The default criterion is "diffNgroup, which is not necessarily the minimum BIC, but based on the sharp decrease of the BIC value. + # Either way, this is a suggestion, and the number of clusters to use should be made with biology considerations. 
+ graphics.off() # Prevent plot from automatically displaying + grp <- find.clusters(genlight_new, + max.n.clust = maxK, n.pca = nInd(genlight_new), stat = "BIC", criterion = "diffNgroup", parallel = FALSE, choose = FALSE) - + # Identify the best K based on lowest BIC bestK <- length(grp$size) @@ -143,40 +145,43 @@ findK <- function(genotypeMatrix, maxK, ploidy) { bicDF <- data.frame(K = 1:maxK, BIC = as.data.frame(grp$Kstat)$`grp$Kstat`) return(list(bestK = as.numeric(bestK), grp = grp, BIC = bicDF)) - + } #' @importFrom methods new performDAPC <- function(genotypeMatrix, selected_K, ploidy) { - + #Convert matrix to genlight genlight_new <- new("genlight", t(genotypeMatrix), ind.names = row.names(t(genotypeMatrix)), loc.names = colnames(t(genotypeMatrix)), ploidy = ploidy, NA.char = NA) - - #Get groups based on specified cluster number (K) - graphics.off() #Prevent plot from automatically displaying - grp <- find.clusters(genlight_new, n.clust = selected_K, - n.pca = nInd(genlight_new), - stat = "BIC", - criterion = "diffNgroup", - parallel = FALSE, - choose = FALSE) + + # Get groups based on specified cluster number (K) + graphics.off() # Prevent plot from automatically displaying + grp <- find.clusters(genlight_new, + n.clust = selected_K, + n.pca = nInd(genlight_new), + stat = "BIC", + criterion = "diffNgroup", + parallel = FALSE, + choose = FALSE + ) # Find the optimal number of principal components - #NOTE: The default n.da is K-1, but I have read previously to use #Samples - 1? + # NOTE: The default n.da is K-1, but I have read previously to use #Samples - 1? 
dapc1 <- dapc(genlight_new, grp$grp, - n.pca = nInd(genlight_new), - n.da = nInd(genlight_new)-1, - parallel = FALSE) + n.pca = nInd(genlight_new), + n.da = nInd(genlight_new) - 1, + parallel = FALSE + ) a.score <- optim.a.score(dapc1, plot = FALSE) n.pca <- a.score$best # Perform DAPC with the best K - finalDapc <- dapc(genlight_new, grp$grp, n.pca = n.pca, n.da = selected_K-1, parallel= FALSE) + finalDapc <- dapc(genlight_new, grp$grp, n.pca = n.pca, n.da = selected_K - 1, parallel = FALSE) # Extract the membership probabilities Q <- as.data.frame(finalDapc$posterior) @@ -184,14 +189,14 @@ performDAPC <- function(genotypeMatrix, selected_K, ploidy) { # Add cluster assignments to Q dataframe Q$Cluster_Assignment <- finalDapc$assign - #a data.frame giving the contributions of original variables (alleles in the case of genetic data) to the principal components of DAPC. - #dapc$var.contr + # a data.frame giving the contributions of original variables (alleles in the case of genetic data) to the principal components of DAPC. 
+ # dapc$var.contr # Return list containing BIC dataframe, Q dataframe w/ dapc assignments return(list(Q = Q, dapc = finalDapc)) } -#Heterozygosity function +# Heterozygosity function calculate_heterozygosity <- function(genotype_matrix, ploidy = 2) { # Determine the heterozygous values based on ploidy heterozygous_values <- seq(1, ploidy - 1) @@ -231,8 +236,8 @@ calculateMAF <- function(df, ploidy) { df <- as.data.frame(df) } - #Convert the elements to numeric if they are characters - df[] <- lapply(df, function(x) if(is.character(x)) as.numeric(as.character(x)) else x) + # Convert the elements to numeric if they are characters + df[] <- lapply(df, function(x) if (is.character(x)) as.numeric(as.character(x)) else x) allele_frequencies <- apply(df, 1, function(row) { non_na_count <- sum(!is.na(row)) @@ -249,9 +254,9 @@ calculateMAF <- function(df, ploidy) { df$AF <- allele_frequencies df$MAF <- maf - maf_df <- df[,c("AF", "MAF"), drop = FALSE] + maf_df <- df[, c("AF", "MAF"), drop = FALSE] - #Make the row names (SNP ID) the first column + # Make the row names (SNP ID) the first column maf_df <- maf_df %>% rownames_to_column(var = "SNP_ID") @@ -263,7 +268,7 @@ calculate_percentages <- function(matrix_data, ploidy) { apply(matrix_data, 2, function(col) { counts <- table(col) prop <- prop.table(counts) * 100 - prop[as.character(0:ploidy)] # Adjust the range based on the max value (consider entering the ploidy value explicitly for max_val) + prop[as.character(0:ploidy)] # Adjust the range based on the max value (consider entering the ploidy value explicitly for max_val) }) } @@ -285,10 +290,9 @@ convert_genotype_counts <- function(df, ploidy, is_reference = TRUE) { #' posdefmat <- function(mat) { if (is.positive.definite(round(mat, 18))) { - g = mat - } - else { - g <-nearPD(mat)$mat + g <- mat + } else { + g <- nearPD(mat)$mat warning("The matrix was adjusted for the nearest positive definite matrix") } return(g) @@ -300,8 +304,10 @@ split_info_column <- 
function(info) { info_split <- str_split(info, ";")[[1]] # Create a named list by splitting each element by equals sign - info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), - map(info_split, ~ str_split(.x, "=")[[1]][1])) + info_list <- set_names( + map(info_split, ~ str_split(.x, "=")[[1]][2]), + map(info_split, ~ str_split(.x, "=")[[1]][1]) + ) return(info_list) } @@ -310,25 +316,51 @@ split_info_column <- function(info) { #' #' @param file_path character indicanting path to file #' @param requires which information is required from the VCF. Define the FORMAT or INFO letters. Example: c("GT", "DP", "PL") +#' @param check logical indicating whether to perform checks on the VCF file +#' @param ploidy numeric value indicating the ploidy of the species #' #' @importFrom vcfR read.vcfR #' @importFrom shinyalert shinyalert #' -read_geno_file <- function(file_path, requires = c("GT")){ +read_geno_file <- function(file_path, requires = c("GT"), ploidy, check=TRUE) { if (grepl("\\.csv$", file_path)) { geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) n_snps <- nrow(geno) return(list(geno, n_snps)) - } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - - #Convert VCF file if submitted + # Convert VCF file if submitted + #### VCF sanity check - Used in the GSAcc mod + if(check){ + checks <- vcf_sanity_check(file_path) + + error_if_false <- c( + "VCF_header", "VCF_columns", "unique_FORMAT", "GT", + "samples" + ) + + error_if_true <- c( + "multiallelics", "phased_GT", "mixed_ploidies", + "duplicated_samples", "duplicated_markers" + ) + + warning_if_false <- c("chrom_info", "pos_info", "ref_alt") + + checks_result <- vcf_sanity_messages(checks, + error_if_false, + error_if_true, + warning_if_false = warning_if_false, + warning_if_true = NULL, + input_ploidy = as.numeric(ploidy)) + + if(checks_result) return(NULL) # Stop the analysis if checks fail + ######### + } vcf <- read.vcfR(file_path, verbose = FALSE) 
all_requires <- vector() - for(i in 1:length(requires)) all_requires[i] <- grepl(requires[i], vcf@fix[1,8]) | grepl(requires[i], vcf@gt[1,1]) + for (i in 1:length(requires)) all_requires[i] <- grepl(requires[i], vcf@fix[1, 8]) | grepl(requires[i], vcf@gt[1, 1]) - if(!all(all_requires)) { + if (!all(all_requires)) { shinyalert( title = "Oops", text = paste("The VCF file does not contain required information:", requires[which(!all_requires)]), @@ -349,7 +381,7 @@ read_geno_file <- function(file_path, requires = c("GT")){ n_snps <- nrow(vcf@gt) - #Extract GT + # Extract GT geno <- extract.gt(vcf, element = "GT") geno <- apply(geno, 2, convert_to_dosage) class(geno) <- "numeric" @@ -384,7 +416,7 @@ read_geno_file <- function(file_path, requires = c("GT")){ ##' @param file location to be saved ##' @importFrom Rsamtools bgzip ##' -bgzip_compress <- function(output_name, file){ +bgzip_compress <- function(output_name, file) { # Check if the VCF file was created if (file.exists(output_name)) { # Compress the VCF file using gzip @@ -399,7 +431,6 @@ bgzip_compress <- function(output_name, file){ # Delete the temporary files unlink(bgzip_file) unlink(output_name) - } else { stop("Error: Failed to create the bgzip file.") } @@ -417,26 +448,24 @@ bgzip_compress <- function(output_name, file){ #' subset_vcf <- function(vcfR.object, remove.sample.list = NULL, remove.sample.file = NULL) { # Remove samples from the VCF object - + vcf <- vcfR.object - - if (!is.null(remove.sample.file)){ - #unwanted_samples <- read.csv(remove.sample.file, header = FALSE, stringsAsFactors = FALSE)$V1 + + if (!is.null(remove.sample.file)) { + # unwanted_samples <- read.csv(remove.sample.file, header = FALSE, stringsAsFactors = FALSE)$V1 unwanted_samples <- suppressWarnings(readLines(remove.sample.file)) - - }else{ + } else { unwanted_samples <- remove.sample.list } - - all_samples <- names(data.frame(vcf@gt, check.names=FALSE)) + + all_samples <- names(data.frame(vcf@gt, check.names = FALSE)) 
samples_to_keep <- all_samples[!all_samples %in% unwanted_samples] - vcf <- vcf[,samples_to_keep] - - #Get the number of samples removed to add to the filtering info popup + vcf <- vcf[, samples_to_keep] + + # Get the number of samples removed to add to the filtering info popup removed_number <- (length(all_samples) - length(samples_to_keep)) - + return(list(vcf = vcf, removed_number = removed_number)) - } #' Internal function @@ -454,30 +483,30 @@ subset_vcf <- function(vcfR.object, remove.sample.list = NULL, remove.sample.fil gmatrix2vcf <- function(Gmat.file, ploidy, output.file, dosageCount = "Reference") { # Convert a genotype matrix with numeric dosage values to a VCF file input_path <- Gmat.file - - #Import matrix + + # Import matrix dosage <- readr::read_csv(input_path) dosage <- data.frame(dosage, check.names = FALSE) - rownames(dosage) <- dosage[,1] + rownames(dosage) <- dosage[, 1] names(dosage)[1] <- "ID" - - ###Test that all values are integers within file - + + ### Test that all values are integers within file + # Header vcf_header <- c( - "##fileformat=VCFv4.3", - paste0("##BIGapp_gmatrix2vcf=",packageVersion("BIGapp")), - "##reference=NA", - "##contig=", - '##FORMAT=' + "##fileformat=VCFv4.3", + paste0("##BIGapp_gmatrix2vcf=", packageVersion("BIGapp")), + "##reference=NA", + "##contig=", + '##FORMAT=' ) - + # Assuming markers are in the "Chr_Pos" format df <- dosage %>% separate(ID, into = c("Chr", "Pos"), sep = "_") - - #Make the VCF df + + # Make the VCF df vcf_df <- data.frame( CHROM = df$Chr, POS = df$Pos, @@ -489,10 +518,10 @@ gmatrix2vcf <- function(Gmat.file, ploidy, output.file, dosageCount = "Reference INFO = ".", FORMAT = "GT" ) - + cat("Converting dosages to genotype format\n") - - ###Convert genotypes from dosage to gt + + ### Convert genotypes from dosage to gt # Precompute genotype strings for all possible dosage values to improve efficiency precompute_genotype_strings <- function(ploidy) { genotype_strings <- character(ploidy + 1) @@ 
-505,46 +534,74 @@ gmatrix2vcf <- function(Gmat.file, ploidy, output.file, dosageCount = "Reference } return(genotype_strings) } - + # Apply the precomputed genotype strings to the matrix convert_dosage2gt <- function(dosage_matrix, ploidy) { dosage_matrix <- as.matrix(dosage_matrix) genotype_strings <- precompute_genotype_strings(ploidy) - + # Handle missing values separately genotype_matrix <- matrix(genotype_strings[dosage_matrix + 1], nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix)) - #genotype_matrix[is.na(dosage_matrix)] <- "./." # Handle missing values + # genotype_matrix[is.na(dosage_matrix)] <- "./." # Handle missing values genotype_matrix[is.na(dosage_matrix)] <- paste(rep(".", ploidy), collapse = "/") - + # Retain row and column names rownames(genotype_matrix) <- rownames(dosage_matrix) colnames(genotype_matrix) <- colnames(dosage_matrix) - + return(genotype_matrix) } - + # Convert the dosage matrix to genotypes # Adjust dosage values depending on allele count dosage_check <- ifelse(dosageCount == "Reference", FALSE, TRUE) - dosage <- BIGr::flip_dosage(dosage[,-c(1)], ploidy, is.reference= dosage_check) + dosage <- BIGr::flip_dosage(dosage[, -c(1)], ploidy, is.reference = dosage_check) geno_df <- convert_dosage2gt(dosage, ploidy) - - #Combine info from the matrices to form the VCF information for each sample - #Combine the dataframes together - vcf_df <- cbind(vcf_df,geno_df) - + + # Combine info from the matrices to form the VCF information for each sample + # Combine the dataframes together + vcf_df <- cbind(vcf_df, geno_df) + # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" - + # Write the header to the file writeLines(vcf_header, con = output.file) - + # Append the dataframe to the file in tab-separated format suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - + # Unload all items from memory rm(vcf_df, geno_df, dosage, df) +} + +#' Check 
if a File is Compressed
+#'
+#' This function checks whether a given file is compressed by inspecting its
+#' magic number (first few bytes). It detects common compression formats such as
+#' gzip (`.gz`), bzip2 (`.bz2`), and xz (`.xz`).
+#'
+#' @param file_path A character string giving the path to the file to be checked.
+#'
+#' @return A character string indicating the compression type (`"gzip (.gz)"`,
+#' `"bzip2 (.bz2)"`, or `"xz (.xz)"`) if the file is compressed, or `FALSE` if
+#' the file is not recognized as compressed.
+#'
+is_compressed_file <- function(file_path) {
+  con <- file(file_path, "rb")
+  magic <- readBin(con, "raw", n = 6)
+  close(con)
+  # Known magic numbers for compressed formats (xz needs all 6 bytes; raw OOB reads as 00)
+  if (all(magic[1:2] == as.raw(c(0x1f, 0x8b)))) {
+    return("gzip (.gz)")
+  } else if (all(magic[1:3] == as.raw(c(0x42, 0x5a, 0x68)))) {
+    return("bzip2 (.bz2)")
+  } else if (all(magic[1:6] == as.raw(c(0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00)))) {
+    return("xz (.xz)")
+  } else {
+    return(FALSE) # Not a known compressed format
+  }
}
diff --git a/R/vcf_sanity_check.R b/R/vcf_sanity_check.R
new file mode 100644
index 0000000..5db2e35
--- /dev/null
+++ b/R/vcf_sanity_check.R
@@ -0,0 +1,602 @@
+#' Perform a Sanity Check on a VCF File
+#'
+#' This function performs a series of checks on a VCF file to ensure its validity and integrity. It verifies the presence of required headers, columns, and data fields, and checks for common issues such as missing or malformed data.
+#'
+#' @param vcf_path A character string specifying the path to the VCF file. The file can be plain text or gzipped.
+#' @param n_data_lines An integer specifying the number of data lines to sample for detailed checks. Default is 100.
+#' @param depth_support_fields (Optional) A character vector of fields that are expected to be present in the FORMAT column for allele counts. Default is `c("AD", "RA", "AO", "RO", "NR", "NV", "SB", "F1R2", "F2R1")`.
+#' @param max_markers An integer specifying the maximum number of markers allowed in the VCF file. Default is 10,000. +#' @param verbose A logical value indicating whether to print detailed messages during the checks. Default is FALSE. +#' +#' @return A list containing: +#' - `checks`: A named vector indicating the results of each check (TRUE or FALSE). +#' - `messages`: A data frame containing messages for each check, indicating success or failure. +#' - `duplicates`: A list containing any duplicated sample or marker IDs found in the VCF file. +#' - `ploidy_max`: The maximum ploidy detected from the genotype field, if applicable. +#' +#' @details The function performs the following checks: +#' - **VCF_header**: Verifies the presence of the `##fileformat` header. +#' - **VCF_compressed**: Checks if the VCF file is .gz compressed and if the extension is correct. +#' - **VCF_columns**: Ensures required columns (`#CHROM`, `POS`, `ID`, `REF`, `ALT`, `QUAL`, `FILTER`, `INFO`) are present. +#' - **max_markers**: Checks if the total number of markers exceeds the specified limit. +#' - **unique_FORMAT**: Ensures that the FORMAT fields are consistent across sampled markers. +#' - **GT**: Verifies the presence of the `GT` (genotype) field in the FORMAT column. +#' - **allele_counts**: Checks for allele-level count fields (e.g., `AD`, `RA`, `AO`, `RO`). +#' - **samples**: Ensures sample/genotype columns are present. +#' - **chrom_info** and **pos_info**: Verifies the presence of `CHROM` and `POS` columns. +#' - **ref_alt**: Ensures `REF` and `ALT` fields contain valid nucleotide codes. +#' - **multiallelics**: Identifies multiallelic sites (ALT field with commas). +#' - **phased_GT**: Checks for phased genotypes (presence of `|` in the `GT` field). +#' - **duplicated_samples**: Checks for duplicated sample IDs. +#' - **duplicated_markers**: Checks for duplicated marker IDs. 
+#'
+#' @importFrom stats setNames
+#'
+#' @export
+vcf_sanity_check <- function(
+    vcf_path,
+    n_data_lines = 100,
+    max_markers = 10000,
+    depth_support_fields = c("AD", "RA", "AO", "RO", "NR", "NV", "SB", "F1R2", "F2R1"),
+    verbose = FALSE) {
+
+  # --- Prepare result vector ---
+  checks_names <- c(
+    "VCF_header",
+    "VCF_compressed",
+    "VCF_columns",
+    "max_markers",
+    "unique_FORMAT",
+    "GT",
+    "allele_counts",
+    "samples",
+    "chrom_info",
+    "pos_info",
+    "ref_alt",
+    "multiallelics",
+    "phased_GT",
+    "duplicated_samples",
+    "duplicated_markers",
+    "mixed_ploidies"
+  )
+  checks <- setNames(rep(NA, length(checks_names)), checks_names)
+
+  if (!file.exists(vcf_path)) {
+    warning("File does not exist.")
+    return(structure(
+      list(
+        checks = NULL, messages = "File does not exist",
+        duplicates = NULL, ploidy_max = NULL
+      ),
+      class = "vcf_sanity_check"
+    ))
+  }
+
+  is_gz_ext <- grepl("\\.gz$", vcf_path)
+  is_gz <- is_compressed_file(vcf_path)
+
+  con <- if (is_gz == "gzip (.gz)") {
+    checks["VCF_compressed"] <- TRUE
+    if(!is_gz_ext) {
+      checks["VCF_compressed"] <- FALSE
+      if(verbose) warning("File is compressed with gzip (.gz), but does not have .gz extension.")
+    }
+    gzfile(vcf_path, open = "rt")
+  } else if(is_gz == "bzip2 (.bz2)") {
+    if (verbose) warning("File is compressed with bzip2 (.bz2), which is not supported.")
+    return(structure(list(checks = NULL, messages = "File is bzip2 (.bz2) compressed, which is not supported.", duplicates = NULL, ploidy_max = NULL), class = "vcf_sanity_check")) # fail fast: con would otherwise be a logical and readLines() would error cryptically
+  } else if (is_gz == "xz (.xz)") {
+    if (verbose) warning("File is compressed with xz (.xz), which is not supported.")
+    return(structure(list(checks = NULL, messages = "File is xz (.xz) compressed, which is not supported.", duplicates = NULL, ploidy_max = NULL), class = "vcf_sanity_check")) # fail fast: con would otherwise be a logical and readLines() would error cryptically
+  } else {
+    checks["VCF_compressed"] <- TRUE
+    file(vcf_path, open = "r")
+  }
+
+  lines <- readLines(con, warn = FALSE)
+  close(con)
+
+  # Container for duplicated IDs
+  duplicates <- list(
+    duplicated_samples = character(0),
+    duplicated_markers = character(0)
+  )
+
+  # --- Header checks ---
+  header_lines <- grep("^##", lines, value = TRUE)
+  if (!any(grepl("^##fileformat=VCFv", header_lines))) {
+    checks["VCF_header"] <- FALSE
+    if (verbose) 
warning("Missing ##fileformat header.") + } else { + checks["VCF_header"] <- TRUE + if (verbose) cat("VCF header is present.\n") + } + + # --- Column header line --- + column_header_line <- grep("^#CHROM", lines, value = TRUE) + if (length(column_header_line) > 1) { + return(structure( + list( + checks = NULL, messages = "There are more than 1 line in your VCF starting with the header #CHROM. Please check the file format.", + duplicates = NULL, ploidy_max = NULL + ), + class = "vcf_sanity_check" + )) + } else if (length(column_header_line) == 0) { + return(structure( + list( + checks = NULL, messages = "Malformed VCF file: missing column header line.", + duplicates = NULL, ploidy_max = NULL + ), + class = "vcf_sanity_check" + )) + } else { + column_names <- unlist(strsplit(column_header_line, "\t")) + has_genotypes <- length(column_names) > 8 + required_columns <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") + checks["VCF_columns"] <- all(required_columns %in% column_names[1:8]) + if (checks["VCF_columns"]) { + if (verbose) cat("Required VCF columns are present.\n") + } else { + if (verbose) warning("Missing one or more required VCF columns.") + } + } + + # --- Total marker count --- + data_line_indices <- grep("^[^#]", lines) + total_markers <- length(data_line_indices) + if (verbose) cat(sprintf("Total markers (data rows): %d\n", total_markers)) + + checks["max_markers"] <- total_markers <= max_markers + if (!checks["max_markers"]) { + warning(sprintf("More than %d markers found. 
Consider subsampling.", max_markers))
+  }
+
+  # --- Check for duplicated marker IDs ---
+  if (total_markers > 0) {
+    marker_ids <- sapply(lines[data_line_indices], function(line) {
+      fields <- strsplit(line, "\t")[[1]]
+      if (length(fields) >= 3) fields[3] else NA
+    })
+    marker_ids <- marker_ids[!is.na(marker_ids)]
+    duplicated_markers <- marker_ids[duplicated(marker_ids)]
+    duplicates$duplicated_markers <- duplicated_markers
+    if (length(duplicated_markers) > 0) {
+      checks["duplicated_markers"] <- TRUE
+      if (verbose) warning("Duplicated marker IDs found: ", paste(head(duplicated_markers, 10), collapse = ", "), "...")
+    } else {
+      if (verbose) cat("No duplicated marker IDs.\n")
+      checks["duplicated_markers"] <- FALSE
+    }
+  }
+
+  # --- FORMAT field checks (GT, AD, etc.) ---
+  if (has_genotypes) {
+    checks["samples"] <- TRUE
+    if (n_data_lines >= length(data_line_indices)) sample_indices <- data_line_indices else
+      sample_indices <- sample(data_line_indices, n_data_lines)
+    format_fields <- character()
+
+    chrom_pos <- list() # this list will store the CHROM and POS for n_data_lines sample markers
+    format_keys <- list() # this list store the format field for n_data_lines sample markers
+    for (i in seq_along(sample_indices)) {
+      fields <- strsplit(lines[sample_indices[i]], "\t")[[1]]
+      if (length(fields) >= 9) {
+        format_keys[[i]] <- unlist(strsplit(fields[9], ":"))
+        chrom_pos[[i]] <- fields[1:2]
+      }
+    }
+
+    # Check if all markers sampled format are identical with the first
+    checks["unique_FORMAT"] <- all(sapply(format_keys[-1], function(x) identical(x, format_keys[[1]])))
+
+    # --- CHROM and POS column checks ---
+    # Conditions must be combined with & (AND): each value must be non-".", non-empty,
+    # and non-NA. With | the expression was a tautology and always passed.
+    chrom_pos <- do.call(rbind, chrom_pos)
+    checks["chrom_info"] <- all(chrom_pos[,1] != "." & chrom_pos[,1] != "" & !is.na(chrom_pos[,1]))
+    checks["pos_info"] <- all(chrom_pos[,2] != "." & chrom_pos[,2] != "" & !is.na(chrom_pos[,2]))
+
+    if (checks["chrom_info"] && checks["pos_info"]) {
+      if (verbose) cat("Both CHROM and POS information are present.\n")
+    } else {
+      if (!checks["chrom_info"]) warning(" 'CHROM' information is missing for at least one marker.")
+      if (!checks["pos_info"]) warning(" 'POS' information is missing for at least one marker.")
+    }
+
+    format_fields <- unique(unlist(format_keys))
+
+    # GT check
+    checks["GT"] <- "GT" %in% format_fields
+    if (verbose) {
+      if (checks["GT"]) cat("FORMAT field 'GT' (genotype) is present.\n") else warning("FORMAT field 'GT' is missing.")
+    }
+    # Allele counts check
+    checks["allele_counts"] <- any(depth_support_fields %in% format_fields)
+    if (checks["allele_counts"]) {
+      if (verbose) {
+        cat(sprintf(
+          "Allele count FORMAT field(s) found: %s\n",
+          paste(intersect(depth_support_fields, format_fields), collapse = ", ")
+        ))
+      }
+    } else {
+      warning(paste(" No required allele-level count fields found. e.g.", depth_support_fields))
+    }
+
+    # Optional: phased GT (presence of '|' instead of '/' in genotypes)
+    # this complicated regex avoids the 5 first columns
+    phased_lines <- grep('^(?:[^\\t]*\\t){5}.*\\|', lines[sample_indices], perl = TRUE)
+    checks["phased_GT"] <- length(phased_lines) > 0
+
+    # --- Check for duplicated sample names ---
+    sample_names <- column_names[10:length(column_names)]
+    duplicated_samples <- sample_names[duplicated(sample_names)]
+    duplicates$duplicated_samples <- duplicated_samples
+    if (length(duplicated_samples) > 0) {
+      if (verbose) warning("Duplicated sample names found: ", paste(duplicated_samples, collapse = ", "))
+      checks["duplicated_samples"] <- TRUE
+    } else {
+      if (verbose) cat("No duplicated sample names.\n")
+      checks["duplicated_samples"] <- FALSE
+    }
+  } else {
+    checks["samples"] <- FALSE
+    checks["GT"] <- FALSE
+    checks["allele_counts"] <- FALSE
+    checks["phased_GT"] <- FALSE
+    checks["duplicated_samples"] <- FALSE
+
+    warning("No sample/genotype columns 
found.") + } + + # --- Ploidy inference (based on GT) --- + ploidy_max <- NA # default if GT not found or no valid genotypes + + if (checks["GT"]) { + ploidy_values <- c() + + for (i in seq_along(sample_indices)) { + fields <- strsplit(lines[sample_indices[i]], "\t")[[1]] + + # Skip if not enough fields + if (length(fields) < 10) next + + format_keys <- unlist(strsplit(fields[9], ":")) + gt_index <- which(format_keys == "GT") + + # Loop through samples (from column 10 onward) + for (sample_field in fields[10:length(fields)]) { + sample_fields <- unlist(strsplit(sample_field, ":")) + if (length(sample_fields) >= gt_index) { + gt_raw <- sample_fields[gt_index] + if (grepl("[/|]", gt_raw)) { + ploidy <- length(unlist(strsplit(gt_raw, "[/|]"))) + ploidy_values <- c(ploidy_values, ploidy) + } + } + } + } + + if (length(ploidy_values) > 0) { + ploidy_max <- max(ploidy_values, na.rm = TRUE) + if (verbose) cat("Highest ploidy detected from GT field:", ploidy_max, "\n") + } + if (length(unique(ploidy_values)) > 1) { + checks["mixed_ploidies"] <- TRUE + if (verbose) cat("Mixed ploidies detected\n") + } else { + checks["mixed_ploidies"] <- FALSE + } + } + + # --- REF/ALT basic check on sample rows --- + sample_lines <- lines[head(data_line_indices, n_data_lines)] + ref_alt_valid <- sapply(sample_lines, function(line) { + fields <- strsplit(line, "\t")[[1]] + if (length(fields) >= 5) { + ref <- fields[4] + alt <- fields[5] + grepl("^[ACGTN]+$", ref) && grepl("^[ACGTN.,<>]+$", alt) + } else { + FALSE + } + }) + checks["ref_alt"] <- all(ref_alt_valid) + + # --- Multiallelic site check (ALT with ',' separator) --- + multiallelic_flags <- grepl(",", sapply(sample_lines, function(line) strsplit(line, "\t")[[1]][5])) + checks["multiallelics"] <- any(multiallelic_flags) + + # --- Compile messages --- + + # Messages in case of failure + messages <- data.frame( + "VCF_header" = c( + "VCF header is missing. 
Please check the file format\n", + "VCF header is present\n" + ), + "VCF_compressed" = c( + "VCF is compressed but filename doesn't have the extension or it has non-supported format\n", + "VCF is .gz compressed or uncompressed\n" + ), + "VCF_columns" = c( + "Required VCF columns are missing. Please check the file format\n", + "Required VCF columns are present\n" + ), + "max_markers" = c( + "More than 10,000 markers found. Consider subsampling or running in HPC\n", + "Less than maximum number of markers found\n" + ), + "unique_FORMAT" = c( + "FORMAT fields are not consistent across sampled markers\n", + "FORMAT fields are consistent across sampled markers\n" + ), + "GT" = c( + "Genotype information is not available in the VCF file\n", + "Genotype information is available in the VCF file\n" + ), + "allele_counts" = c( + "Required field for allele counts are not available in the VCF file\n", + "Required field for allele counts are available in the VCF file\n" + ), + "samples" = c( + "Sample information is not available in the VCF file\n", + "Sample information is available in the VCF file\n" + ), + "chrom_info" = c( + "Chromosome information is not available in the VCF file\n", + "Chromosome information is available in the VCF file\n" + ), + "pos_info" = c( + "Position information is not available in the VCF file\n", + "Position information is available in the VCF file\n" + ), + "ref_alt" = c( + "REF/ALT fields contain invalid nucleotide codes\n", + "REF/ALT fields are valid\n" + ), + "multiallelics" = c( + "Multiallelic sites not found in the VCF file\n", + "Multiallelic sites found in the VCF file\n" + ), + "phased_GT" = c( + "Phased genotypes (|) are not present in the VCF file\n", + "Phased genotypes (|) are present in the VCF file\n" + ), + "duplicated_samples" = c( + "No duplicated sample IDs found", + paste("Duplicated sample IDs found: ", paste(duplicates$duplicated_samples, collapse = ", ")) + ), + "duplicated_markers" = c( + "No duplicated marker IDs found", 
+ paste("Duplicated marker IDs found: ", paste(duplicates$duplicated_markers, collapse = ", ")) + ), + "mixed_ploidies" = c( + "No mixed ploidies detected", + "Mixed ploidies detected" + ) + ) + rownames(messages) <- c("false", "true") + + # --- Done --- + if (verbose) cat("Sanity check complete.\n") + return(structure( + list( + checks = checks, messages = messages, + duplicates = duplicates, ploidy_max = ploidy_max + ), + class = "vcf_sanity_check" + )) +} + +#' Generate Sanity Check Messages for VCF Files +#' +#' This function generates messages based on the results of a VCF sanity check. +#' +#' @param checks A `vcf_sanity_check` object containing the results of the sanity check. +#' @param error_if_false (Optional) A character vector of checks that must be TRUE for the VCF file to pass validation. +#' @param error_if_true (Optional) A character vector of checks that must be FALSE for the VCF file to pass validation. +#' @param warning_if_true (Optional) A character vector of checks that should trigger a warning if TRUE. +#' @param warning_if_false (Optional) A character vector of checks that should trigger a warning if FALSE. +#' @param input_ploidy (Optional) An integer specifying the expected ploidy of the VCF file. If provided, the function will compare it with the detected ploidy. +#' +#' @details This function uses the results of a VCF sanity check to display appropriate messages to the user. It checks for required conditions that must be met (TRUE or FALSE) and optionally validates the ploidy of the VCF file. If any issues are detected, the function displays alerts using `shinyalert`. +#' +#' @return A logical value indicating whether any of the specified checks failed (TRUE) or passed (FALSE). If any required checks are not met, an error alert is displayed. If any warnings are triggered, a warning alert is displayed. 
+#' +#' @importFrom shinyalert shinyalert +#' +#' @export +vcf_sanity_messages <- function( + checks, + error_if_false = c( + "VCF_header", + "VCF_columns", + "max_markers", + "GT", + "allele_counts", + "samples", + "chrom_info", + "pos_info", + "ref_alt", + "mixed_ploidies" + ), + error_if_true = c( "phased_GT", + "duplicated_samples", + "duplicated_markers"), + warning_if_true = c("multiallelics"), + warning_if_false = c("unique_FORMAT"), + input_ploidy = NULL) { + + # check must be from vcf_sanity_check + if (!inherits(checks, "vcf_sanity_check")) { + stop("check must be a vcf_sanity_check object.") + } + + if(is.null(checks$checks)){ + shinyalert( + title = "File Error", + text = checks$message, + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + + it_should_brake <- vector() + # Check required TRUE + if (length(error_if_false) > 0 && !all(checks$checks[error_if_false])) { + it_should_brake <- c(it_should_brake, !checks$checks[error_if_false]) + shinyalert( + title = "File Error", + text = paste(checks$message[1, error_if_false][which(!checks$checks[error_if_false])], + collapse = ";" + ), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + + # Check warning TRUE + if (length(warning_if_true) > 0 && all(checks$checks[warning_if_true])) { + shinyalert( + title = "Warning", + text = paste(checks$message[2, warning_if_true][which(checks$checks[warning_if_true])], + collapse = ";" + ), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + 
confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + + # Check required FALSE + if (length(error_if_true) > 0 && any(checks$checks[error_if_true])) { + it_should_brake <- c(it_should_brake, checks$checks[error_if_true]) + + shinyalert( + title = "File Error", + text = paste(checks$message[2, error_if_true][which(checks$checks[error_if_true])], + collapse = ";" + ), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + + # Check warning FALSE + if (length(warning_if_false) > 0 && !all(checks$checks[warning_if_false])) { + shinyalert( + title = "Warning", + text = paste(checks$message[1, warning_if_false][which(!checks$checks[warning_if_false])], + collapse = ";" + ), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + + if (!is.null(input_ploidy)) { + it_should_brake <- c(it_should_brake, checks$ploidy_max != input_ploidy) + + if (checks$ploidy_max != input_ploidy) { + shinyalert( + title = "File Error", + text = "Informed ploidy doesn't match genotypes in the VCF file", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + } + return(any(it_should_brake)) +} + +#' Print Method for vcf_sanity_check Objects +#' +#' Displays a summary table of the VCF sanity check results, showing the check name, result (TRUE/FALSE), and the corresponding message for each selected check. 
+#' +#' @param x A vcf_sanity_check object as returned by `vcf_sanity_check()`. +#' @param checks A character vector of check names to display. Defaults to all checks in the object. +#' @param ... Additional arguments (currently ignored). +#' +#' @return The input object, invisibly. Called for its side effect (printing to console). +#' +#' @method print vcf_sanity_check +#' +#' @export +print.vcf_sanity_check <- function(x, checks = names(x$checks), ...) { + if (!inherits(x, "vcf_sanity_check")) stop("Object must be of class 'vcf_sanity_check'.") + + if(is.null(x$checks)) { + warning(paste("Checks could not be performed. Because:", x$message)) + } + + # Prepare a data.frame with check name, result, and message + res <- data.frame( + Check = checks, + Result = as.logical(x$checks[checks]), + Message = mapply( + function(chk, val) { + # TRUE is row 'true', FALSE is row 'false' + row_idx <- ifelse(isTRUE(val), "true", "false") + as.character(x$messages[row_idx, chk]) + }, + chk = checks, + val = x$checks[checks], + SIMPLIFY = TRUE + ), + stringsAsFactors = FALSE + ) + print(res, row.names = FALSE) +} diff --git a/inst/Dockerfile.deps b/inst/Dockerfile.deps new file mode 100644 index 0000000..0c48941 --- /dev/null +++ b/inst/Dockerfile.deps @@ -0,0 +1,78 @@ +# syntax=docker/dockerfile:1.7 +FROM rocker/r2u:noble + +SHELL ["/bin/bash","-eo","pipefail","-c"] +ENV DEBIAN_FRONTEND=noninteractive TZ=UTC +# keep compiles gentle for CI/QEMU +ENV MAKEFLAGS="-j2" +ENV R_PKG_INSTALL_ARGS="--no-build-vignettes --no-manual" + +# System libs (no Bioc via apt to avoid ABI/arch mismatches) +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ + apt-get update && apt-get install -y --no-install-recommends \ + build-essential gfortran cmake git wget pkg-config \ + ccache \ + libhts-dev \ + libcurl4-gnutls-dev libssl-dev libxml2-dev \ + libpng-dev libjpeg-dev libfreetype6-dev libfontconfig1-dev \ + libbz2-dev 
liblzma-dev zlib1g-dev \ + r-cran-remotes r-cran-pak r-cran-biocmanager \ + && rm -rf /var/lib/apt/lists/* + +# ccache +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache ccache -M 2G && \ + { echo 'CC=ccache gcc'; echo 'CXX=ccache g++'; echo 'FC=ccache gfortran'; } >> /usr/lib/R/etc/Makevars.site + +# ---- CRAN via r2u binaries first (fast) ---- +ENV CRAN_PKGS="adegenet curl DT dplyr vcfR \ +ggplot2 tidyr shiny config bs4Dash \ +golem purrr markdown scales plotly \ +shinyWidgets shinyjs shinydisconnect shinyalert \ +stringr updog AGHmatrix factoextra \ +httr future shinycssloaders RColorBrewer \ +tibble rrBLUP MASS Matrix matrixcalc BIGr" +RUN install2.r --skipinstalled --ncpus 1 $CRAN_PKGS || true + +# ---- Bioconductor from source (R 4.5 -> Bioc 3.21) ---- +RUN --mount=type=cache,target=/root/.cache/R/src Rscript - <<'RS' +options(Ncpus = 2) +repos <- BiocManager::repositories(version = "3.21") +options(repos = repos) +bioc_pkgs <- c("Rsamtools","Biostrings","pwalign","VariantAnnotation") +for (p in bioc_pkgs) { + if (!requireNamespace(p, quietly = TRUE)) { + message("Installing Bioconductor (source): ", p) + install.packages(p, dependencies = TRUE, type = "source", + INSTALL_opts = c("--no-build-vignettes","--no-manual")) + } +} +RS + +# ---- Fallback for any missing CRAN pkgs (source, sequential) ---- +RUN --mount=type=cache,target=/root/.cache/R/src Rscript - <<'RS' +pkgs <- strsplit(Sys.getenv("CRAN_PKGS"), " +")[[1]] +miss <- setdiff(pkgs, rownames(installed.packages())) +if (length(miss)) { + message("Installing remaining CRAN from source: ", paste(miss, collapse=", ")) + for (p in miss) { + install.packages(p, repos="https://cloud.r-project.org", type="source", + INSTALL_opts=c("--no-build-vignettes","--no-manual")) + } +} +RS + +# ---- GitHub dep (pak-free to avoid QEMU helper issues) ---- +RUN R -q -e "remotes::install_github('jendelman/GWASpoly', upgrade='never', dependencies=TRUE, \ + 
INSTALL_opts=c('--no-build-vignettes','--no-manual'))" + +# Version snapshot (handy for audits) +RUN R -q -e "pkgs <- c('adegenet','curl','DT','dplyr','vcfR','ggplot2','tidyr','shiny','config','bs4Dash', \ + 'golem','purrr','markdown','scales','plotly','shinyWidgets','shinyjs','shinydisconnect','shinyalert', \ + 'stringr','updog','AGHmatrix','factoextra','httr','future','shinycssloaders','RColorBrewer', \ + 'tibble','rrBLUP','MASS','Matrix','matrixcalc','BIGr', \ + 'Rsamtools','VariantAnnotation','Biostrings','pwalign'); \ + ip <- installed.packages(); dir.create('/opt/pins', recursive=TRUE, showWarnings=FALSE); \ + write.table(data.frame(Package=pkgs, Version=ip[pkgs,'Version']), \ + file='/opt/pins/deps_versions.tsv', sep='\t', row.names=FALSE, quote=FALSE)" diff --git a/inst/help_files/DAPC_cite.Rmd b/inst/help_files/DAPC_cite.Rmd index c590227..3e97a14 100644 --- a/inst/help_files/DAPC_cite.Rmd +++ b/inst/help_files/DAPC_cite.Rmd @@ -4,9 +4,9 @@ output: html_document date: "2024-08-29" --- -* **BIGapp** +* **BIGapp** and **BIGr** -* **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 * **vcfR** diff --git a/inst/help_files/DArT_Report2VCF_cite.Rmd b/inst/help_files/DArT_Report2VCF_cite.Rmd index 7ff8c10..ffcc9c0 100644 --- a/inst/help_files/DArT_Report2VCF_cite.Rmd +++ b/inst/help_files/DArT_Report2VCF_cite.Rmd @@ -4,10 +4,9 @@ output: html_document date: "2024-08-29" --- -* **BIGapp** +* **BIGapp** and **BIGr** - -* **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). 
BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 * **vcfR** diff --git a/inst/help_files/GWAS_cite.Rmd b/inst/help_files/GWAS_cite.Rmd index 837ece6..bf4f98f 100644 --- a/inst/help_files/GWAS_cite.Rmd +++ b/inst/help_files/GWAS_cite.Rmd @@ -6,6 +6,7 @@ date: "2024-08-29" * BIGapp +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 * GWASpoly package diff --git a/inst/help_files/Genomic_Diversity_cite.Rmd b/inst/help_files/Genomic_Diversity_cite.Rmd index 1870dbc..523d0c8 100644 --- a/inst/help_files/Genomic_Diversity_cite.Rmd +++ b/inst/help_files/Genomic_Diversity_cite.Rmd @@ -4,9 +4,9 @@ output: html_document date: "2024-08-29" --- -* **BIGapp** +* **BIGapp** and **BIGr** -* **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 * **vcfR** diff --git a/inst/help_files/Genomic_Prediction_cite.Rmd b/inst/help_files/Genomic_Prediction_cite.Rmd index 1e59726..bb06dd0 100644 --- a/inst/help_files/Genomic_Prediction_cite.Rmd +++ b/inst/help_files/Genomic_Prediction_cite.Rmd @@ -6,6 +6,8 @@ date: "2024-08-29" * **BIGapp** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). 
BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 + * **vcfR** Knaus BJ, Grünwald NJ (2017). “VCFR: a package to manipulate and visualize variant call format data in R.” Molecular Ecology Resources, 17(1), 44–53. ISSN 757, https://dx.doi.org/10.1111/1755-0998.12549. diff --git a/inst/help_files/PCA_cite.Rmd b/inst/help_files/PCA_cite.Rmd index d9e2272..a0c546e 100644 --- a/inst/help_files/PCA_cite.Rmd +++ b/inst/help_files/PCA_cite.Rmd @@ -4,9 +4,9 @@ output: html_document date: "2024-08-29" --- -- **BIGapp** +- **BIGapp** and **BIGr** -- **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 - **vcfR** - Knaus BJ, Grünwald NJ (2017). *VCFR: a package to manipulate and visualize variant call format data in R.* Molecular Ecology Resources, 17(1), 44–53. ISSN 757. [doi:10.1111/1755-0998.12549](https://dx.doi.org/10.1111/1755-0998.12549). diff --git a/inst/help_files/Predictive_Ability_cite.Rmd b/inst/help_files/Predictive_Ability_cite.Rmd index 2bdeda1..1dec9ef 100644 --- a/inst/help_files/Predictive_Ability_cite.Rmd +++ b/inst/help_files/Predictive_Ability_cite.Rmd @@ -4,7 +4,9 @@ output: html_document date: "2024-08-29" --- -* **BIGapp** +* **BIGapp** + +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. 
doi:https://doi.org/10.1002/tpg2.70067 * **vcfR** diff --git a/inst/help_files/Updog_Dosage_Calling_cite.Rmd b/inst/help_files/Updog_Dosage_Calling_cite.Rmd index e7ca342..0ba57e0 100644 --- a/inst/help_files/Updog_Dosage_Calling_cite.Rmd +++ b/inst/help_files/Updog_Dosage_Calling_cite.Rmd @@ -4,9 +4,9 @@ output: html_document date: "2024-08-29" --- -- **BIGapp** +- **BIGapp** and **BIGr** -- **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 - **Updog Package** diff --git a/inst/help_files/VCF_Filtering_cite.Rmd b/inst/help_files/VCF_Filtering_cite.Rmd index 390e900..1a9a0a9 100644 --- a/inst/help_files/VCF_Filtering_cite.Rmd +++ b/inst/help_files/VCF_Filtering_cite.Rmd @@ -5,9 +5,9 @@ date: "2024-08-29" --- -- **BIGapp** +- **BIGapp** and **BIGr** -- **BIGr** +Sandercock A.M., Peel M.D., Taniguti C.H., Chinchilla-Vargas J., Chen S., Sapkota M., Lin M., Zhao D., Ackerman A.J., Basnet B.R., Beil C.T., Sheehan M.J. (2025). BIGapp: A User-Friendly Genomic Tool Kit Identified Quantitative Trait Loci for Creeping Rootedness in Alfalfa (Medicago sativa L.)., The Plant Genome. doi:https://doi.org/10.1002/tpg2.70067 - **Updog** (if filtering parameters used) Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). *Genotyping Polyploids from Messy Sequencing Data*. Genetics, 210(3), 789-807. [doi: 10.1534/genetics.118.301468](https://doi.org/10.1534/genetics.118.301468). 
diff --git a/inst/iris_DArT_VCF.vcf.gz b/inst/iris_DArT_VCF.vcf.gz index 03de653..756e73f 100644 Binary files a/inst/iris_DArT_VCF.vcf.gz and b/inst/iris_DArT_VCF.vcf.gz differ diff --git a/inst/update_dep_image.sh b/inst/update_dep_image.sh new file mode 100644 index 0000000..094b96c --- /dev/null +++ b/inst/update_dep_image.sh @@ -0,0 +1,32 @@ +# The arm64 image is too large to load with Github actions +# So we build and push locally with the following commands + +# There is a file inside the image with all packages versions listed +# find it at /opt/pins/deps_versions.tsv + +# Make sure to login with BI user +IMAGE_DEPS=docker.io/breedinginsight/bigapp-deps +DEPS_TAG=r4.5-bioc3.21-2025-08 # Update version here + +# amd64 (load locally instead of pushing) +docker buildx build \ + -f Dockerfile.deps \ + --platform linux/amd64 \ + -t $IMAGE_DEPS:$DEPS_TAG-amd64 \ + --progress=plain \ + --load \ + . + +# arm64 (push or save to a tar; cannot load multi-arch to local daemon) +docker buildx build \ + -f Dockerfile.deps \ + --platform linux/arm64 \ + -t $IMAGE_DEPS:$DEPS_TAG-arm64 \ + --progress=plain \ + --push \ + . 
+ +docker buildx imagetools create \ + -t $IMAGE_DEPS:$DEPS_TAG \ + $IMAGE_DEPS:$DEPS_TAG-amd64 \ + $IMAGE_DEPS:$DEPS_TAG-arm64 \ No newline at end of file diff --git a/man/is_compressed_file.Rd b/man/is_compressed_file.Rd new file mode 100644 index 0000000..9a1b5e4 --- /dev/null +++ b/man/is_compressed_file.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{is_compressed_file} +\alias{is_compressed_file} +\title{Check if a File is Compressed} +\usage{ +is_compressed_file(file_path) +} +\arguments{ +\item{file_path}{A character string giving the path to the file to be checked.} +} +\value{ +A character string indicating the compression type (\code{"gzip (.gz)"}, +\code{"bzip2 (.bz2)"}, or \code{"xz (.xz)"}) if the file is compressed, or \code{FALSE} if +the file is not recognized as compressed. +} +\description{ +This function checks whether a given file is compressed by inspecting its +magic number (first few bytes). It detects common compression formats such as +gzip (\code{.gz}), bzip2 (\code{.bz2}), and xz (\code{.xz}). +} diff --git a/man/print.vcf_sanity_check.Rd b/man/print.vcf_sanity_check.Rd new file mode 100644 index 0000000..68ab2b9 --- /dev/null +++ b/man/print.vcf_sanity_check.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vcf_sanity_check.R +\name{print.vcf_sanity_check} +\alias{print.vcf_sanity_check} +\title{Print Method for vcf_sanity_check Objects} +\usage{ +\method{print}{vcf_sanity_check}(x, checks = names(x$checks), ...) +} +\arguments{ +\item{x}{A vcf_sanity_check object as returned by \code{vcf_sanity_check()}.} + +\item{checks}{A character vector of check names to display. Defaults to all checks in the object.} + +\item{...}{Additional arguments (currently ignored).} +} +\value{ +The input object, invisibly. Called for its side effect (printing to console). 
+} +\description{ +Displays a summary table of the VCF sanity check results, showing the check name, result (TRUE/FALSE), and the corresponding message for each selected check. +} diff --git a/man/read_geno_file.Rd b/man/read_geno_file.Rd index 8552c1d..cfb2d27 100644 --- a/man/read_geno_file.Rd +++ b/man/read_geno_file.Rd @@ -4,12 +4,16 @@ \alias{read_geno_file} \title{Read geno file} \usage{ -read_geno_file(file_path, requires = c("GT")) +read_geno_file(file_path, requires = c("GT"), ploidy, check = TRUE) } \arguments{ \item{file_path}{character indicanting path to file} \item{requires}{which information is required from the VCF. Define the FORMAT or INFO letters. Example: c("GT", "DP", "PL")} + +\item{ploidy}{numeric value indicating the ploidy of the species} + +\item{check}{logical indicating whether to perform checks on the VCF file} } \description{ Read geno file diff --git a/man/vcf_sanity_check.Rd b/man/vcf_sanity_check.Rd new file mode 100644 index 0000000..d6675d5 --- /dev/null +++ b/man/vcf_sanity_check.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vcf_sanity_check.R +\name{vcf_sanity_check} +\alias{vcf_sanity_check} +\title{Perform a Sanity Check on a VCF File} +\usage{ +vcf_sanity_check( + vcf_path, + n_data_lines = 100, + max_markers = 10000, + depth_support_fields = c("AD", "RA", "AO", "RO", "NR", "NV", "SB", "F1R2", "F2R1"), + verbose = FALSE +) +} +\arguments{ +\item{vcf_path}{A character string specifying the path to the VCF file. The file can be plain text or gzipped.} + +\item{n_data_lines}{An integer specifying the number of data lines to sample for detailed checks. Default is 100.} + +\item{max_markers}{An integer specifying the maximum number of markers allowed in the VCF file. Default is 10,000.} + +\item{depth_support_fields}{(Optional) A character vector of fields that are expected to be present in the FORMAT column for allele counts. 
Default is \code{c("AD", "RA", "AO", "RO", "NR", "NV", "SB", "F1R2", "F2R1")}.} + +\item{verbose}{A logical value indicating whether to print detailed messages during the checks. Default is FALSE.} +} +\value{ +A list containing: +\itemize{ +\item \code{checks}: A named vector indicating the results of each check (TRUE or FALSE). +\item \code{messages}: A data frame containing messages for each check, indicating success or failure. +\item \code{duplicates}: A list containing any duplicated sample or marker IDs found in the VCF file. +\item \code{ploidy_max}: The maximum ploidy detected from the genotype field, if applicable. +} +} +\description{ +This function performs a series of checks on a VCF file to ensure its validity and integrity. It verifies the presence of required headers, columns, and data fields, and checks for common issues such as missing or malformed data. +} +\details{ +The function performs the following checks: +\itemize{ +\item \strong{VCF_header}: Verifies the presence of the \verb{##fileformat} header. +\item \strong{VCF_compressed}: Checks if the VCF file is .gz compressed and if the extension is correct. +\item \strong{VCF_columns}: Ensures required columns (\verb{#CHROM}, \code{POS}, \code{ID}, \code{REF}, \code{ALT}, \code{QUAL}, \code{FILTER}, \code{INFO}) are present. +\item \strong{max_markers}: Checks if the total number of markers exceeds the specified limit. +\item \strong{unique_FORMAT}: Ensures that the FORMAT fields are consistent across sampled markers. +\item \strong{GT}: Verifies the presence of the \code{GT} (genotype) field in the FORMAT column. +\item \strong{allele_counts}: Checks for allele-level count fields (e.g., \code{AD}, \code{RA}, \code{AO}, \code{RO}). +\item \strong{samples}: Ensures sample/genotype columns are present. +\item \strong{chrom_info} and \strong{pos_info}: Verifies the presence of \code{CHROM} and \code{POS} columns. 
+\item \strong{ref_alt}: Ensures \code{REF} and \code{ALT} fields contain valid nucleotide codes. +\item \strong{multiallelics}: Identifies multiallelic sites (ALT field with commas). +\item \strong{phased_GT}: Checks for phased genotypes (presence of \code{|} in the \code{GT} field). +\item \strong{duplicated_samples}: Checks for duplicated sample IDs. +\item \strong{duplicated_markers}: Checks for duplicated marker IDs. +} +} diff --git a/man/vcf_sanity_messages.Rd b/man/vcf_sanity_messages.Rd new file mode 100644 index 0000000..1a0fb3e --- /dev/null +++ b/man/vcf_sanity_messages.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vcf_sanity_check.R +\name{vcf_sanity_messages} +\alias{vcf_sanity_messages} +\title{Generate Sanity Check Messages for VCF Files} +\usage{ +vcf_sanity_messages( + checks, + error_if_false = c("VCF_header", "VCF_columns", "max_markers", "GT", "allele_counts", + "samples", "chrom_info", "pos_info", "ref_alt", "mixed_ploidies"), + error_if_true = c("phased_GT", "duplicated_samples", "duplicated_markers"), + warning_if_true = c("multiallelics"), + warning_if_false = c("unique_FORMAT"), + input_ploidy = NULL +) +} +\arguments{ +\item{checks}{A \code{vcf_sanity_check} object containing the results of the sanity check.} + +\item{error_if_false}{(Optional) A character vector of checks that must be TRUE for the VCF file to pass validation.} + +\item{error_if_true}{(Optional) A character vector of checks that must be FALSE for the VCF file to pass validation.} + +\item{warning_if_true}{(Optional) A character vector of checks that should trigger a warning if TRUE.} + +\item{warning_if_false}{(Optional) A character vector of checks that should trigger a warning if FALSE.} + +\item{input_ploidy}{(Optional) An integer specifying the expected ploidy of the VCF file. 
If provided, the function will compare it with the detected ploidy.} +} +\value{ +A logical value indicating whether any of the specified checks failed (TRUE) or passed (FALSE). If any required checks are not met, an error alert is displayed. If any warnings are triggered, a warning alert is displayed. +} +\description{ +This function generates messages based on the results of a VCF sanity check. +} +\details{ +This function uses the results of a VCF sanity check to display appropriate messages to the user. It checks for required conditions that must be met (TRUE or FALSE) and optionally validates the ploidy of the VCF file. If any issues are detected, the function displays alerts using \code{shinyalert}. +} diff --git a/tests/testthat/test-DosageCall.R b/tests/testthat/test-DosageCall.R index 05ab2df..ef8a912 100644 --- a/tests/testthat/test-DosageCall.R +++ b/tests/testthat/test-DosageCall.R @@ -4,33 +4,34 @@ context("Dosage Calling") #library(vcfR) test_that("Dosage Calling from MADC file",{ - + future::plan(future::sequential) + madc_file <- system.file("iris_DArT_MADC.csv", package="BIGapp") - + output_name <- "output" ploidy <- 2 - cores <- 2 + cores <- 1 model_select <- "norm" # TODO: test for other models - + # Status #Import genotype info if genotype matrix format - + # Call the get_counts function with the specified MADC file path and output file path #Status result_df <- get_counts(madc_file, output_name) - + #Call the get_matrices function matrices <- get_matrices(result_df) - + mout <- updog::multidog(refmat = matrices$ref_matrix, sizemat = matrices$size_matrix, ploidy = as.numeric(ploidy), model = model_select, nc = cores) - + expect_equal(sum(mout$snpdf$bias), 402.7979, tolerance = 0.01) expect_equal(sum(mout$inddf$postmean), 95229.13, tolerance = 0.01) - + # Convert updog to VCF updog2vcf( multidog.object = mout, @@ -38,40 +39,40 @@ test_that("Dosage Calling from MADC file",{ updog_version = packageVersion("updog"), compress = TRUE ) - + vcf_result 
<- read.vcfR(paste0(output_name,".vcf.gz")) - + DP <- sum(as.numeric(extract.gt(vcf_result, "DP"))) - + expect_equal(DP, 23618990) - + MPP <- sum(as.numeric(extract.gt(vcf_result, "MPP"))) - + expect_equal(MPP, 74519.94, tolerance = 0.01) }) test_that("Dosage Calling from VCF file",{ - + madc_file <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") ploidy <- 2 model_select <- "norm" - cores <- 2 + cores <- 1 output_name <- "out" - + #Initialize matrices list matrices <- list() - + #Import genotype information if in VCF format vcf <- read.vcfR(madc_file) - + #Get items in FORMAT column info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - + info_ids <- extract_info_ids(info[1]) chrom <- vcf@fix[,1] pos <- vcf@fix[,2] - + if (("DP" %in% info_ids) && (("RA" %in% info_ids) | ("AD" %in% info_ids))) { #Extract DP and RA and convert to matrices matrices$size_matrix <- extract.gt(vcf, element = "DP") @@ -82,22 +83,22 @@ test_that("Dosage Calling from VCF file",{ matrices$ref_matrix <- matrix(sapply(strsplit(ad_matrix, ","), "[[", 1), nrow = nrow(matrices$size_matrix)) colnames(matrices$ref_matrix) <- colnames(matrices$size_matrix) } - + class(matrices$size_matrix) <- "numeric" class(matrices$ref_matrix) <- "numeric" rownames(matrices$size_matrix) <- rownames(matrices$ref_matrix) <- paste0(chrom, "_", pos) - + } - + mout <- updog::multidog(refmat = matrices$ref_matrix, sizemat = matrices$size_matrix, ploidy = as.numeric(ploidy), model = model_select, nc = cores) - + expect_equal(sum(mout$snpdf$bias), 402.79, tolerance = 0.01) expect_equal(sum(mout$inddf$postmean), 95229.13, tolerance = 0.01) - + # Convert updog to VCF BIGr::updog2vcf( multidog.object = mout, @@ -105,47 +106,68 @@ test_that("Dosage Calling from VCF file",{ updog_version = packageVersion("updog"), compress = TRUE ) - + vcf_result <- read.vcfR(paste0(output_name,".vcf.gz")) - + DP <- sum(as.numeric(extract.gt(vcf_result, "DP"))) expect_equal(DP, 23618990) - + MPP <- 
sum(as.numeric(extract.gt(vcf_result, "MPP"))) expect_equal(MPP, 74519.94, tolerance = 0.01) + +}) +test_that("polyRAD", { + #Variables + + madc_file <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + + polyrad_items <- polyRAD_dosage_call(vcf = madc_file, + ploidy = 2, + model = "IterateHWE") + + polyRAD2vcf(geno = polyrad_items$Genos, + model = "IterateHWE", + vcf_path = madc_file, + hindhe.obj = polyrad_items$RADHindHe, + ploidy = 2, + output.file = "temp") + + bgzip_compress("temp.vcf", file = ".") + }) -test_that("Dosage Calling from VCF file f1 and s1 model",{ +test_that("Dosage Calling from VCF file f1 and s1 model",{ + input <- list() madc_file <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") ploidy <- 2 - + # input$updog_model <- "s1" # input$parent <- "Sample_1" - + input$updog_model <- "f1" input$parent1 <- "Sample_1" input$parent2 <- "Sample_2" - - cores <- 2 + + cores <- 1 output_name <- "out2" - + #Initialize matrices list matrices <- list() - + #Import genotype information if in VCF format vcf <- read.vcfR(madc_file) - + #Get items in FORMAT column info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - + info_ids <- extract_info_ids(info[1]) chrom <- vcf@fix[,1] pos <- vcf@fix[,2] - + if (("DP" %in% info_ids) && (("RA" %in% info_ids) | ("AD" %in% info_ids))) { #Extract DP and RA and convert to matrices matrices$size_matrix <- extract.gt(vcf, element = "DP") @@ -156,13 +178,13 @@ test_that("Dosage Calling from VCF file f1 and s1 model",{ matrices$ref_matrix <- matrix(sapply(strsplit(ad_matrix, ","), "[[", 1), nrow = nrow(matrices$size_matrix)) colnames(matrices$ref_matrix) <- colnames(matrices$size_matrix) } - + class(matrices$size_matrix) <- "numeric" class(matrices$ref_matrix) <- "numeric" rownames(matrices$size_matrix) <- rownames(matrices$ref_matrix) <- paste0(chrom, "_", pos) - + } - + # Select parents if(input$updog_model == "s1" | input$updog_model == "s1pp"){ parents <- c(input$parent, NULL) @@ -171,7 +193,7 @@ 
test_that("Dosage Calling from VCF file f1 and s1 model",{ } else { parents <- c(NULL, NULL) } - + if (!all(parents %in% colnames(matrices$size_matrix))) { shinyalert( title = "Data Warning!", @@ -187,10 +209,10 @@ test_that("Dosage Calling from VCF file f1 and s1 model",{ showCancelButton = FALSE, animation = TRUE ) - + return() } - + mout <- updog::multidog(refmat = matrices$ref_matrix, sizemat = matrices$size_matrix, ploidy = as.numeric(ploidy), @@ -198,10 +220,10 @@ test_that("Dosage Calling from VCF file f1 and s1 model",{ p2_id = if(is.na(parents[2])) NULL else parents[2], model = input$updog_model, nc = cores) - + expect_equal(sum(mout$snpdf$bias), 408.5401, tolerance = 0.01) expect_equal(sum(mout$inddf$postmean), 94249.05, tolerance = 0.01) - + # Convert updog to VCF updog2vcf( multidog.object = mout, @@ -209,13 +231,13 @@ test_that("Dosage Calling from VCF file f1 and s1 model",{ updog_version = packageVersion("updog"), compress = TRUE ) - + vcf_result <- read.vcfR(paste0(output_name,".vcf.gz")) - + DP <- sum(as.numeric(extract.gt(vcf_result, "DP"))) expect_equal(DP, 23618990) - + MPP <- sum(as.numeric(extract.gt(vcf_result, "MPP"))) expect_equal(MPP, 74519.94, tolerance = 0.01) - + }) diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R index 97051a1..3ed7dcb 100644 --- a/tests/testthat/test-GSAcc.R +++ b/tests/testthat/test-GSAcc.R @@ -64,7 +64,7 @@ test_that("test Predictive Ability iris",{ #Getting genotype matrix #Geno.file conversion if needed - geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") + geno_snps <- BIGapp:::read_geno_file(input$pred_file$datapath, requires = "GT", ploidy = input$pred_ploidy, check = FALSE) geno <- geno_snps[[1]] pred_inputs$pred_snps <- geno_snps[[2]] # n markers pred_inputs$pred_genos <- ncol(geno) # n samples @@ -111,9 +111,9 @@ test_that("test Predictive Ability iris",{ input$pred_model <- "rrBLUP" # Convert genotype matrix according to ploidy and model used - geno_formated <- 
format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) + geno_formated <- BIGapp:::format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) - results <- run_predictive_model(geno = geno_formated, + results <- BIGapp:::run_predictive_model(geno = geno_formated, pheno = pred_inputs$pheno_input, selected_traits = input$pred_trait_info, predictive_model = input$pred_model, @@ -163,10 +163,10 @@ test_that("test Predictive Ability iris",{ advanced_options$pred_matrix <- input$pred_matrix # Convert genotype matrix according to ploidy and model used - geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) + geno_formated <- BIGapp:::format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) # Main function - results <- run_predictive_model(geno = geno_formated, + results <- BIGapp:::run_predictive_model(geno = geno_formated, pheno = pred_inputs$pheno_input, selected_traits = input$pred_trait_info, predictive_model = input$pred_model, @@ -203,8 +203,8 @@ test_that("test Predictive Ability iris",{ pred_outputs_gBLUP$comb_output <- average_accuracy_df # Compare rrBLUP and GBLUP - expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.01) - expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.01) + expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.02) + expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.02) expect_equal(sum(pred_outputs_gBLUP$avg_GEBVs[,2]), -0.594, tolerance = 0.01) expect_equal(sum(as.numeric(pred_outputs_gBLUP$all_GEBVs[,1])), -2.971, tolerance = 0.1) }) diff --git a/tests/testthat/test-vcf_sanity_check.R b/tests/testthat/test-vcf_sanity_check.R new file mode 100644 index 0000000..298b647 
--- /dev/null +++ b/tests/testthat/test-vcf_sanity_check.R @@ -0,0 +1,18 @@ +# Test for vcf_sanity_check function + +test_that("vcf_sanity_check works on a valid VCF", { + # Use a small example VCF file from inst/ or create a minimal one + vcf_path <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + + res <- vcf_sanity_check(vcf_path, n_data_lines = 100, max_markers = 10000, verbose = FALSE) + expect_s3_class(res, "vcf_sanity_check") + expect_true(res$checks["VCF_header"]) + expect_true(res$checks["VCF_columns"]) + expect_true(res$checks["max_markers"]) + expect_true(res$checks["samples"]) + expect_true(res$checks["chrom_info"]) + expect_true(res$checks["pos_info"]) + expect_false(res$checks["ref_alt"]) +}) + +