From d2fc874a9c4fa6c864725aa51f2261ce148daeda Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 11:43:06 -0400 Subject: [PATCH 01/12] Added MADC Functions --- NAMESPACE | 3 ++ R/filterMADC.R | 99 +++++++++++++++++++++++++++++++++++++++++++ R/madc2gmat.R | 106 ++++++++++++++++++++++++++++++++++++++++++++++ man/filterMADC.Rd | 63 +++++++++++++++++++++++++++ man/madc2gmat.Rd | 35 +++++++++++++++ 5 files changed, 306 insertions(+) create mode 100644 R/filterMADC.R create mode 100644 R/madc2gmat.R create mode 100644 man/filterMADC.Rd create mode 100644 man/madc2gmat.Rd diff --git a/NAMESPACE b/NAMESPACE index 2ca9c31..9b60af0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,10 +7,12 @@ export(check_ped) export(check_replicates) export(dosage2vcf) export(dosage_ratios) +export(filterMADC) export(filterVCF) export(flip_dosage) export(get_countsMADC) export(imputation_concordance) +export(madc2gmat) export(madc2vcf_all) export(madc2vcf_targets) export(merge_MADCs) @@ -31,6 +33,7 @@ importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(rrBLUP,A.mat) importFrom(stats,cor) importFrom(stats,setNames) importFrom(utils,packageVersion) diff --git a/R/filterMADC.R b/R/filterMADC.R new file mode 100644 index 0000000..8c8259e --- /dev/null +++ b/R/filterMADC.R @@ -0,0 +1,99 @@ +#' Filter MADC Files +#' +#' Filter and process MADC files to remove low quality microhaplotypes +#' +#' @details +#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, +#' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users +#' can scale and normalize the data in preparation for conversion to relationship matrices, +#' plot summary statistics, and save the filtered data to a file. 
+#' +#'@import dplyr +#'@importFrom utils read.csv +#' +#'@param madc_file Path to the MADC file to be filtered +#'@param min.mean.reads Minimum mean read depth for filtering +#'@param max.mean.reads Maximum mean read depth for filtering +#'@param max.match.mhaps Maximum number of matching mhaps per target loci +#'@param min.reads.per.site Minimum number of reads per site for filtering +#'@param min.ind.with.reads Minimum number of individuals with reads for filtering +#'@param target_only Logical indicating whether to filter for target loci only +#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs +#'@param plot.summary Logical indicating whether to plot summary statistics +#'@param output.file Path to save the filtered data (if NULL, data will not be saved) +#'@param verbose Logical indicating whether to print additional information during processing +#' +#'@return data.frame or saved csv file +#' +#'@examples +#' #Example... +#' +#' ##Plots +#' #Mean read depth +#' #Number of Altmatch and Refmatch mhaps per target loci +#' +#' +#'@export +filterMADC <- function(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.match.mhaps = 10, + min.reads.per.site = NULL, + min.ind.with.reads = NULL, + target_only = FALSE, + fixed_allele_ids = FALSE, + plot.summary = FALSE, + output.file = NULL) { + + + #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not + first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) + + #Check if all entries in the first column are either blank or "*" + check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline + if (check_entries) { + #Note: This assumes that the first 7 rows are placeholder info from DArT processing + + warning("The MADC file has not been pre-processed for Fixed Allele IDs. 
The first 7 rows are placeholder info from DArT processing.") + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + + } else { + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + + } + + #Remove refmatch and altmatch if wanted + if (target_only) { + message("Retaining target markers only") + #Retain only the Ref and Alt haplotypes + filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ] + } + + ## Filtering + if (!is.null(min.mean.reads)) { + message("Filtering for minimum mean reads across all samples") + filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ] + } + + #Save the output to disk if file name provided + if (!is.null(output.file)) { + message("Saving filtered data to file") + write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE) + } + + return(filtered_df) +} diff --git a/R/madc2gmat.R b/R/madc2gmat.R new file mode 100644 index 0000000..b39b364 --- /dev/null +++ b/R/madc2gmat.R @@ -0,0 +1,106 @@ +#' Convert MADC Files to an Additive Genomic Relationship Matrix +#' +#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. +#' +#'@details +#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and +#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package. +#' The resulting matrix can be used for genomic selection or other genetic analyses. 
+#' +#'@import dplyr +#'@importFrom utils read.csv write.csv +#'@importFrom rrBLUP A.mat +#' +#'@param madc_file Path to the MADC file to be filtered +#'@param output.file Path to save the filtered data (if NULL, data will not be saved) +#' +#'@return data.frame or saved csv file +#' +#'@examples +#' #Example... +#' +#' ##Plots +#' #Mean read depth +#' #Number of Altmatch and Refmatch mhaps per target loci +#' +#'@references +#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +#' +#'@export +madc2gmat <- function(madc_file, + output.file = NULL) { + + + #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not + first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) + + #Check if all entries in the first column are either blank or "*" + check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline + if (check_entries) { + #Note: This assumes that the first 7 rows are placeholder info from DArT processing + + warning("The MADC file has not been pre-processed for Fixed Allele IDs. 
The first 7 rows are placeholder info from DArT processing.") + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID) + + } else { + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) + + } + + #Removing extra columns + row.names(filtered_df) <- filtered_df$AlleleID + filtered_df <- filtered_df %>% + select(-c(AlleleID, CloneID, AlleleSequence)) + + + #Scale and normalized data + message("Scaling and normalizing data to be -1,1") + #filtered_df <- filtered_df %>% + # mutate(across(starts_with("MeanReads"), ~ scale(.) 
%>% as.numeric())) + + # Function to scale a matrix to be between -1 and 1 for rrBLUP + scale_matrix <- function(mat) { + min_val <- min(mat) + max_val <- max(mat) + + # Normalize to [0, 1] + normalized <- (mat - min_val) / (max_val - min_val) + + # Scale to [-1, 1] + scaled <- 2 * normalized - 1 + + return(scaled) + } + + # Apply the scaling function + filtered_df <- scale_matrix(filtered_df) + + #Making additive relationship matrix + MADC.mat <- A.mat(t(filtered_df)) + + #Save the output to disk if file name provided + if (!is.null(output.file)) { + message("Saving filtered data to file") + write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE) + } + + return(MADC.mat) +} diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd new file mode 100644 index 0000000..2b161dd --- /dev/null +++ b/man/filterMADC.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filterMADC.R +\name{filterMADC} +\alias{filterMADC} +\title{Filter MADC Files} +\usage{ +filterMADC( + madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.match.mhaps = 10, + min.reads.per.site = NULL, + min.ind.with.reads = NULL, + target_only = FALSE, + fixed_allele_ids = FALSE, + plot.summary = FALSE, + output.file = NULL +) +} +\arguments{ +\item{madc_file}{Path to the MADC file to be filtered} + +\item{min.mean.reads}{Minimum mean read depth for filtering} + +\item{max.mean.reads}{Maximum mean read depth for filtering} + +\item{max.match.mhaps}{Maximum number of matching mhaps per target loci} + +\item{min.reads.per.site}{Minimum number of reads per site for filtering} + +\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering} + +\item{target_only}{Logical indicating whether to filter for target loci only} + +\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs} + +\item{plot.summary}{Logical indicating whether to plot summary statistics} + 
+\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} + +\item{verbose}{Logical indicating whether to print additional information during processing} +} +\value{ +data.frame or saved csv file +} +\description{ +Filter and process MADC files to remove low quality microhaplotypes +} +\details{ +This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, +it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users +can scale and normalize the data in preparation for conversion to relationship matrices, +plot summary statistics, and save the filtered data to a file. +} +\examples{ +#Example... + +##Plots +#Mean read depth +#Number of Altmatch and Refmatch mhaps per target loci + + +} diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd new file mode 100644 index 0000000..6bd0b4f --- /dev/null +++ b/man/madc2gmat.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/madc2gmat.R +\name{madc2gmat} +\alias{madc2gmat} +\title{Convert MADC Files to an Additive Genomic Relationship Matrix} +\usage{ +madc2gmat(madc_file, output.file = NULL) +} +\arguments{ +\item{madc_file}{Path to the MADC file to be filtered} + +\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} +} +\value{ +data.frame or saved csv file +} +\description{ +Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. +} +\details{ +This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and +then converts it into an additive genomic relationship matrix using the \code{A.mat} function from the \code{rrBLUP} package. +The resulting matrix can be used for genomic selection or other genetic analyses. +} +\examples{ +#Example... 
+ +##Plots +#Mean read depth +#Number of Altmatch and Refmatch mhaps per target loci + +} +\references{ +Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +} From e3a312b8ac74b08da2704b327c9a4443bcd54a84 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 11:48:38 -0400 Subject: [PATCH 02/12] update description --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 52220d3..7906d27 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,6 +53,7 @@ Imports: Rdpack (>= 0.7), readr (>= 2.1.5), reshape2 (>= 1.4.4), + rrBLUP, tidyr (>= 1.3.1), vcfR (>= 1.15.0), Rsamtools, From 5d4097bebc9750b1bb40108293e5431a77610cf0 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 12:58:51 -0400 Subject: [PATCH 03/12] added madc2gmat test --- R/madc2gmat.R | 11 ++++++++-- man/madc2gmat.Rd | 2 +- tests/testthat/test-madc2gmat.R | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-madc2gmat.R diff --git a/R/madc2gmat.R b/R/madc2gmat.R index b39b364..39a8b54 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -28,8 +28,12 @@ #' #'@export madc2gmat <- function(madc_file, + seed = NULL, output.file = NULL) { - + #set seed if not null + if (!is.null(seed)) { + set.seed(seed) + } #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) @@ -96,11 +100,14 @@ madc2gmat <- function(madc_file, #Making additive relationship matrix MADC.mat <- A.mat(t(filtered_df)) + rm(filtered_df) + #Save the output to disk if file name provided if (!is.null(output.file)) { message("Saving filtered data to file") write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE) + } else { + return(MADC.mat) } - return(MADC.mat) } diff --git a/man/madc2gmat.Rd 
b/man/madc2gmat.Rd index 6bd0b4f..d2cf998 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -4,7 +4,7 @@ \alias{madc2gmat} \title{Convert MADC Files to an Additive Genomic Relationship Matrix} \usage{ -madc2gmat(madc_file, output.file = NULL) +madc2gmat(madc_file, seed = NULL, output.file = NULL) } \arguments{ \item{madc_file}{Path to the MADC file to be filtered} diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R new file mode 100644 index 0000000..5497f21 --- /dev/null +++ b/tests/testthat/test-madc2gmat.R @@ -0,0 +1,38 @@ +context("MADC 2 Gmatrix") + + +test_that("test madc2gmat",{ + #Input variables + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + + #Calculations + temp <- tempfile() + + # Converting to additive relationship matrix + gmat <- madc2gmat(madc_file, + seed = 123, + output.file = NULL) + + #When output a file + madc2gmat(madc_file, + seed = 123, + output.file = temp) + + #Test that a valid output was provided + expect_true(file.exists(paste0(temp, ".csv"))) + + #Check + expect_true(all(dim(gmat) == c("10","10"))) + expect_true(all(row.names(gmat) == row.names(gmat))) + expect_equal(sum(gmat), 1.00614e-16)#, tolerance = 1e-16) + expect_true(is.matrix(gmat), "Output should be a matrix") + + # Read the output file + output_data <- read.csv(paste0(temp,".csv"), row.names = 1) + + # Test the content of the output file + expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") + expect_true(all(dim(output_data) == c("10","10"))) + expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") + expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-16) +}) From e0709ee44e0d7d1e972274feddd802c73d387b20 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 08:48:48 -0400 Subject: [PATCH 04/12] cleaned --- R/madc2gmat.R | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 
deletions(-) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 39a8b54..c386b90 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -50,25 +50,16 @@ madc2gmat <- function(madc_file, #Read the madc file filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - #Remove extra text after Ref and Alt (_001 or _002) - #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID) - } else { #Read the madc file filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - #Remove extra text after Ref and Alt (_001 or _002) - #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) - } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) + #Removing extra columns row.names(filtered_df) <- filtered_df$AlleleID filtered_df <- filtered_df %>% @@ -77,9 +68,6 @@ madc2gmat <- function(madc_file, #Scale and normalized data message("Scaling and normalizing data to be -1,1") - #filtered_df <- filtered_df %>% - # mutate(across(starts_with("MeanReads"), ~ scale(.) 
%>% as.numeric())) - # Function to scale a matrix to be between -1 and 1 for rrBLUP scale_matrix <- function(mat) { min_val <- min(mat) From ca6aa80750aefa278267d907865c0e4f8a676d40 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:11:36 -0400 Subject: [PATCH 05/12] filterMADC added --- R/filterMADC.R | 129 +++++++++++++++++++++++++++++++++++++++++----- man/filterMADC.Rd | 23 ++++----- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index 8c8259e..3cc2472 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -5,8 +5,7 @@ #' @details #' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, #' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users -#' can scale and normalize the data in preparation for conversion to relationship matrices, -#' plot summary statistics, and save the filtered data to a file. +#' can plot summary statistics and save the filtered data to a file. #' #'@import dplyr #'@importFrom utils read.csv @@ -14,14 +13,13 @@ #'@param madc_file Path to the MADC file to be filtered #'@param min.mean.reads Minimum mean read depth for filtering #'@param max.mean.reads Maximum mean read depth for filtering -#'@param max.match.mhaps Maximum number of matching mhaps per target loci -#'@param min.reads.per.site Minimum number of reads per site for filtering -#'@param min.ind.with.reads Minimum number of individuals with reads for filtering -#'@param target_only Logical indicating whether to filter for target loci only -#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs +#'@param max.mhaps.per.loci Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold. 
+#'@param min.reads.per.site Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored +#'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering +#'@param target.only Logical indicating whether to filter for target loci only +#'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed. #'@param plot.summary Logical indicating whether to plot summary statistics #'@param output.file Path to save the filtered data (if NULL, data will not be saved) -#'@param verbose Logical indicating whether to print additional information during processing #' #'@return data.frame or saved csv file #' @@ -37,11 +35,11 @@ filterMADC <- function(madc_file, min.mean.reads = NULL, max.mean.reads = NULL, - max.match.mhaps = 10, - min.reads.per.site = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, min.ind.with.reads = NULL, - target_only = FALSE, - fixed_allele_ids = FALSE, + target.only = FALSE, + n.summary.columns = NULL, plot.summary = FALSE, output.file = NULL) { @@ -75,18 +73,123 @@ filterMADC <- function(madc_file, filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } + #Check for extra columns + #Save the three columns for later adding to the output + saved_columns <- filtered_df[,1:3] + + if (!is.null(n.summary.columns)) { + #Remove the first n.summary.columns columns + filtered_df <- filtered_df[,-c(4:n.summary.columns)] + }else{ + rm.col <- c("ClusterConsensusSequence", + "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp") + + filtered_df <- filtered_df[, !(colnames(filtered_df) %in% rm.col)] + } + + #Now add rownames + rownames(filtered_df) <- saved_columns[,1] #Remove refmatch and altmatch if wanted - if 
(target_only) { + if (target.only) { message("Retaining target markers only") #Retain only the Ref and Alt haplotypes filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ] } ## Filtering + + #Min mean reads if (!is.null(min.mean.reads)) { message("Filtering for minimum mean reads across all samples") + #Get the mean value for each row, and remove the rows below that threshold + filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ] + #Remove the MeanReads column + filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")] + } + + #Max mean reads + if (!is.null(max.mean.reads)) { + message("Filtering for maximum mean reads across all samples") + #Get the mean value for each row, and remove the rows above that threshold + filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + filtered_df <- filtered_df[filtered_df$MeanReads <= max.mean.reads, ] + #Remove the MeanReads column + filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")] + } + + #Max mhaps per loci + if (!is.null(max.mhaps.per.loci)) { + message("Filtering for maximum number of matching mhaps per target loci") + #Get the number of matching mhaps for loci, and remove the mhaps at those loci that exceed the max number + mhap_counts <- filtered_df %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') %>% + filter(Count > max.mhaps.per.loci) + + patterns_to_search <- "\\|AltMatch|\\|RefMatch" + clone_ids_to_target <- mhap_counts$CloneID + + filtered_df <- filtered_df %>% + filter( + !( # "keep rows that DO NOT match both conditions" + CloneID %in% clone_ids_to_target & # Condition 1: CloneID is one of the targeted IDs + grepl(patterns_to_search, AlleleID) # Condition 2: AlleleID contains one of the patterns + ) + ) + } + + #Min individuals with reads + if (!is.null(min.ind.with.reads)) { + message("Filtering for minimum 
number of individuals with reads per site") + message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) + message(past0("Minimum number of reads per site: ", min.reads.per.site)) + + #Getting colnames + cols_to_check <- colnames(filtered_df)[-(1:3)] + + filtered_df <- filtered_df %>% + rowwise() %>% # Process data row by row + mutate( + # For each row, count how many of the 'cols_to_check' meet the criterion + qualifying_sites_count = sum( + c_across(all_of(cols_to_check)) >= min.reads.per.site, + na.rm = TRUE # Treats NAs in data as not meeting the criterion + ) + ) %>% + ungroup() %>% # Always ungroup after rowwise operations + # Filter rows where this count meets the 'min.ind.with.reads' threshold + filter(qualifying_sites_count >= min.ind.with.reads) %>% + # Optionally, remove the temporary count column if it's no longer needed + select(-qualifying_sites_count) + } + + #Plots + if (plot.summary) { + message("Plotting summary statistics") + #Plot mean read depth + mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") + + #Plot number of Altmatch and Refmatch mhaps per target loci + altmatch_counts <- filtered_df %>% + filter(grepl("\\|AltMatch", AlleleID)) %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') + + refmatch_counts <- filtered_df %>% + filter(grepl("\\|RefMatch", AlleleID)) %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') + + barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, + names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", + xlab = "Clone ID", ylab = "Count") + + #Plot density of number of CloneID per site on a marker distribution plot + } #Save the output to disk if file name provided diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index 2b161dd..c2df9d2 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -8,11 
+8,11 @@ filterMADC( madc_file, min.mean.reads = NULL, max.mean.reads = NULL, - max.match.mhaps = 10, - min.reads.per.site = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, min.ind.with.reads = NULL, - target_only = FALSE, - fixed_allele_ids = FALSE, + target.only = FALSE, + n.summary.columns = NULL, plot.summary = FALSE, output.file = NULL ) @@ -24,21 +24,19 @@ filterMADC( \item{max.mean.reads}{Maximum mean read depth for filtering} -\item{max.match.mhaps}{Maximum number of matching mhaps per target loci} +\item{max.mhaps.per.loci}{Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold.} -\item{min.reads.per.site}{Minimum number of reads per site for filtering} +\item{min.reads.per.site}{Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored} -\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering} +\item{min.ind.with.reads}{Minimum number of individuals with \code{min.reads.per.site} reads for filtering} -\item{target_only}{Logical indicating whether to filter for target loci only} +\item{target.only}{Logical indicating whether to filter for target loci only} -\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs} +\item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. 
Otherwise, the columns will be automatically detected and removed.} \item{plot.summary}{Logical indicating whether to plot summary statistics} \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} - -\item{verbose}{Logical indicating whether to print additional information during processing} } \value{ data.frame or saved csv file @@ -49,8 +47,7 @@ Filter and process MADC files to remove low quality microhaplotypes \details{ This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users -can scale and normalize the data in preparation for conversion to relationship matrices, -plot summary statistics, and save the filtered data to a file. +can plot summary statistics and save the filtered data to a file. } \examples{ #Example... From e4873fa97afcf36b0a2ae283858d51589b877f05 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:47:24 -0400 Subject: [PATCH 06/12] added filterMADC tests --- R/filterMADC.R | 54 ++++++------ man/filterMADC.Rd | 3 - tests/testthat/test-filterMADC.R | 144 +++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 29 deletions(-) create mode 100644 tests/testthat/test-filterMADC.R diff --git a/R/filterMADC.R b/R/filterMADC.R index 3cc2472..2ab79dd 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -18,7 +18,7 @@ #'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering #'@param target.only Logical indicating whether to filter for target loci only #'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed. 
-#'@param plot.summary Logical indicating whether to plot summary statistics +#@param plot.summary Logical indicating whether to plot summary statistics #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file @@ -40,7 +40,7 @@ filterMADC <- function(madc_file, min.ind.with.reads = NULL, target.only = FALSE, n.summary.columns = NULL, - plot.summary = FALSE, + #plot.summary = FALSE, output.file = NULL) { @@ -144,8 +144,8 @@ filterMADC <- function(madc_file, #Min individuals with reads if (!is.null(min.ind.with.reads)) { message("Filtering for minimum number of individuals with reads per site") - message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) - message(past0("Minimum number of reads per site: ", min.reads.per.site)) + message(paste0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) + message(paste0("Minimum number of reads per site: ", min.reads.per.site)) #Getting colnames cols_to_check <- colnames(filtered_df)[-(1:3)] @@ -167,36 +167,38 @@ filterMADC <- function(madc_file, } #Plots - if (plot.summary) { - message("Plotting summary statistics") - #Plot mean read depth - mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) - hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") - - #Plot number of Altmatch and Refmatch mhaps per target loci - altmatch_counts <- filtered_df %>% - filter(grepl("\\|AltMatch", AlleleID)) %>% - group_by(CloneID) %>% - summarise(Count = n(), .groups = 'drop') - - refmatch_counts <- filtered_df %>% - filter(grepl("\\|RefMatch", AlleleID)) %>% - group_by(CloneID) %>% - summarise(Count = n(), .groups = 'drop') - - barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, - names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", - xlab = "Clone ID", ylab = "Count") + #if (plot.summary) { + # message("Plotting summary 
statistics") + # #Plot mean read depth + # mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + # hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") + + # #Plot number of Altmatch and Refmatch mhaps per target loci + # altmatch_counts <- filtered_df %>% + # filter(grepl("\\|AltMatch", AlleleID)) %>% + # group_by(CloneID) %>% + # summarise(Count = n(), .groups = 'drop') + + # refmatch_counts <- filtered_df %>% + # filter(grepl("\\|RefMatch", AlleleID)) %>% + # group_by(CloneID) %>% + # summarise(Count = n(), .groups = 'drop') + + # barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, + # names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", + # xlab = "Clone ID", ylab = "Count") #Plot density of number of CloneID per site on a marker distribution plot - } + #} #Save the output to disk if file name provided if (!is.null(output.file)) { message("Saving filtered data to file") write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE) + } else { + message("No output file provided. Returning filtered data.") + return(filtered_df) } - return(filtered_df) } diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index c2df9d2..a028653 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -13,7 +13,6 @@ filterMADC( min.ind.with.reads = NULL, target.only = FALSE, n.summary.columns = NULL, - plot.summary = FALSE, output.file = NULL ) } @@ -34,8 +33,6 @@ filterMADC( \item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. 
Otherwise, the columns will be automatically detected and removed.} -\item{plot.summary}{Logical indicating whether to plot summary statistics} - \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ diff --git a/tests/testthat/test-filterMADC.R b/tests/testthat/test-filterMADC.R new file mode 100644 index 0000000..b7cf687 --- /dev/null +++ b/tests/testthat/test-filterMADC.R @@ -0,0 +1,144 @@ +context("Filter MADC") + + +test_that("test filter madc",{ + #Input variables + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + + #Calculations + temp <- tempfile() + + # Filtering (target only) + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = TRUE, + n.summary.columns = NULL, + output.file = NULL) + + + #Test that a valid output was provided + expect_equal(nrow(filtered_df), 41) + #Check that it is a dataframe + expect_true(is.data.frame(filtered_df)) + + # Checking for no filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 51) + expect_equal(sum(filtered_df[,-c(1:3)]), 53952) + expect_true(all(names(filtered_df[1:3]) == c("AlleleID", "CloneID", "AlleleSequence"))) + + #Checking for min.mean.reads filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = 10, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 36) + expect_equal(ncol(filtered_df), 13) + + #Checking for max.mean.reads filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + 
max.mean.reads = 10, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 15) + expect_equal(ncol(filtered_df), 13) + + #Remove max mhaps + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = 3, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 44) + expect_equal(ncol(filtered_df), 13) + + #Remove min ind with reads + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 10, + min.ind.with.reads = 10, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 9) + expect_equal(ncol(filtered_df), 13) + expect_equal(sum(filtered_df[,-c(1:3)]), 31642) + + #Check that the output file is created + filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = temp) + + expect_true(file.exists(paste0(temp,".csv"))) + + #Check that the plots are created in the console + #filtered_df <- filterMADC(madc_file, + # min.mean.reads = NULL, + # max.mean.reads = NULL, + # max.mhaps.per.loci = 3, + # min.reads.per.site = 1, + # min.ind.with.reads = NULL, + # target.only = FALSE, + # n.summary.columns = NULL, + # plot.summary = TRUE, + # output.file = NULL) + + #expect_true(is.numeric(dev.cur())) + #expect_true(dev.cur() > 1) + + #Now checking that all parameters can work together + filtered_df <- filterMADC(madc_file, + min.mean.reads =1, + max.mean.reads = 150, + max.mhaps.per.loci = 3, + min.reads.per.site = 10, + min.ind.with.reads = 10, + target.only = FALSE, + n.summary.columns = 
NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 3) + expect_equal(ncol(filtered_df), 13) + expect_equal(sum(filtered_df[,-c(1:3)]), 3960) + + +}) From ee6bc5ec9af96bbbf8d330fcc5612993f8ce63f5 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:50:12 -0400 Subject: [PATCH 07/12] Add example --- R/filterMADC.R | 19 +++++++++++++++---- man/filterMADC.Rd | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index 2ab79dd..a9b2cec 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -24,11 +24,22 @@ #'@return data.frame or saved csv file #' #'@examples -#' #Example... +#' #Example +#' +#' #Example MADC +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") +#' +#' #Remove mhaps exceeding 3 per target region including the ref and alt target mhaps +#' filtered_df <- filterMADC(madc_file, +#' min.mean.reads = NULL, +#' max.mean.reads = NULL, +#' max.mhaps.per.loci = 3, +#' min.reads.per.site = 1, +#' min.ind.with.reads = NULL, +#' target.only = FALSE, +#' n.summary.columns = NULL, +#' output.file = NULL) #' -#' ##Plots -#' #Mean read depth -#' #Number of Altmatch and Refmatch mhaps per target loci #' #' #'@export diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index a028653..671a599 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -47,11 +47,22 @@ it can filter based on mean read depth, number of mhaps per target loci, and oth can plot summary statistics and save the filtered data to a file. } \examples{ -#Example... 
+#Example + +#Example MADC +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + +#Remove mhaps exceeding 3 per target region including the ref and alt target mhaps +filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = 3, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) -##Plots -#Mean read depth -#Number of Altmatch and Refmatch mhaps per target loci } From 85f12101993b047f7971acb0b9f092d370f3a9be Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:21:16 -0400 Subject: [PATCH 08/12] Example updates --- R/madc2gmat.R | 13 +++++++++---- README.md | 1 + man/madc2gmat.Rd | 13 +++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index c386b90..2092155 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -17,11 +17,16 @@ #'@return data.frame or saved csv file #' #'@examples -#' #Example... +#' #Input variables +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") #' -#' ##Plots -#' #Mean read depth -#' #Number of Altmatch and Refmatch mhaps per target loci +#' #Calculations +#' temp <- tempfile() +#' +#' # Converting to additive relationship matrix +#' gmat <- madc2gmat(madc_file, +#' seed = 123, +#' output.file = NULL) #' #'@references #'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). 
diff --git a/README.md b/README.md index 87fcdb2..e913a9c 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![R-CMD-check](https://github.com/Breeding-Insight/BIGr/workflows/R-CMD-check/badge.svg)](https://github.com/Breeding-Insight/BIGr/actions) ![GitHub Release](https://img.shields.io/github/v/release/Breeding-Insight/BIGr) [![Development Status](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) +[![CRAN Status Badge](https://www.r-pkg.org/badges/version/BIGr)](https://cran.r-project.org/package=BIGr) ![GitHub License](https://img.shields.io/github/license/Breeding-Insight/BIGr) [![codecov](https://app.codecov.io/gh/Breeding-Insight/BIGr/graph/badge.svg?token=PJUZMRN1NF)](https://app.codecov.io/gh/Breeding-Insight/BIGr) diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index d2cf998..43e135e 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -23,11 +23,16 @@ then converts it into an additive genomic relationship matrix using the \code{A. The resulting matrix can be used for genomic selection or other genetic analyses. } \examples{ -#Example... 
+#Input variables +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -##Plots -#Mean read depth -#Number of Altmatch and Refmatch mhaps per target loci +#Calculations +temp <- tempfile() + +# Converting to additive relationship matrix +gmat <- madc2gmat(madc_file, + seed = 123, + output.file = NULL) } \references{ From 0b54561e9776d790139357a7fba2d4a4eee3ca3c Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:28:51 -0400 Subject: [PATCH 09/12] updated test --- tests/testthat/test-madc2gmat.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 5497f21..0b447c3 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -34,5 +34,5 @@ test_that("test madc2gmat",{ expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") expect_true(all(dim(output_data) == c("10","10"))) expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") - expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-16) + expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-15) }) From c50d224b2e06e987584355087194408184fb8dee Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:49:04 -0400 Subject: [PATCH 10/12] update documentation --- R/madc2gmat.R | 1 + man/madc2gmat.Rd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 2092155..74e08c9 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -12,6 +12,7 @@ #'@importFrom rrBLUP A.mat #' #'@param madc_file Path to the MADC file to be filtered +#'@param seed Optional seed for random number generation (default is NULL) #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 43e135e..c7eda35 100644 --- 
a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -9,6 +9,8 @@ madc2gmat(madc_file, seed = NULL, output.file = NULL) \arguments{ \item{madc_file}{Path to the MADC file to be filtered} +\item{seed}{Optional seed for random number generation (default is NULL)} + \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ From 11877f9e2b024ad1a1241b4b8667dde9f78dab08 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 10:54:12 -0400 Subject: [PATCH 11/12] Update Description --- DESCRIPTION | 2 +- NEWS.md | 53 +++++++++++++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7906d27..c582046 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.5.5 +Version: 0.6.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='ams866@cornell.edu', diff --git a/NEWS.md b/NEWS.md index 02cb34e..6e7d1f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,25 +1,21 @@ -# BIGr 0.3.3 +# BIGr 0.6.0 -- Adapt updog2vcf to model f1, f1pp, s1 and s1pp +- Added new functions for filtering MADC files and converting to relationship matrices -# BIGr 0.3.2 +# BIGr 0.5.5 -- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default -- remove need for defining ploidy -- add metadata at the VCF header +- Updated DESCRIPTION +- Added return value for merge_MADCs +- Added optional seed for check_ped +- Added verbose option -# BIGr 0.5.0 +# BIGr 0.5.4 -- Add imputation_concordance function to estimate accuracy of imputed and original dataset -- Add get_OffTargets function to extract target and off-target SNPs from a MADC file -- Add merge_MADCs function to merge two or more MADC files together -- Improved documentation and examples for all functions -- Add tests for all functions +- Updated dosage2vcf example -# BIGr 0.5.1 +# BIGr 0.5.3 -- 
Improvements of testthat tests -- Add check_replicates and check_homozygous_trios for pedigree relationship quality check +- Updated madc2vcf_all example # BIGr 0.5.2 @@ -27,17 +23,26 @@ - get_OffTargets function changed to madc2vcf_all - Updates to testthat tests and function examples -# BIGr 0.5.3 +# BIGr 0.5.1 -- Updated madc2vcf_all example +- Improvements of testthat tests +- Add check_replicates and check_homozygous_trios for pedigree relationship quality check -# BIGr 0.5.4 +# BIGr 0.5.0 -- Updated dosage2vcf example +- Add imputation_concordance function to estimate accuracy of imputed and original dataset +- Add get_OffTargets function to extract target and off-target SNPs from a MADC file +- Add merge_MADCs function to merge two or more MADC files together +- Improved documentation and examples for all functions +- Add tests for all functions -# BIGr 0.5.5 +# BIGr 0.3.3 + +- Adapt updog2vcf to model f1, f1pp, s1 and s1pp + +# BIGr 0.3.2 + +- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default +- remove need for defining ploidy +- add metadata at the VCF header -- Updated DESCRIPTION -- Added return value for merge_MADCs -- Added optional seed for check_ped -- Added verbose option From 1d4e3bbe171dec2506f84630bd2bfbab018393cf Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 21 May 2025 12:07:43 -0400 Subject: [PATCH 12/12] removed ref and alt name changes --- R/filterMADC.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index a9b2cec..5c1056f 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -71,8 +71,8 @@ filterMADC <- function(madc_file, filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", 
filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } else { @@ -80,8 +80,8 @@ filterMADC <- function(madc_file, filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } #Check for extra columns