Skip to content
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ export(check_ped)
export(check_replicates)
export(dosage2vcf)
export(dosage_ratios)
export(filterMADC)
export(filterVCF)
export(flip_dosage)
export(get_countsMADC)
export(imputation_concordance)
export(madc2gmat)
export(madc2vcf_all)
export(madc2vcf_targets)
export(merge_MADCs)
Expand All @@ -31,6 +33,7 @@ importFrom(pwalign,pairwiseAlignment)
importFrom(readr,read_csv)
importFrom(reshape2,dcast)
importFrom(reshape2,melt)
importFrom(rrBLUP,A.mat)
importFrom(stats,cor)
importFrom(stats,setNames)
importFrom(utils,packageVersion)
Expand Down
99 changes: 99 additions & 0 deletions R/filterMADC.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#' Filter MADC Files
#'
#' Filter and process MADC files to remove low quality microhaplotypes
#'
#' @details
#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
#' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
#' can scale and normalize the data in preparation for conversion to relationship matrices,
#' plot summary statistics, and save the filtered data to a file.
#'
#'@import dplyr
#'@importFrom utils read.csv
#'
#'@param madc_file Path to the MADC file to be filtered
#'@param min.mean.reads Minimum mean read depth for filtering
#'@param max.mean.reads Maximum mean read depth for filtering
#'@param max.match.mhaps Maximum number of matching mhaps per target loci
#'@param min.reads.per.site Minimum number of reads per site for filtering
#'@param min.ind.with.reads Minimum number of individuals with reads for filtering
#'@param target_only Logical indicating whether to filter for target loci only
#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs
#'@param plot.summary Logical indicating whether to plot summary statistics
#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
#'@param verbose Logical indicating whether to print additional information during processing
#'
#'@return data.frame or saved csv file
#'
#'@examples
#' #Example...
#'
#' ##Plots
#' #Mean read depth
#' #Number of Altmatch and Refmatch mhaps per target loci
#'
#'
#'@export
filterMADC <- function(madc_file,
min.mean.reads = NULL,
max.mean.reads = NULL,
max.match.mhaps = 10,
min.reads.per.site = NULL,
min.ind.with.reads = NULL,
target_only = FALSE,
fixed_allele_ids = FALSE,
plot.summary = FALSE,
output.file = NULL) {


#Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))

#Check if all entries in the first column are either blank or "*"
check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))

#Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
if (check_entries) {
#Note: This assumes that the first 7 rows are placeholder info from DArT processing

warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)

} else {

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)

}

#Remove refmatch and altmatch if wanted
if (target_only) {
message("Retaining target markers only")
#Retain only the Ref and Alt haplotypes
filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ]
}

## Filtering
if (!is.null(min.mean.reads)) {
message("Filtering for minimum mean reads across all samples")
filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ]
}

#Save the output to disk if file name provided
if (!is.null(output.file)) {
message("Saving filtered data to file")
write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE)
}

return(filtered_df)
}
106 changes: 106 additions & 0 deletions R/madc2gmat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#' Convert MADC Files to an Additive Genomic Relationship Matrix
#'
#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
#'
#'@details
#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package.
#' The resulting matrix can be used for genomic selection or other genetic analyses.
#'
#'@import dplyr
#'@importFrom utils read.csv write.csv
#'@importFrom rrBLUP A.mat
#'
#'@param madc_file Path to the MADC file to be filtered
#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
#'
#'@return data.frame or saved csv file
#'
#'@examples
#' #Example...
#'
#' ##Plots
#' #Mean read depth
#' #Number of Altmatch and Refmatch mhaps per target loci
#'
#'@references
#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
#'
#'@export
madc2gmat <- function(madc_file,
output.file = NULL) {


#Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))

#Check if all entries in the first column are either blank or "*"
check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))

#Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
if (check_entries) {
#Note: This assumes that the first 7 rows are placeholder info from DArT processing

warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
#filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
#filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID)

} else {

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
#filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
#filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)

}

#Removing extra columns
row.names(filtered_df) <- filtered_df$AlleleID
filtered_df <- filtered_df %>%
select(-c(AlleleID, CloneID, AlleleSequence))


#Scale and normalized data
message("Scaling and normalizing data to be -1,1")
#filtered_df <- filtered_df %>%
# mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric()))

# Function to scale a matrix to be between -1 and 1 for rrBLUP
scale_matrix <- function(mat) {
min_val <- min(mat)
max_val <- max(mat)

# Normalize to [0, 1]
normalized <- (mat - min_val) / (max_val - min_val)

# Scale to [-1, 1]
scaled <- 2 * normalized - 1

return(scaled)
}

# Apply the scaling function
filtered_df <- scale_matrix(filtered_df)

#Making additive relationship matrix
MADC.mat <- A.mat(t(filtered_df))

#Save the output to disk if file name provided
if (!is.null(output.file)) {
message("Saving filtered data to file")
write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE)
}

return(MADC.mat)
}
63 changes: 63 additions & 0 deletions man/filterMADC.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions man/madc2gmat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading