Breeding-Insight · alex-sandercock · May 21, 2025 · May 16, 2025 · May 16, 2025 · May 16, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,10 +7,12 @@ export(check_ped)
 export(check_replicates)
 export(dosage2vcf)
 export(dosage_ratios)
+export(filterMADC)
 export(filterVCF)
 export(flip_dosage)
 export(get_countsMADC)
 export(imputation_concordance)
+export(madc2gmat)
 export(madc2vcf_all)
 export(madc2vcf_targets)
 export(merge_MADCs)
@@ -31,6 +33,7 @@ importFrom(pwalign,pairwiseAlignment)
 importFrom(readr,read_csv)
 importFrom(reshape2,dcast)
 importFrom(reshape2,melt)
+importFrom(rrBLUP,A.mat)
 importFrom(stats,cor)
 importFrom(stats,setNames)
 importFrom(utils,packageVersion)

diff --git a/R/filterMADC.R b/R/filterMADC.R
@@ -0,0 +1,99 @@
+#' Filter MADC Files
+#'
+#' Filter and process MADC files to remove low quality microhaplotypes
+#'
+#' @details
+#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
+#' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
+#' can scale and normalize the data in preparation for conversion to relationship matrices,
+#' plot summary statistics, and save the filtered data to a file.
+#'
+#'@import dplyr
+#'@importFrom utils read.csv
+#'
+#'@param madc_file Path to the MADC file to be filtered
+#'@param min.mean.reads Minimum mean read depth for filtering
+#'@param max.mean.reads Maximum mean read depth for filtering
+#'@param max.match.mhaps Maximum number of matching mhaps per target loci
+#'@param min.reads.per.site Minimum number of reads per site for filtering
+#'@param min.ind.with.reads Minimum number of individuals with reads for filtering
+#'@param target_only Logical indicating whether to filter for target loci only
+#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs
+#'@param plot.summary Logical indicating whether to plot summary statistics
+#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
+#'@param verbose Logical indicating whether to print additional information during processing
+#'
+#'@return data.frame or saved csv file
+#'
+#'@examples
+#' #Example...
+#'
+#' ##Plots
+#' #Mean read depth
+#' #Number of Altmatch and Refmatch mhaps per target loci
+#'
+#'
+#'@export
+filterMADC <- function(madc_file,
+                       min.mean.reads = NULL,
+                       max.mean.reads = NULL,
+                       max.match.mhaps = 10,
+                       min.reads.per.site = NULL,
+                       min.ind.with.reads = NULL,
+                       target_only = FALSE,
+                       fixed_allele_ids = FALSE,
+                       plot.summary = FALSE,
+                       output.file = NULL) {
+
+
+  #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
+  first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
+
+  #Check if all entries in the first column are either blank or "*"
+  check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))
+
+  #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
+  if (check_entries) {
+    #Note: This assumes that the first 7 rows are placeholder info from DArT processing
+
+    warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+
+  } else {
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+
+  }
+
+  #Remove refmatch and altmatch if wanted
+  if (target_only) {
+    message("Retaining target markers only")
+    #Retain only the Ref and Alt haplotypes
+    filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ]
+  }
+
+  ## Filtering
+  if (!is.null(min.mean.reads)) {
+    message("Filtering for minimum mean reads across all samples")
+    filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ]
+  }
+
+  #Save the output to disk if file name provided
+  if (!is.null(output.file)) {
+    message("Saving filtered data to file")
+    write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE)
+  }
+
+  return(filtered_df)
+}
diff --git a/R/madc2gmat.R b/R/madc2gmat.R
@@ -0,0 +1,106 @@
+#' Convert MADC Files to an Additive Genomic Relationship Matrix
+#'
+#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
+#'
+#'@details
+#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
+#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package.
+#' The resulting matrix can be used for genomic selection or other genetic analyses.
+#'
+#'@import dplyr
+#'@importFrom utils read.csv write.csv
+#'@importFrom rrBLUP A.mat
+#'
+#'@param madc_file Path to the MADC file to be filtered
+#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
+#'
+#'@return data.frame or saved csv file
+#'
+#'@examples
+#' #Example...
+#'
+#' ##Plots
+#' #Mean read depth
+#' #Number of Altmatch and Refmatch mhaps per target loci
+#'
+#'@references
+#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
+#'
+#'@export
+madc2gmat <- function(madc_file,
+                      output.file = NULL) {
+
+
+  #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
+  first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
+
+  #Check if all entries in the first column are either blank or "*"
+  check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))
+
+  #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
+  if (check_entries) {
+    #Note: This assumes that the first 7 rows are placeholder info from DArT processing
+
+    warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID)
+
+  } else {
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
+
+  }
+
+  #Removing extra columns
+  row.names(filtered_df) <- filtered_df$AlleleID
+  filtered_df <- filtered_df %>%
+    select(-c(AlleleID, CloneID, AlleleSequence))
+
+
+  #Scale and normalized data
+  message("Scaling and normalizing data to be -1,1")
+  #filtered_df <- filtered_df %>%
+  #  mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric()))
+
+  # Function to scale a matrix to be between -1 and 1 for rrBLUP
+  scale_matrix <- function(mat) {
+    min_val <- min(mat)
+    max_val <- max(mat)
+
+    # Normalize to [0, 1]
+    normalized <- (mat - min_val) / (max_val - min_val)
+
+    # Scale to [-1, 1]
+    scaled <- 2 * normalized - 1
+
+    return(scaled)
+  }
+
+  # Apply the scaling function
+  filtered_df <- scale_matrix(filtered_df)
+
+  #Making additive relationship matrix
+  MADC.mat <- A.mat(t(filtered_df))
+
+  #Save the output to disk if file name provided
+  if (!is.null(output.file)) {
+    message("Saving filtered data to file")
+    write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE)
+  }
+
+  return(MADC.mat)
+}
diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd
diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd