From d2fc874a9c4fa6c864725aa51f2261ce148daeda Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Fri, 16 May 2025 11:43:06 -0400
Subject: [PATCH 01/12] Added MADC Functions

---
 NAMESPACE         |   3 ++
 R/filterMADC.R    |  99 +++++++++++++++++++++++++++++++++++++++++++
 R/madc2gmat.R     | 106 ++++++++++++++++++++++++++++++++++++++++++++++
 man/filterMADC.Rd |  63 +++++++++++++++++++++++++++
 man/madc2gmat.Rd  |  35 +++++++++++++++
 5 files changed, 306 insertions(+)
 create mode 100644 R/filterMADC.R
 create mode 100644 R/madc2gmat.R
 create mode 100644 man/filterMADC.Rd
 create mode 100644 man/madc2gmat.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 2ca9c31..9b60af0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,10 +7,12 @@ export(check_ped)
 export(check_replicates)
 export(dosage2vcf)
 export(dosage_ratios)
+export(filterMADC)
 export(filterVCF)
 export(flip_dosage)
 export(get_countsMADC)
 export(imputation_concordance)
+export(madc2gmat)
 export(madc2vcf_all)
 export(madc2vcf_targets)
 export(merge_MADCs)
@@ -31,6 +33,7 @@ importFrom(pwalign,pairwiseAlignment)
 importFrom(readr,read_csv)
 importFrom(reshape2,dcast)
 importFrom(reshape2,melt)
+importFrom(rrBLUP,A.mat)
 importFrom(stats,cor)
 importFrom(stats,setNames)
 importFrom(utils,packageVersion)
diff --git a/R/filterMADC.R b/R/filterMADC.R
new file mode 100644
index 0000000..8c8259e
--- /dev/null
+++ b/R/filterMADC.R
@@ -0,0 +1,99 @@
+#' Filter MADC Files
+#'
+#' Filter and process MADC files to remove low quality microhaplotypes
+#'
+#' @details
+#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
+#' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
+#' can scale and normalize the data in preparation for conversion to relationship matrices,
+#' plot summary statistics, and save the filtered data to a file.
+#'
+#'@import dplyr
+#'@importFrom utils read.csv
+#'
+#'@param madc_file Path to the MADC file to be filtered
+#'@param min.mean.reads Minimum mean read depth for filtering
+#'@param max.mean.reads Maximum mean read depth for filtering
+#'@param max.match.mhaps Maximum number of matching mhaps per target loci
+#'@param min.reads.per.site Minimum number of reads per site for filtering
+#'@param min.ind.with.reads Minimum number of individuals with reads for filtering
+#'@param target_only Logical indicating whether to filter for target loci only
+#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs
+#'@param plot.summary Logical indicating whether to plot summary statistics
+#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
+#'@param verbose Logical indicating whether to print additional information during processing
+#'
+#'@return data.frame or saved csv file
+#'
+#'@examples
+#' #Example...
+#'
+#' ##Plots
+#' #Mean read depth
+#' #Number of Altmatch and Refmatch mhaps per target loci
+#'
+#'
+#'@export
+filterMADC <- function(madc_file,
+                       min.mean.reads = NULL,
+                       max.mean.reads = NULL,
+                       max.match.mhaps = 10,
+                       min.reads.per.site = NULL,
+                       min.ind.with.reads = NULL,
+                       target_only = FALSE,
+                       fixed_allele_ids = FALSE,
+                       plot.summary = FALSE,
+                       output.file = NULL) {
+
+
+  #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
+  first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
+
+  #Check if all entries in the first column are either blank or "*"
+  check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))
+
+  #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
+  if (check_entries) {
+    #Note: This assumes that the first 7 rows are placeholder info from DArT processing
+
+    warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+
+  } else {
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+
+  }
+
+  #Remove refmatch and altmatch if wanted
+  if (target_only) {
+    message("Retaining target markers only")
+    #Retain only the Ref and Alt haplotypes
+    filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ]
+  }
+
+  ## Filtering
+  if (!is.null(min.mean.reads)) {
+    message("Filtering for minimum mean reads across all samples")
+    filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ]
+  }
+
+  #Save the output to disk if file name provided
+  if (!is.null(output.file)) {
+    message("Saving filtered data to file")
+    write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE)
+  }
+
+  return(filtered_df)
+}
diff --git a/R/madc2gmat.R b/R/madc2gmat.R
new file mode 100644
index 0000000..b39b364
--- /dev/null
+++ b/R/madc2gmat.R
@@ -0,0 +1,106 @@
+#' Convert MADC Files to an Additive Genomic Relationship Matrix
+#'
+#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
+#'
+#'@details
+#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
+#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package.
+#' The resulting matrix can be used for genomic selection or other genetic analyses.
+#'
+#'@import dplyr
+#'@importFrom utils read.csv write.csv
+#'@importFrom rrBLUP A.mat
+#'
+#'@param madc_file Path to the MADC file to be filtered
+#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
+#'
+#'@return Additive genomic relationship matrix computed by \code{A.mat} (a numeric matrix, not a data.frame), or a saved csv file
+#'
+#'@examples
+#' #Example...
+#'
+#' ##Plots
+#' #Mean read depth
+#' #Number of Altmatch and Refmatch mhaps per target loci
+#'
+#'@references
+#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
+#'
+#'@export
+madc2gmat <- function(madc_file,
+                      output.file = NULL) {
+
+
+  #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
+  first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
+
+  #Check if all entries in the first column are either blank or "*"
+  check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))
+
+  #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
+  if (check_entries) {
+    #Note: This assumes that the first 7 rows are placeholder info from DArT processing
+
+    warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID)
+
+  } else {
+
+    #Read the madc file
+    filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
+
+    #Remove extra text after Ref and Alt (_001 or _002)
+    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
+    filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
+
+  }
+
+  #Removing extra columns
+  row.names(filtered_df) <- filtered_df$AlleleID
+  filtered_df <- filtered_df %>%
+    select(-c(AlleleID, CloneID, AlleleSequence))
+
+
+  #Scale and normalized data
+  message("Scaling and normalizing data to be -1,1")
+  #filtered_df <- filtered_df %>%
+  #  mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric()))
+
+  # Function to scale a matrix to be between -1 and 1 for rrBLUP
+  scale_matrix <- function(mat) {
+    min_val <- min(mat)
+    max_val <- max(mat)
+
+    # Normalize to [0, 1]
+    normalized <- (mat - min_val) / (max_val - min_val)
+
+    # Scale to [-1, 1]
+    scaled <- 2 * normalized - 1
+
+    return(scaled)
+  }
+
+  # Apply the scaling function
+  filtered_df <- scale_matrix(filtered_df)
+
+  #Making additive relationship matrix
+  MADC.mat <- A.mat(t(filtered_df))
+
+  #Save the output to disk if file name provided
+  if (!is.null(output.file)) {
+    message("Saving filtered data to file")
+    write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE)
+  }
+
+  return(MADC.mat)
+}
diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd
new file mode 100644
index 0000000..2b161dd
--- /dev/null
+++ b/man/filterMADC.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filterMADC.R
+\name{filterMADC}
+\alias{filterMADC}
+\title{Filter MADC Files}
+\usage{
+filterMADC(
+  madc_file,
+  min.mean.reads = NULL,
+  max.mean.reads = NULL,
+  max.match.mhaps = 10,
+  min.reads.per.site = NULL,
+  min.ind.with.reads = NULL,
+  target_only = FALSE,
+  fixed_allele_ids = FALSE,
+  plot.summary = FALSE,
+  output.file = NULL
+)
+}
+\arguments{
+\item{madc_file}{Path to the MADC file to be filtered}
+
+\item{min.mean.reads}{Minimum mean read depth for filtering}
+
+\item{max.mean.reads}{Maximum mean read depth for filtering}
+
+\item{max.match.mhaps}{Maximum number of matching mhaps per target loci}
+
+\item{min.reads.per.site}{Minimum number of reads per site for filtering}
+
+\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering}
+
+\item{target_only}{Logical indicating whether to filter for target loci only}
+
+\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs}
+
+\item{plot.summary}{Logical indicating whether to plot summary statistics}
+
+\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)}
+
+\item{verbose}{Logical indicating whether to print additional information during processing}
+}
+\value{
+data.frame or saved csv file
+}
+\description{
+Filter and process MADC files to remove low quality microhaplotypes
+}
+\details{
+This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
+it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
+can scale and normalize the data in preparation for conversion to relationship matrices,
+plot summary statistics, and save the filtered data to a file.
+}
+\examples{
+#Example...
+
+##Plots
+#Mean read depth
+#Number of Altmatch and Refmatch mhaps per target loci
+
+
+}
diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd
new file mode 100644
index 0000000..6bd0b4f
--- /dev/null
+++ b/man/madc2gmat.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/madc2gmat.R
+\name{madc2gmat}
+\alias{madc2gmat}
+\title{Convert MADC Files to an Additive Genomic Relationship Matrix}
+\usage{
+madc2gmat(madc_file, output.file = NULL)
+}
+\arguments{
+\item{madc_file}{Path to the MADC file to be filtered}
+
+\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)}
+}
+\value{
+Additive genomic relationship matrix computed by \code{A.mat} (a numeric matrix, not a data.frame), or a saved csv file
+}
+\description{
+Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
+}
+\details{
+This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
+then converts it into an additive genomic relationship matrix using the \code{A.mat} function from the \code{rrBLUP} package.
+The resulting matrix can be used for genomic selection or other genetic analyses.
+}
+\examples{
+#Example...
+
+##Plots
+#Mean read depth
+#Number of Altmatch and Refmatch mhaps per target loci
+
+}
+\references{
+Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
+}

From e3a312b8ac74b08da2704b327c9a4443bcd54a84 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Fri, 16 May 2025 11:48:38 -0400
Subject: [PATCH 02/12] update description

---
 DESCRIPTION | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 52220d3..7906d27 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -53,6 +53,7 @@ Imports:
     Rdpack (>= 0.7),
     readr (>= 2.1.5),
     reshape2 (>= 1.4.4),
+    rrBLUP,
     tidyr (>= 1.3.1),
     vcfR (>= 1.15.0),
     Rsamtools,

From 5d4097bebc9750b1bb40108293e5431a77610cf0 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Fri, 16 May 2025 12:58:51 -0400
Subject: [PATCH 03/12] added madc2gmat test

---
 R/madc2gmat.R                   | 11 ++++++++--
 man/madc2gmat.Rd                |  2 +-
 tests/testthat/test-madc2gmat.R | 38 +++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 3 deletions(-)
 create mode 100644 tests/testthat/test-madc2gmat.R

diff --git a/R/madc2gmat.R b/R/madc2gmat.R
index b39b364..39a8b54 100644
--- a/R/madc2gmat.R
+++ b/R/madc2gmat.R
@@ -28,8 +28,12 @@
 #'
 #'@export
 madc2gmat <- function(madc_file,
+                      seed = NULL,
                       output.file = NULL) {
-
+  #set seed if not null
+  if (!is.null(seed)) {
+    set.seed(seed)
+  }
 
   #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
   first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
@@ -96,11 +100,14 @@ madc2gmat <- function(madc_file,
   #Making additive relationship matrix
   MADC.mat <- A.mat(t(filtered_df))
 
+  rm(filtered_df)
+
   #Save the output to disk if file name provided
   if (!is.null(output.file)) {
     message("Saving filtered data to file")
     write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE)
+  } else {
+    return(MADC.mat)
   }
 
-  return(MADC.mat)
 }
diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd
index 6bd0b4f..d2cf998 100644
--- a/man/madc2gmat.Rd
+++ b/man/madc2gmat.Rd
@@ -4,7 +4,7 @@
 \alias{madc2gmat}
 \title{Convert MADC Files to an Additive Genomic Relationship Matrix}
 \usage{
-madc2gmat(madc_file, output.file = NULL)
+madc2gmat(madc_file, seed = NULL, output.file = NULL)
 }
 \arguments{
 \item{madc_file}{Path to the MADC file to be filtered}
diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R
new file mode 100644
index 0000000..5497f21
--- /dev/null
+++ b/tests/testthat/test-madc2gmat.R
@@ -0,0 +1,38 @@
+context("MADC 2 Gmatrix")
+
+
+test_that("madc2gmat returns and saves a square relationship matrix", {
+  #Input variables
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+
+  #Output path prefix (madc2gmat appends ".csv" to it)
+  temp <- tempfile()
+
+  # Converting to additive relationship matrix
+  gmat <- madc2gmat(madc_file,
+                    seed = 123,
+                    output.file = NULL)
+
+  #When output a file
+  madc2gmat(madc_file,
+            seed = 123,
+            output.file = temp)
+
+  #Test that a valid output was provided
+  expect_true(file.exists(paste0(temp, ".csv")))
+
+  #Check the returned matrix: 10 x 10, matching dimnames, entries summing to ~0
+  expect_true(is.matrix(gmat), info = "Output should be a matrix")
+  expect_equal(dim(gmat), c(10L, 10L))
+  expect_identical(rownames(gmat), colnames(gmat))
+  expect_equal(sum(gmat), 0, tolerance = 1e-8) #only float noise remains; its exact digits are platform dependent
+
+  # Read the output file (check.names = FALSE keeps sample names unmangled), then clean it up
+  output_data <- read.csv(paste0(temp,".csv"), row.names = 1, check.names = FALSE)
+  unlink(paste0(temp, ".csv"))
+
+  # Test the content of the output file
+  expect_equal(dim(output_data), c(10L, 10L))
+  expect_identical(row.names(output_data), colnames(output_data), info = "Row and column names in output file should be identical")
+  expect_equal(unname(as.matrix(output_data)), unname(gmat), tolerance = 1e-8) #file must hold the returned matrix
+})

From e0709ee44e0d7d1e972274feddd802c73d387b20 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Sat, 17 May 2025 08:48:48 -0400
Subject: [PATCH 04/12] cleaned

---
 R/madc2gmat.R | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/R/madc2gmat.R b/R/madc2gmat.R
index 39a8b54..c386b90 100644
--- a/R/madc2gmat.R
+++ b/R/madc2gmat.R
@@ -50,25 +50,16 @@ madc2gmat <- function(madc_file,
     #Read the madc file
     filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
 
-    #Remove extra text after Ref and Alt (_001 or _002)
-    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
-    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID)
-
   } else {
 
     #Read the madc file
     filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
-
-    #Remove extra text after Ref and Alt (_001 or _002)
-    #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
-    #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
-
   }
 
+  #Remove extra text after Ref and Alt (_001 or _002)
+  filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
+  filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
+
   #Removing extra columns
   row.names(filtered_df) <- filtered_df$AlleleID
   filtered_df <- filtered_df %>%
@@ -77,9 +68,6 @@ madc2gmat <- function(madc_file,
 
   #Scale and normalized data
   message("Scaling and normalizing data to be -1,1")
-  #filtered_df <- filtered_df %>%
-  #  mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric()))
-
   # Function to scale a matrix to be between -1 and 1 for rrBLUP
   scale_matrix <- function(mat) {
     min_val <- min(mat)

From ca6aa80750aefa278267d907865c0e4f8a676d40 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Sat, 17 May 2025 11:11:36 -0400
Subject: [PATCH 05/12] filterMADC added

---
 R/filterMADC.R    | 129 +++++++++++++++++++++++++++++++++++++++++-----
 man/filterMADC.Rd |  23 ++++-----
 2 files changed, 126 insertions(+), 26 deletions(-)

diff --git a/R/filterMADC.R b/R/filterMADC.R
index 8c8259e..3cc2472 100644
--- a/R/filterMADC.R
+++ b/R/filterMADC.R
@@ -5,8 +5,7 @@
 #' @details
 #' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
 #' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
-#' can scale and normalize the data in preparation for conversion to relationship matrices,
-#' plot summary statistics, and save the filtered data to a file.
+#' can plot summary statistics and save the filtered data to a file.
 #'
 #'@import dplyr
 #'@importFrom utils read.csv
@@ -14,14 +13,13 @@
 #'@param madc_file Path to the MADC file to be filtered
 #'@param min.mean.reads Minimum mean read depth for filtering
 #'@param max.mean.reads Maximum mean read depth for filtering
-#'@param max.match.mhaps Maximum number of matching mhaps per target loci
-#'@param min.reads.per.site Minimum number of reads per site for filtering
-#'@param min.ind.with.reads Minimum number of individuals with reads for filtering
-#'@param target_only Logical indicating whether to filter for target loci only
-#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs
+#'@param max.mhaps.per.loci Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold.
+#'@param min.reads.per.site Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored
+#'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering
+#'@param target.only Logical indicating whether to filter for target loci only
+#'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.
 #'@param plot.summary Logical indicating whether to plot summary statistics
 #'@param output.file Path to save the filtered data (if NULL, data will not be saved)
-#'@param verbose Logical indicating whether to print additional information during processing
 #'
 #'@return data.frame or saved csv file
 #'
@@ -37,11 +35,11 @@
 filterMADC <- function(madc_file,
                        min.mean.reads = NULL,
                        max.mean.reads = NULL,
-                       max.match.mhaps = 10,
-                       min.reads.per.site = NULL,
+                       max.mhaps.per.loci = NULL,
+                       min.reads.per.site = 1,
                        min.ind.with.reads = NULL,
-                       target_only = FALSE,
-                       fixed_allele_ids = FALSE,
+                       target.only = FALSE,
+                       n.summary.columns = NULL,
                        plot.summary = FALSE,
                        output.file = NULL) {
 
@@ -75,18 +73,123 @@ filterMADC <- function(madc_file,
     filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
 
   }
+  #Check for extra columns
+  #Save the three columns for later adding to the output
+  saved_columns <- filtered_df[,1:3]
+
+  if (!is.null(n.summary.columns)) {
+    #Drop the n.summary.columns columns that follow the three ID columns (drops nothing when it is 0)
+    filtered_df <- filtered_df[, setdiff(seq_len(ncol(filtered_df)), 3 + seq_len(n.summary.columns)), drop = FALSE]
+  }else{
+    rm.col <- c("ClusterConsensusSequence",
+                "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp",
+                "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp")
+    #No count supplied: drop whichever of the summary columns named in rm.col are present
+    filtered_df <- filtered_df[, !(colnames(filtered_df) %in% rm.col), drop = FALSE]
+  }
+
+  #Now add rownames
+  rownames(filtered_df) <- saved_columns[,1]
 
   #Remove refmatch and altmatch if wanted
-  if (target_only) {
+  if (target.only) {
     message("Retaining target markers only")
     #Retain only the Ref and Alt haplotypes
     filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ]
   }
 
   ## Filtering
+
+  #Min mean reads
   if (!is.null(min.mean.reads)) {
     message("Filtering for minimum mean reads across all samples")
+    #Get the mean value for each row, and remove the rows below that threshold
+    filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
     filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ]
+    #Remove the MeanReads column
+    filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")]
+  }
+
+  #Max mean reads
+  if (!is.null(max.mean.reads)) {
+    message("Filtering for maximum mean reads across all samples")
+    #Get the mean value for each row, and remove the rows above that threshold
+    filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
+    filtered_df <- filtered_df[filtered_df$MeanReads <= max.mean.reads, ]
+    #Remove the MeanReads column
+    filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")]
+  }
+
+  #Max mhaps per loci
+  if (!is.null(max.mhaps.per.loci)) {
+    message("Filtering for maximum number of matching mhaps per target loci")
+    #Get the number of matching mhaps for loci, and remove the mhaps at those loci that exceed the max number
+    mhap_counts <- filtered_df %>%
+      group_by(CloneID) %>%
+      summarise(Count = n(), .groups = 'drop') %>%
+      filter(Count > max.mhaps.per.loci)
+
+    patterns_to_search <- "\\|AltMatch|\\|RefMatch"
+    clone_ids_to_target <- mhap_counts$CloneID
+
+    filtered_df <- filtered_df %>%
+      filter(
+        !( # "keep rows that DO NOT match both conditions"
+          CloneID %in% clone_ids_to_target &  # Condition 1: CloneID is one of the targeted IDs
+            grepl(patterns_to_search, AlleleID) # Condition 2: AlleleID contains one of the patterns
+        )
+      )
+  }
+
+  #Min individuals with reads
+  if (!is.null(min.ind.with.reads)) {
+    message("Filtering for minimum number of individuals with reads per site")
+    message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads))
+    message(past0("Minimum number of reads per site: ", min.reads.per.site))
+
+    #Getting colnames
+    cols_to_check <- colnames(filtered_df)[-(1:3)]
+
+    filtered_df <- filtered_df %>%
+      rowwise() %>%  # Process data row by row
+      mutate(
+        # For each row, count how many of the 'cols_to_check' meet the criterion
+        qualifying_sites_count = sum(
+          c_across(all_of(cols_to_check)) >= min.reads.per.site,
+          na.rm = TRUE # Treats NAs in data as not meeting the criterion
+        )
+      ) %>%
+      ungroup() %>% # Always ungroup after rowwise operations
+      # Filter rows where this count meets the 'min.ind.with.reads' threshold
+      filter(qualifying_sites_count >= min.ind.with.reads) %>%
+      # Optionally, remove the temporary count column if it's no longer needed
+      select(-qualifying_sites_count)
+  }
+
+  #Plots
+  if (plot.summary) {
+    message("Plotting summary statistics")
+    #Plot mean read depth
+    mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
+    hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency")
+
+    #Plot number of Altmatch and Refmatch mhaps per target loci
+    altmatch_counts <- filtered_df %>%
+      filter(grepl("\\|AltMatch", AlleleID)) %>%
+      group_by(CloneID) %>%
+      summarise(Count = n(), .groups = 'drop')
+
+    refmatch_counts <- filtered_df %>%
+      filter(grepl("\\|RefMatch", AlleleID)) %>%
+      group_by(CloneID) %>%
+      summarise(Count = n(), .groups = 'drop')
+
+    barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE,
+            names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps",
+            xlab = "Clone ID", ylab = "Count")
+
+    #Plot density of number of CloneID per site on a marker distribution plot
+
   }
 
   #Save the output to disk if file name provided
diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd
index 2b161dd..c2df9d2 100644
--- a/man/filterMADC.Rd
+++ b/man/filterMADC.Rd
@@ -8,11 +8,11 @@ filterMADC(
   madc_file,
   min.mean.reads = NULL,
   max.mean.reads = NULL,
-  max.match.mhaps = 10,
-  min.reads.per.site = NULL,
+  max.mhaps.per.loci = NULL,
+  min.reads.per.site = 1,
   min.ind.with.reads = NULL,
-  target_only = FALSE,
-  fixed_allele_ids = FALSE,
+  target.only = FALSE,
+  n.summary.columns = NULL,
   plot.summary = FALSE,
   output.file = NULL
 )
@@ -24,21 +24,19 @@ filterMADC(
 
 \item{max.mean.reads}{Maximum mean read depth for filtering}
 
-\item{max.match.mhaps}{Maximum number of matching mhaps per target loci}
+\item{max.mhaps.per.loci}{Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold.}
 
-\item{min.reads.per.site}{Minimum number of reads per site for filtering}
+\item{min.reads.per.site}{Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored}
 
-\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering}
+\item{min.ind.with.reads}{Minimum number of individuals with \code{min.reads.per.site} reads for filtering}
 
-\item{target_only}{Logical indicating whether to filter for target loci only}
+\item{target.only}{Logical indicating whether to filter for target loci only}
 
-\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs}
+\item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.}
 
 \item{plot.summary}{Logical indicating whether to plot summary statistics}
 
 \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)}
-
-\item{verbose}{Logical indicating whether to print additional information during processing}
 }
 \value{
 data.frame or saved csv file
@@ -49,8 +47,7 @@ Filter and process MADC files to remove low quality microhaplotypes
 \details{
 This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
 it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users
-can scale and normalize the data in preparation for conversion to relationship matrices,
-plot summary statistics, and save the filtered data to a file.
+can plot summary statistics and save the filtered data to a file.
 }
 \examples{
 #Example...

From e4873fa97afcf36b0a2ae283858d51589b877f05 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Sat, 17 May 2025 11:47:24 -0400
Subject: [PATCH 06/12] added filterMADC tests

---
 R/filterMADC.R                   |  54 ++++++------
 man/filterMADC.Rd                |   3 -
 tests/testthat/test-filterMADC.R | 144 +++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+), 29 deletions(-)
 create mode 100644 tests/testthat/test-filterMADC.R

diff --git a/R/filterMADC.R b/R/filterMADC.R
index 3cc2472..2ab79dd 100644
--- a/R/filterMADC.R
+++ b/R/filterMADC.R
@@ -18,7 +18,7 @@
 #'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering
 #'@param target.only Logical indicating whether to filter for target loci only
 #'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.
-#'@param plot.summary Logical indicating whether to plot summary statistics
+#@param plot.summary Logical indicating whether to plot summary statistics
 #'@param output.file Path to save the filtered data (if NULL, data will not be saved)
 #'
 #'@return data.frame or saved csv file
@@ -40,7 +40,7 @@ filterMADC <- function(madc_file,
                        min.ind.with.reads = NULL,
                        target.only = FALSE,
                        n.summary.columns = NULL,
-                       plot.summary = FALSE,
+                       #plot.summary = FALSE,
                        output.file = NULL) {
 
 
@@ -144,8 +144,8 @@ filterMADC <- function(madc_file,
   #Min individuals with reads
   if (!is.null(min.ind.with.reads)) {
     message("Filtering for minimum number of individuals with reads per site")
-    message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads))
-    message(past0("Minimum number of reads per site: ", min.reads.per.site))
+    message(paste0("Minimum number of individuals with reads per site: ", min.ind.with.reads))
+    message(paste0("Minimum number of reads per site: ", min.reads.per.site))
 
     #Getting colnames
     cols_to_check <- colnames(filtered_df)[-(1:3)]
@@ -167,36 +167,38 @@ filterMADC <- function(madc_file,
   }
 
   #Plots
-  if (plot.summary) {
-    message("Plotting summary statistics")
-    #Plot mean read depth
-    mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
-    hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency")
-
-    #Plot number of Altmatch and Refmatch mhaps per target loci
-    altmatch_counts <- filtered_df %>%
-      filter(grepl("\\|AltMatch", AlleleID)) %>%
-      group_by(CloneID) %>%
-      summarise(Count = n(), .groups = 'drop')
-
-    refmatch_counts <- filtered_df %>%
-      filter(grepl("\\|RefMatch", AlleleID)) %>%
-      group_by(CloneID) %>%
-      summarise(Count = n(), .groups = 'drop')
-
-    barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE,
-            names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps",
-            xlab = "Clone ID", ylab = "Count")
+  #if (plot.summary) {
+  #  message("Plotting summary statistics")
+  #  #Plot mean read depth
+  #  mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
+  #  hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency")
+
+  #  #Plot number of Altmatch and Refmatch mhaps per target loci
+  #  altmatch_counts <- filtered_df %>%
+  #    filter(grepl("\\|AltMatch", AlleleID)) %>%
+  #    group_by(CloneID) %>%
+  #    summarise(Count = n(), .groups = 'drop')
+
+  #  refmatch_counts <- filtered_df %>%
+  #    filter(grepl("\\|RefMatch", AlleleID)) %>%
+  #    group_by(CloneID) %>%
+  #    summarise(Count = n(), .groups = 'drop')
+
+  #  barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE,
+  #          names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps",
+  #          xlab = "Clone ID", ylab = "Count")
 
     #Plot density of number of CloneID per site on a marker distribution plot
 
-  }
+  #}
 
   #Save the output to disk if file name provided
   if (!is.null(output.file)) {
     message("Saving filtered data to file")
     write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE)
+  } else {
+    message("No output file provided. Returning filtered data.")
+    return(filtered_df)
   }
 
-  return(filtered_df)
 }
diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd
index c2df9d2..a028653 100644
--- a/man/filterMADC.Rd
+++ b/man/filterMADC.Rd
@@ -13,7 +13,6 @@ filterMADC(
   min.ind.with.reads = NULL,
   target.only = FALSE,
   n.summary.columns = NULL,
-  plot.summary = FALSE,
   output.file = NULL
 )
 }
@@ -34,8 +33,6 @@ filterMADC(
 
 \item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.}
 
-\item{plot.summary}{Logical indicating whether to plot summary statistics}
-
 \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)}
 }
 \value{
diff --git a/tests/testthat/test-filterMADC.R b/tests/testthat/test-filterMADC.R
new file mode 100644
index 0000000..b7cf687
--- /dev/null
+++ b/tests/testthat/test-filterMADC.R
@@ -0,0 +1,144 @@
+context("Filter MADC")
+
+
+test_that("test filter madc",{
+  #Input: example MADC file with fixed allele IDs shipped with BIGr
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+
+  #Temporary path prefix used for the output-file check further down
+  temp <- tempfile()
+
+  # Filtering with target.only = TRUE
+  filtered_df <- filterMADC(madc_file,
+                         min.mean.reads = NULL,
+                         max.mean.reads = NULL,
+                         max.mhaps.per.loci = NULL,
+                         min.reads.per.site = 1,
+                         min.ind.with.reads = NULL,
+                         target.only = TRUE,
+                         n.summary.columns = NULL,
+                         output.file = NULL)
+
+
+  #Target-only filtering should leave 41 rows
+  expect_equal(nrow(filtered_df), 41)
+  #Check that it is a data.frame
+  expect_true(is.data.frame(filtered_df))
+
+  # Checking for no filtering (baseline row count, read total and column names)
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads = NULL,
+                            max.mean.reads = NULL,
+                            max.mhaps.per.loci = NULL,
+                            min.reads.per.site = 1,
+                            min.ind.with.reads = NULL,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 51)
+  expect_equal(sum(filtered_df[,-c(1:3)]), 53952)
+  expect_true(all(names(filtered_df[1:3]) == c("AlleleID", "CloneID", "AlleleSequence")))
+
+  #Checking for min.mean.reads filtering
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads = 10,
+                            max.mean.reads = NULL,
+                            max.mhaps.per.loci = NULL,
+                            min.reads.per.site = 1,
+                            min.ind.with.reads = NULL,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 36)
+  expect_equal(ncol(filtered_df), 13)
+
+  #Checking for max.mean.reads filtering
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads = NULL,
+                            max.mean.reads = 10,
+                            max.mhaps.per.loci = NULL,
+                            min.reads.per.site = 1,
+                            min.ind.with.reads = NULL,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 15)
+  expect_equal(ncol(filtered_df), 13)
+
+  #Checking for max.mhaps.per.loci filtering
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads = NULL,
+                            max.mean.reads = NULL,
+                            max.mhaps.per.loci = 3,
+                            min.reads.per.site = 1,
+                            min.ind.with.reads = NULL,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 44)
+  expect_equal(ncol(filtered_df), 13)
+
+  #Checking for min.ind.with.reads filtering (10 individuals, min.reads.per.site = 10)
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads = NULL,
+                            max.mean.reads = NULL,
+                            max.mhaps.per.loci = NULL,
+                            min.reads.per.site = 10,
+                            min.ind.with.reads = 10,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 9)
+  expect_equal(ncol(filtered_df), 13)
+  expect_equal(sum(filtered_df[,-c(1:3)]), 31642)
+
+  #Check that the output file is created (filterMADC appends ".csv" to output.file)
+  filterMADC(madc_file,
+                            min.mean.reads = NULL,
+                            max.mean.reads = NULL,
+                            max.mhaps.per.loci = NULL,
+                            min.reads.per.site = 1,
+                            min.ind.with.reads = NULL,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = temp)
+
+  expect_true(file.exists(paste0(temp,".csv")))
+
+  #Plot check disabled while plot.summary is commented out of filterMADC
+  #filtered_df <- filterMADC(madc_file,
+  #                          min.mean.reads = NULL,
+  #                          max.mean.reads = NULL,
+  #                          max.mhaps.per.loci = 3,
+  #                          min.reads.per.site = 1,
+  #                          min.ind.with.reads = NULL,
+  #                          target.only = FALSE,
+  #                          n.summary.columns = NULL,
+  #                          plot.summary = TRUE,
+  #                          output.file = NULL)
+
+  #expect_true(is.numeric(dev.cur()))
+  #expect_true(dev.cur() > 1)
+
+  #Now checking that all parameters can work together
+  filtered_df <- filterMADC(madc_file,
+                            min.mean.reads =1,
+                            max.mean.reads = 150,
+                            max.mhaps.per.loci = 3,
+                            min.reads.per.site = 10,
+                            min.ind.with.reads = 10,
+                            target.only = FALSE,
+                            n.summary.columns = NULL,
+                            output.file = NULL)
+
+  expect_equal(nrow(filtered_df), 3)
+  expect_equal(ncol(filtered_df), 13)
+  expect_equal(sum(filtered_df[,-c(1:3)]), 3960)
+
+
+})

From ee6bc5ec9af96bbbf8d330fcc5612993f8ce63f5 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Sat, 17 May 2025 11:50:12 -0400
Subject: [PATCH 07/12] Add example

---
 R/filterMADC.R    | 19 +++++++++++++++----
 man/filterMADC.Rd | 19 +++++++++++++++----
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/R/filterMADC.R b/R/filterMADC.R
index 2ab79dd..a9b2cec 100644
--- a/R/filterMADC.R
+++ b/R/filterMADC.R
@@ -24,11 +24,22 @@
 #'@return data.frame or saved csv file
 #'
 #'@examples
-#' #Example...
+#' #Example
+#'
+#' #Example MADC
+#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+#'
+#' #Remove mhaps exceeding 3 per target region including the ref and alt target mhaps
+#' filtered_df <- filterMADC(madc_file,
+#'                          min.mean.reads = NULL,
+#'                          max.mean.reads = NULL,
+#'                          max.mhaps.per.loci = 3,
+#'                          min.reads.per.site = 1,
+#'                          min.ind.with.reads = NULL,
+#'                          target.only = FALSE,
+#'                          n.summary.columns = NULL,
+#'                          output.file = NULL)
 #'
-#' ##Plots
-#' #Mean read depth
-#' #Number of Altmatch and Refmatch mhaps per target loci
 #'
 #'
 #'@export
diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd
index a028653..671a599 100644
--- a/man/filterMADC.Rd
+++ b/man/filterMADC.Rd
@@ -47,11 +47,22 @@ it can filter based on mean read depth, number of mhaps per target loci, and oth
 can plot summary statistics and save the filtered data to a file.
 }
 \examples{
-#Example...
+#Example
+
+#Example MADC
+madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+
+#Remove mhaps exceeding 3 per target region including the ref and alt target mhaps
+filtered_df <- filterMADC(madc_file,
+                         min.mean.reads = NULL,
+                         max.mean.reads = NULL,
+                         max.mhaps.per.loci = 3,
+                         min.reads.per.site = 1,
+                         min.ind.with.reads = NULL,
+                         target.only = FALSE,
+                         n.summary.columns = NULL,
+                         output.file = NULL)
 
-##Plots
-#Mean read depth
-#Number of Altmatch and Refmatch mhaps per target loci
 
 
 }

From 85f12101993b047f7971acb0b9f092d370f3a9be Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Mon, 19 May 2025 09:21:16 -0400
Subject: [PATCH 08/12] Example updates

---
 R/madc2gmat.R    | 13 +++++++++----
 README.md        |  1 +
 man/madc2gmat.Rd | 13 +++++++++----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/R/madc2gmat.R b/R/madc2gmat.R
index c386b90..2092155 100644
--- a/R/madc2gmat.R
+++ b/R/madc2gmat.R
@@ -17,11 +17,16 @@
 #'@return data.frame or saved csv file
 #'
 #'@examples
-#' #Example...
+#' #Input variables
+#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
 #'
-#' ##Plots
-#' #Mean read depth
-#' #Number of Altmatch and Refmatch mhaps per target loci
+#' #Calculations
+#' temp <- tempfile()
+#'
+#' # Converting to additive relationship matrix
+#' gmat <- madc2gmat(madc_file,
+#'                  seed = 123,
+#'                  output.file = NULL)
 #'
 #'@references
 #'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
diff --git a/README.md b/README.md
index 87fcdb2..e913a9c 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 [![R-CMD-check](https://github.com/Breeding-Insight/BIGr/workflows/R-CMD-check/badge.svg)](https://github.com/Breeding-Insight/BIGr/actions)
 ![GitHub Release](https://img.shields.io/github/v/release/Breeding-Insight/BIGr)
 [![Development Status](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg)
+[![CRAN Status Badge](https://www.r-pkg.org/badges/version/BIGr)](https://cran.r-project.org/package=BIGr)
 ![GitHub License](https://img.shields.io/github/license/Breeding-Insight/BIGr)
 [![codecov](https://app.codecov.io/gh/Breeding-Insight/BIGr/graph/badge.svg?token=PJUZMRN1NF)](https://app.codecov.io/gh/Breeding-Insight/BIGr)
 
diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd
index d2cf998..43e135e 100644
--- a/man/madc2gmat.Rd
+++ b/man/madc2gmat.Rd
@@ -23,11 +23,16 @@ then converts it into an additive genomic relationship matrix using the \code{A.
 The resulting matrix can be used for genomic selection or other genetic analyses.
 }
 \examples{
-#Example...
+#Input variables
+madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
 
-##Plots
-#Mean read depth
-#Number of Altmatch and Refmatch mhaps per target loci
+#Calculations
+temp <- tempfile()
+
+# Converting to additive relationship matrix
+gmat <- madc2gmat(madc_file,
+                 seed = 123,
+                 output.file = NULL)
 
 }
 \references{

From 0b54561e9776d790139357a7fba2d4a4eee3ca3c Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Mon, 19 May 2025 09:28:51 -0400
Subject: [PATCH 09/12] updated test

---
 tests/testthat/test-madc2gmat.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R
index 5497f21..0b447c3 100644
--- a/tests/testthat/test-madc2gmat.R
+++ b/tests/testthat/test-madc2gmat.R
@@ -34,5 +34,5 @@ test_that("test madc2gmat",{
   expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix")
   expect_true(all(dim(output_data) == c("10","10")))
   expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical")
-  expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-16)
+  expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-15)
 })

From c50d224b2e06e987584355087194408184fb8dee Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Mon, 19 May 2025 09:49:04 -0400
Subject: [PATCH 10/12] update documentation

---
 R/madc2gmat.R    | 1 +
 man/madc2gmat.Rd | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/R/madc2gmat.R b/R/madc2gmat.R
index 2092155..74e08c9 100644
--- a/R/madc2gmat.R
+++ b/R/madc2gmat.R
@@ -12,6 +12,7 @@
 #'@importFrom rrBLUP A.mat
 #'
 #'@param madc_file Path to the MADC file to be filtered
+#'@param seed Optional seed for random number generation (default is NULL)
 #'@param output.file Path to save the filtered data (if NULL, data will not be saved)
 #'
 #'@return data.frame or saved csv file
diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd
index 43e135e..c7eda35 100644
--- a/man/madc2gmat.Rd
+++ b/man/madc2gmat.Rd
@@ -9,6 +9,8 @@ madc2gmat(madc_file, seed = NULL, output.file = NULL)
 \arguments{
 \item{madc_file}{Path to the MADC file to be filtered}
 
+\item{seed}{Optional seed for random number generation (default is NULL)}
+
 \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)}
 }
 \value{

From 11877f9e2b024ad1a1241b4b8667dde9f78dab08 Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Mon, 19 May 2025 10:54:12 -0400
Subject: [PATCH 11/12] Update Description

---
 DESCRIPTION |  2 +-
 NEWS.md     | 53 +++++++++++++++++++++++++++++------------------------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7906d27..c582046 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: BIGr
 Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species
-Version: 0.5.5
+Version: 0.6.0
 Authors@R: c(person(given='Alexander M.',
                     family='Sandercock',
                     email='ams866@cornell.edu',
diff --git a/NEWS.md b/NEWS.md
index 02cb34e..6e7d1f8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,25 +1,21 @@
-# BIGr 0.3.3
+# BIGr 0.6.0
 
--   Adapt updog2vcf to model f1, f1pp, s1 and s1pp
+- Added new functions `filterMADC` (filter MADC files) and `madc2gmat` (convert a MADC file to an additive genomic relationship matrix)
 
-# BIGr 0.3.2
+# BIGr 0.5.5
 
--   updog2vcf function option to output compressed VCF (.vcf.gz) - set as default
--   remove need for defining ploidy
--   add metadata at the VCF header
+- Updated DESCRIPTION
+- Added return value for merge_MADCs
+- Added optional seed for check_ped
+- Added verbose option
 
-# BIGr 0.5.0
+# BIGr 0.5.4
 
--   Add imputation_concordance function to estimate accuracy of imputed and original dataset
--   Add get_OffTargets function to extract target and off-target SNPs from a MADC file
--   Add merge_MADCs function to merge two or more MADC files together
--   Improved documentation and examples for all functions
--   Add tests for all functions
+-   Updated dosage2vcf example
 
-# BIGr 0.5.1
+# BIGr 0.5.3
 
--   Improvements of testthat tests
--   Add check_replicates and check_homozygous_trios for pedigree relationship quality check
+-   Updated madc2vcf_all example
 
 # BIGr 0.5.2
 
@@ -27,17 +23,26 @@
 -   get_OffTargets function changed to madc2vcf_all
 -   Updates to testthat tests and function examples
 
-# BIGr 0.5.3
+# BIGr 0.5.1
 
--   Updated madc2vcf_all example
+-   Improvements of testthat tests
+-   Add check_replicates and check_homozygous_trios for pedigree relationship quality check
 
-# BIGr 0.5.4
+# BIGr 0.5.0
 
--   Updated dosage2vcf example
+-   Add imputation_concordance function to estimate accuracy of imputed and original dataset
+-   Add get_OffTargets function to extract target and off-target SNPs from a MADC file
+-   Add merge_MADCs function to merge two or more MADC files together
+-   Improved documentation and examples for all functions
+-   Add tests for all functions
 
-# BIGr 0.5.5
+# BIGr 0.3.3
+
+-   Adapt updog2vcf to model f1, f1pp, s1 and s1pp
+
+# BIGr 0.3.2
+
+-   updog2vcf function option to output compressed VCF (.vcf.gz) - set as default
+-   remove need for defining ploidy
+-   add metadata at the VCF header
 
-- Updated DESCRIPTION
-- Added return value for merge_MADCs
-- Added optional seed for check_ped
-- Added verbose option

From 1d4e3bbe171dec2506f84630bd2bfbab018393cf Mon Sep 17 00:00:00 2001
From: alex-sandercock <sandercock.alex@gmail.com>
Date: Wed, 21 May 2025 12:07:43 -0400
Subject: [PATCH 12/12] removed ref and alt name changes

---
 R/filterMADC.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/filterMADC.R b/R/filterMADC.R
index a9b2cec..5c1056f 100644
--- a/R/filterMADC.R
+++ b/R/filterMADC.R
@@ -71,8 +71,8 @@ filterMADC <- function(madc_file,
     filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
 
     #Remove extra text after Ref and Alt (_001 or _002)
-    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
 
   } else {
 
@@ -80,8 +80,8 @@ filterMADC <- function(madc_file,
     filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
 
     #Remove extra text after Ref and Alt (_001 or _002)
-    filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
+    #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)
 
   }
   #Check for extra columns