Skip to content

Commit be4f2ab

Browse files
Added thinSNP function
1 parent ffbbc98 commit be4f2ab

4 files changed

Lines changed: 215 additions & 0 deletions

File tree

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@ export(madc2gmat)
1616
export(madc2vcf_all)
1717
export(madc2vcf_targets)
1818
export(merge_MADCs)
19+
export(thinSNP)
1920
export(updog2vcf)
2021
import(dplyr)
2122
import(janitor)
2223
import(parallel)
2324
import(quadprog)
25+
import(rlang)
2426
import(tibble)
2527
import(tidyr)
2628
import(vcfR)

R/thinSNP.R

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#' Thin a dataframe of SNPs based on genomic position
2+
#'
3+
#' This function groups SNPs by chromosome, sorts them by physical position,
4+
#' and then iteratively selects SNPs such that no two selected SNPs within
5+
#' the same chromosome are closer than a specified minimum distance.
6+
#'
7+
#' @param df The input dataframe.
8+
#' @param chrom_col_name A string specifying the name of the chromosome column.
9+
#' @param pos_col_name A string specifying the name of the physical position column.
10+
#' @param min_distance A numeric value for the minimum distance between selected SNPs.
11+
#' The unit of this distance should match the unit of the `pos_col_name` column (e.g., base pairs).
12+
#'
13+
#' @import dplyr
14+
#' @import rlang
15+
#' @return A thinned dataframe with the same columns as the input.
16+
#'
17+
#' @examples
18+
#' # Create sample SNP data
19+
#' set.seed(123)
20+
#' n_snps <- 20
21+
#' snp_data <- data.frame(
22+
#' MarkerID = paste0("SNP", 1:n_snps),
23+
#' Chrom = sample(c("chr1", "chr2"), n_snps, replace = TRUE),
24+
#' ChromPosPhysical = c(
25+
#' sort(sample(1:1000, 5)), # SNPs on chr1
26+
#' sort(sample(1:1000, 5)) + 500, # More SNPs on chr1
27+
#' sort(sample(1:2000, 10)) # SNPs on chr2
28+
#' ),
29+
#' Allele = sample(c("A/T", "G/C"), n_snps, replace = TRUE)
30+
#' )
31+
#' # Ensure it's sorted by Chrom and ChromPosPhysical for clarity in example
32+
#' snp_data <- snp_data[order(snp_data$Chrom, snp_data$ChromPosPhysical), ]
33+
#' rownames(snp_data) <- NULL
34+
#'
35+
#' print("Original SNP data:")
36+
#' print(snp_data)
37+
#'
38+
#' # Thin the SNPs, keeping a minimum distance of 100 units (e.g., bp)
39+
#' thinned_snps <- thinSNP(
40+
#' df = snp_data,
41+
#' chrom_col_name = "Chrom",
42+
#' pos_col_name = "ChromPosPhysical",
43+
#' min_distance = 100
44+
#' )
45+
#'
46+
#' print("Thinned SNP data (min_distance = 100):")
47+
#' print(thinned_snps)
48+
#'
49+
#' # Thin with a larger distance
50+
#' thinned_snps_large_dist <- thinSNP(
51+
#' df = snp_data,
52+
#' chrom_col_name = "Chrom",
53+
#' pos_col_name = "ChromPosPhysical",
54+
#' min_distance = 500
55+
#' )
56+
#' print("Thinned SNP data (min_distance = 500):")
57+
#' print(thinned_snps_large_dist)
58+
#' @export
59+
thinSNP <- function(df, chrom_col_name, pos_col_name, min_distance) {
60+
# Convert column name strings to symbols for dplyr
61+
chrom_sym <- rlang::sym(chrom_col_name)
62+
pos_sym <- rlang::sym(pos_col_name)
63+
64+
df %>%
65+
dplyr::group_by(!!chrom_sym) %>%
66+
dplyr::arrange(!!pos_sym, .by_group = TRUE) %>%
67+
# Apply thinning logic to each chromosome group
68+
dplyr::group_modify(~ {
69+
# .x is the subset of data for the current chromosome, already sorted by position
70+
if (nrow(.x) == 0) {
71+
# Return an empty tibble with the same structure if the group is empty
72+
return(tibble::as_tibble(.x[0, , drop = FALSE]))
73+
}
74+
75+
# Vector to store indices of rows to keep
76+
kept_indices <- integer(nrow(.x))
77+
num_kept <- 0
78+
79+
# Always keep the first SNP in the sorted group
80+
num_kept <- num_kept + 1
81+
kept_indices[num_kept] <- 1
82+
last_selected_pos <- .x[[pos_col_name]][1] # Get position of the first SNP
83+
84+
# Iterate through the rest of the SNPs in the current chromosome
85+
if (nrow(.x) > 1) {
86+
for (i in 2:nrow(.x)) {
87+
current_pos <- .x[[pos_col_name]][i]
88+
# If current SNP is far enough from the last selected SNP, keep it
89+
if (current_pos >= last_selected_pos + min_distance) {
90+
num_kept <- num_kept + 1
91+
kept_indices[num_kept] <- i
92+
last_selected_pos <- current_pos
93+
}
94+
}
95+
}
96+
# Subset the group to include only the kept SNPs
97+
.x[kept_indices[1:num_kept], , drop = FALSE]
98+
}) %>%
99+
dplyr::ungroup() # Ungroup to return a single dataframe
100+
}

man/thinSNP.Rd

Lines changed: 68 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-thinSNP.R

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
context("Thinning")
2+
3+
4+
test_that("Thinning SNPs",{
5+
6+
##' # Create sample SNP data
7+
set.seed(123)
8+
n_snps <- 20
9+
snp_data <- data.frame(
10+
MarkerID = paste0("SNP", 1:n_snps),
11+
Chrom = sample(c("chr1", "chr2"), n_snps, replace = TRUE),
12+
ChromPosPhysical = c(
13+
sort(sample(1:1000, 5)), # SNPs on chr1
14+
sort(sample(1:1000, 5)) + 500, # More SNPs on chr1
15+
sort(sample(1:2000, 10)) # SNPs on chr2
16+
),
17+
Allele = sample(c("A/T", "G/C"), n_snps, replace = TRUE)
18+
)
19+
# Ensure it's sorted by Chrom and ChromPosPhysical
20+
snp_data <- snp_data[order(snp_data$Chrom, snp_data$ChromPosPhysical), ]
21+
rownames(snp_data) <- NULL
22+
23+
# Thin the SNPs, keeping a minimum distance of 100 units (e.g., bp)
24+
thinned_snps <- thinSNP(
25+
df = snp_data,
26+
chrom_col_name = "Chrom",
27+
pos_col_name = "ChromPosPhysical",
28+
min_distance = 100
29+
)
30+
31+
# Thin with a larger distance
32+
thinned_snps_large_dist <- thinSNP(
33+
df = snp_data,
34+
chrom_col_name = "Chrom",
35+
pos_col_name = "ChromPosPhysical",
36+
min_distance = 500
37+
)
38+
39+
# Check the results
40+
expect_true(all(dim(thinned_snps) == c(14, 4))) # Expect 14 SNPs
41+
expect_true(all(dim(thinned_snps_large_dist) == c(5, 4))) # Expect 5 SNPs
42+
expect_equal(ncol(thinned_snps), ncol(snp_data)) # Same number of columns
43+
44+
45+
})

0 commit comments

Comments
 (0)