Skip to content

Commit

Permalink
improved (more accurate) depth calcs
Browse files Browse the repository at this point in the history
  • Loading branch information
pdimens committed Feb 28, 2025
1 parent 9305276 commit 3b9466d
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 26 deletions.
4 changes: 4 additions & 0 deletions harpy/_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def highlight_params(text):
return text.replace("container:", "[bold default]container:[/bold default]").rstrip()
if text.startswith(" shell:"):
return text.replace("shell:", "[bold default]shell:[/bold default]").rstrip()
if text.startswith(" wildcards:"):
return text.replace("wildcards:", "[bold default]wildcards:[/bold default]").rstrip()
if text.startswith(" affected files:"):
return text.replace("affected files:", "[bold default]affected files:[/bold default]").rstrip()
return text.rstrip()

def purge_empty_logs(target_dir):
Expand Down
4 changes: 2 additions & 2 deletions harpy/bin/molecule_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@

def new_intervals(contig_len, windowsize) -> list:
starts = list(range(0, contig_len + 1, windowsize))
ends = [i for i in starts[1:]]
ends = [i - 1 for i in starts[1:]]
if not ends or ends[-1] != contig_len:
ends.append(contig_len)
return [range(i,j) for i,j in zip(starts,ends)]
Expand All @@ -69,7 +69,7 @@ def which_overlap(start: int, end: int, binlist: list):
def print_depth_counts(contig, counter_obj, intervals):
"""Print the Counter object to stdout"""
for idx,int_bin in enumerate(intervals):
sys.stdout.write(f"{contig}\t{int_bin.stop}\t{counter_obj[idx]}\n")
sys.stdout.write(f"{contig}\t{int_bin.start}\t{int_bin.stop}\t{counter_obj[idx]/args.window}\n")

with gzip.open(args.statsfile, "rt") as statsfile:
aln_ranges = []
Expand Down
31 changes: 13 additions & 18 deletions harpy/reports/align_stats.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,8 @@ tb <- read.table(covfile, header = F)
if(nrow(tb) == 0){
print(paste("Input data file", covfile, "is empty"))
knitr::knit_exit()
} else {
tb <- tb[, c(1,2,3,5)]
}
tbmol <- read.table(molcovfile, header = F)
if(nrow(tbmol) == 0){
Expand All @@ -416,9 +418,9 @@ if(nrow(tbmol) == 0){
```

```{r setup_cov_table}
colnames(tb) <- c("contig", "position", "depth")
colnames(tbmol) <- c("contig", "position", "mol_depth")
tb <- merge(tb, tbmol, by = c("contig", "position"))
colnames(tb) <- c("contig", "position", "position_end", "depth")
colnames(tbmol) <- c("contig", "position", "position_end", "mol_depth")
tb <- merge(tb, tbmol, by = c("contig", "position", "position_end"))
q99 <- quantile(tb$depth, 0.99)
mol_q99 <- quantile(tb$mol_depth, 0.99)
names(q99) <- NULL
Expand All @@ -434,8 +436,8 @@ mol_global_avg <- mean(tb$mol_depth)
mol_global_sd <- sd(tb$mol_depth)
tb$outlier <- tb$depth > q99
outliers <- tb[tb$outlier, -5]
nonoutliers <- tb[!(tb$outlier), -5]
outliers <- tb[tb$outlier, -6]
nonoutliers <- tb[!(tb$outlier), -6]
contig_avg <- group_by(tb, contig) %>%
summarize(average = mean(depth), mol_average = mean(mol_depth), sdv = sd(depth), mol_sdv = sd(mol_depth))
Expand Down Expand Up @@ -608,13 +610,6 @@ DT::datatable(
:::

# Depth Plot
```{r disttext}
if(params$mol_dist > 0){
dist_text <- paste0("from the linked-read information with a distance threshold of ", params$mol_dist)
} else {
dist_text <- "by the internal EMA aligner heuristics"
}
```
##
::: {.card title="Depth and Coverage, Visualized" fill="false"}
This is a circular vizualization the depth information across up to 30 of the largest contigs (unless specific contigs were provided).
Expand All @@ -624,7 +619,7 @@ Each arc (segment) is a different contig, from position 0 to the end of the cont
The internal (grey) rings are a barplot where each bar represents the alignment depth at a `r windowskb` kilobase
genomic interval. The inner ring (grey bars) is the number of reads that had a _proper_ alignment in the `r windowskb` kilobase interval, where
"proper" refers to a read not marked as a duplicate or flagged with the SAM `UNMAP`, `SECONDARY`, or `QCFAIL` flags. The outer ring (magenta bars),
is the number of _molecules_ spanning that interval, where molecules are inferred `r dist_text`.
is the number of _molecules_ spanning that interval.
:::

##
Expand All @@ -639,7 +634,6 @@ be able to scroll the report instead of zooming on the plot.
###
```{r limit_contigs}
plot_contigs <- params$contigs
#plot_contigs <- "default"
# Find the 30 largest contigs
contigs <- group_by(tb, contig) %>%
summarize(size = max(position)) %>%
Expand Down Expand Up @@ -679,7 +673,7 @@ for (i in names(genomeChr)){
tracks <- tracks + BioCircosBarTrack(
paste0("cov_", i),
chromosome = i,
starts = chrcov$position - windowsize, ends = chrcov$position,
starts = chrcov$position, ends = chrcov$position_end,
values = pmin(chrcov$depth, q99),
color = "#757575",
minRadius = inner[1],
Expand All @@ -688,7 +682,7 @@ for (i in names(genomeChr)){
tracks <- tracks + BioCircosBarTrack(
paste0("molcov_", i),
chromosome = i,
starts = chrcov$position - windowsize, ends = chrcov$position,
starts = chrcov$position, ends = chrcov$position_end,
values = pmin(chrcov$mol_depth, mol_q99),
color = "#9c3b94",
minRadius = outer[1],
Expand Down Expand Up @@ -717,9 +711,10 @@ BioCircos(
chrPad = 0.02,
genomeTicksDisplay = F,
BARMouseOverColor = "#1cff42",
BARMouseOverTooltipsHtml05 = "Mean Depth: ",
BARMouseOverTooltipsHtml05 = "Depth: ",
genomeLabelDy = 0,
width = "100%"
width = "100%",
height = 1000
)
```

Expand Down
23 changes: 21 additions & 2 deletions harpy/snakefiles/align_bwa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,24 @@ rule samtools_faidx:
fi
"""

rule make_depth_intervals:
input:
f"Genome/{bn}.fai"
output:
outdir + "/reports/data/coverage/coverage.bed"
run:
with open(input[0], "r") as fai, open(output[0], "w") as bed:
for line in fai:
splitline = line.split()
contig = splitline[0]
length = int(splitline[1])
starts = list(range(0, length, windowsize))
ends = [i - 1 for i in starts[1:]]
if not ends or ends[-1] != length:
ends.append(length)
for start,end in zip(starts,ends):
bed.write(f"{contig}\t{start}\t{end}\n")

rule bwa_index:
input:
f"Genome/{bn}"
Expand Down Expand Up @@ -189,15 +207,16 @@ rule molecule_coverage:
rule alignment_coverage:
input:
bam = outdir + "/{sample}.bam",
bai = outdir + "/{sample}.bam.bai"
bai = outdir + "/{sample}.bam.bai",
bed = outdir + "/reports/data/coverage/coverage.bed"
output:
outdir + "/reports/data/coverage/{sample}.cov.gz"
params:
windowsize
container:
None
shell:
"samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}"
"samtools bedcov -c {input.bed} {input.bam} | awk '{{ $5 = $5 / {params}; print }}' | gzip > {output}"

rule report_config:
input:
Expand Down
23 changes: 21 additions & 2 deletions harpy/snakefiles/align_ema.smk
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,24 @@ rule bwa_index:
shell:
"bwa index {input} 2> {log}"

rule make_depth_intervals:
input:
f"Genome/{bn}.fai"
output:
outdir + "/reports/data/coverage/coverage.bed"
run:
with open(input[0], "r") as fai, open(output[0], "w") as bed:
for line in fai:
splitline = line.split()
contig = splitline[0]
length = int(splitline[1])
starts = list(range(0, length, windowsize))
ends = [i - 1 for i in starts[1:]]
if not ends or ends[-1] != length:
ends.append(length)
for start,end in zip(starts,ends):
bed.write(f"{contig}\t{start}\t{end}\n")

rule ema_count:
input:
get_fq
Expand Down Expand Up @@ -257,15 +275,16 @@ rule concat_alignments:
rule alignment_coverage:
input:
bam = outdir + "/{sample}.bam",
bai = outdir + "/{sample}.bam.bai"
bai = outdir + "/{sample}.bam.bai",
bed = outdir + "/reports/data/coverage/coverage.bed"
output:
outdir + "/reports/data/coverage/{sample}.cov.gz"
params:
windowsize
container:
None
shell:
"samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}"
"samtools bedcov -c {input.bed} {input.bam} | awk '{{ $5 = $5 / {params}; print }}' | gzip > {output}"

rule barcode_stats:
input:
Expand Down
23 changes: 21 additions & 2 deletions harpy/snakefiles/align_strobealign.smk
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,24 @@ rule index_genome:
shell:
"samtools faidx --fai-idx {output} {input} 2> {log}"

rule make_depth_intervals:
input:
f"Genome/{bn}.fai"
output:
outdir + "/reports/data/coverage/coverage.bed"
run:
with open(input[0], "r") as fai, open(output[0], "w") as bed:
for line in fai:
splitline = line.split()
contig = splitline[0]
length = int(splitline[1])
starts = list(range(0, length, windowsize))
ends = [i - 1 for i in starts[1:]]
if not ends or ends[-1] != length:
ends.append(length)
for start,end in zip(starts,ends):
bed.write(f"{contig}\t{start}\t{end}\n")

rule strobe_index:
input:
f"Genome/{bn}"
Expand Down Expand Up @@ -180,15 +198,16 @@ rule molecule_coverage:
rule alignment_coverage:
input:
bam = outdir + "/{sample}.bam",
bai = outdir + "/{sample}.bam.bai"
bai = outdir + "/{sample}.bam.bai",
bed = outdir + "/reports/data/coverage/coverage.bed"
output:
outdir + "/reports/data/coverage/{sample}.cov.gz"
params:
windowsize
container:
None
shell:
"samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}"
"samtools bedcov -c {input.bed} {input.bam} | awk '{{ $5 = $5 / {params}; print }}' | gzip > {output}"

rule report_config:
input:
Expand Down

0 comments on commit 3b9466d

Please sign in to comment.