You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Not really an issue, just showing some performance metrics of hifiasm with a simulated diploid assembly of the T2T chr8 sequence, with simulated PacBio HiFi reads. Used default hifiasm settings except one more round of error correction and -l3 purging.
# get chr8 T2T sequence
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/016/894/425/GCA_016894425.1_ASM1689442v1/GCA_016894425.1_ASM1689442v1_genomic.fna.gz
# convert masked bases to upper case
~/bin/seqtk/seqtk seq -U GCA_016894425.1_ASM1689442v1_genomic.fna.gz | \
gzip > GCA_016894425.1_ASM1689442v1_genomic.fna_upper.gz
# convert to diploid and add about 2% heterozygosity rate
~/bin/bbmap-38.90/mutate.sh \
in=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.gz \
ow=t \
vcf=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.vcf.gz \
out=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.gz \
ploidy=2 \
subrate=0.0192 \
indelrate=0.001 \
maxindel=20 \
nohomopolymers=t \
hetrate=1 2> GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.log.txt
# genome size
gunzip -c GCA_016894425.1_ASM1689442v1_genomic.fna_upper.fasta.gz | \
grep -v ">"|wc -m
# 146259671
# 2% heterozygosity is how many bases
0.02\*146259671
# 2925193.42
# number of mutations added
gunzip -c GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.vcf.gz| \
grep -v "^#"|wc -l
# 2942229
# ~2% het rate
Originally wrote Q30-Q40 when actually the next command makes Q20-Q30 reads
# simulate 15x per haplotype coverage of PacBio HiFi reads (Q20-Q30, 9000-12000 bases)
~/bin/bbmap-38.90/randomreads.sh \
ow=t seed=1 \
ref=GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.gz \
illuminanames=t addslash=t pacbio=t pbmin=0.001 pbmax=0.01 \
coverage=15 paired=f gaussianlength=t minlength=9000 midlength=10000 \
maxlength=12000 out=hifi-30x.fasta.gz
# assemble with
# hifiasm 0.15.1-r331
~/bin/hifiasm-new/hifiasm -r 4 -l3 -t 32 -o hifi-30x.fasta hifi-30x.fasta.gz
# look at haplotype assembly stats with gfatools Version: 0.4-r214-dirty and BBStats.sh from BBMAP 38.90
~/bin/gfatools/gfatools gfa2fa hifi-30x.fasta.bp.hap1.p_ctg.gfa |~/bin/bbmap-38.90/bbstats.sh in=stdin
#A C G T N IUPAC Other GC GC_stdev
#0.2985 0.2017 0.2020 0.2978 0.0000 0.0000 0.0000 0.4037 0.0249
#Main genome scaffold total: 5
#Main genome contig total: 5
#Main genome scaffold sequence total: 146.263 MB
#Main genome contig sequence total: 146.263 MB 0.000% gap
#Main genome scaffold N/L50: 2/42.083 MB
#Main genome contig N/L50: 2/42.083 MB
#Main genome scaffold N/L90: 5/15.892 MB
#Main genome contig N/L90: 5/15.892 MB
#Max scaffold length: 52.495 MB
#Max contig length: 52.495 MB
#Number of scaffolds > 50 KB: 5
#% main genome in scaffolds > 50 KB: 100.00%
#Minimum Number Number Total Total Scaffold
#Scaffold of of Scaffold Contig Contig
#Length Scaffolds Contigs Length Length Coverage
#-------- -------------- -------------- -------------- -------------- --------
# All 5 5 146,263,292 146,263,292 100.00%
# 25 KB 5 5 146,263,292 146,263,292 100.00%
# 50 KB 5 5 146,263,292 146,263,292 100.00%
# 100 KB 5 5 146,263,292 146,263,292 100.00%
# 250 KB 5 5 146,263,292 146,263,292 100.00%
# 500 KB 5 5 146,263,292 146,263,292 100.00%
# 1 MB 5 5 146,263,292 146,263,292 100.00%
# 2.5 MB 5 5 146,263,292 146,263,292 100.00%
# 5 MB 5 5 146,263,292 146,263,292 100.00%
# 10 MB 5 5 146,263,292 146,263,292 100.00%
# 25 MB 2 2 94,577,088 94,577,088 100.00%
# 50 MB 1 1 52,494,515 52,494,515 100.00%
~/bin/gfatools/gfatools gfa2fa hifi-30x.fasta.bp.hap2.p_ctg.gfa |~/bin/bbmap-38.90/bbstats.sh in=stdin
#A C G T N IUPAC Other GC GC_stdev
#0.2982 0.2016 0.2021 0.2981 0.0000 0.0000 0.0000 0.4037 0.0000
#Main genome scaffold total: 1
#Main genome contig total: 1
#Main genome scaffold sequence total: 146.265 MB
#Main genome contig sequence total: 146.265 MB 0.000% gap
#Main genome scaffold N/L50: 1/146.265 MB
#Main genome contig N/L50: 1/146.265 MB
#Main genome scaffold N/L90: 1/146.265 MB
#Main genome contig N/L90: 1/146.265 MB
#Max scaffold length: 146.265 MB
#Max contig length: 146.265 MB
#Number of scaffolds > 50 KB: 1
#% main genome in scaffolds > 50 KB: 100.00%
#Minimum Number Number Total Total Scaffold
#Scaffold of of Scaffold Contig Contig
#Length Scaffolds Contigs Length Length Coverage
#-------- -------------- -------------- -------------- -------------- --------
# All 1 1 146,265,156 146,265,156 100.00%
# 25 KB 1 1 146,265,156 146,265,156 100.00%
# 50 KB 1 1 146,265,156 146,265,156 100.00%
# 100 KB 1 1 146,265,156 146,265,156 100.00%
# 250 KB 1 1 146,265,156 146,265,156 100.00%
# 500 KB 1 1 146,265,156 146,265,156 100.00%
# 1 MB 1 1 146,265,156 146,265,156 100.00%
# 2.5 MB 1 1 146,265,156 146,265,156 100.00%
# 5 MB 1 1 146,265,156 146,265,156 100.00%
# 10 MB 1 1 146,265,156 146,265,156 100.00%
# 25 MB 1 1 146,265,156 146,265,156 100.00%
# 50 MB 1 1 146,265,156 146,265,156 100.00%
# 100 MB 1 1 146,265,156 146,265,156 100.00%
# get haplotype 0 from mutate.sh 38.90
~/bin/seqtk/seqtk seq -l0 GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.gz | \
head -n 2 > GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.hap0.fasta
# get haplotype 1 from mutate.sh 38.90
~/bin/seqtk/seqtk seq -l0 GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.fasta.gz | \
tail -n +3 > GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.hap1.fasta
# get hifiasm's haplotype2 into FASTA
~/bin/gfatools/gfatools gfa2fa hifi-30x.fasta.bp.hap2.p_ctg.gfa \
> hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa
# DipCall 0.2 https://github.com/lh3/dipcall/releases/tag/v0.2
# note had to remove line 100 from dipcall-aux.js
# see https://github.com/lh3/dipcall/issues/1
dipcall.kit/run-dipcall hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa \
hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa \
GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.hap0.fasta \
GCA_016894425.1_ASM1689442v1_genomic.fna_upper.diploid.hap1.fasta \
> hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.mk
# run DipCall
make -j 1 -f hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.mk \
> hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.mk.log 2>&1
# dipsum
dipcall.kit/k8 dipcall.kit/dipcall-aux2.js dipsum \
hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.dip.bed \
hifi-30x.fasta.bp.hap2.p_ctg.defaults.but.r4.fa.dip.vcf.gz
#Length of confident regions: 146265156
# Hom SNP: 29
# Hom INS: 32
# Hom DEL: 58
# Het SNP: 2804524
# Het INS: 67702
# Het DEL: 68184
# Het mixed: 3
#SNP heterozygosity: 0.019174
#Variant heterozygosity: 0.020103
# There should be 2942229 total_Het_variants (SNPs and indels).
# There are 2940413 dipcall_Het_variants, missing 1816 (total_het_variants-dipcall_Het_variants=1816).
# There are 119 Hom variants (errors in hifiasm assembly?).
# Phred quality score for missing Het variants = -10*LOG10(1816/146265156)=49.060
# Phred quality score for wrong? Hom variants = -10*LOG10(119/146265156)=60.896
Not really an issue, just showing some performance metrics of
hifiasm
with a simulated diploid assembly of the T2T chr8 sequence, with simulated PacBio HiFi reads. Used defaulthifiasm
settings except one more round of error correction and-l3
purging.Originally wrote Q30-Q40 when actually the next command makes Q20-Q30 reads
added trio evaluation
The text was updated successfully, but these errors were encountered: