Skip to content

Commit

Permalink
add new demo
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Aug 24, 2022
1 parent d3c62dd commit f8ec8c6
Show file tree
Hide file tree
Showing 40 changed files with 320 additions and 151 deletions.
6 changes: 2 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@
*trace.out

# taxonomy
delnodes.dmp
merged.dmp
names.dmp
nodes.dmp
!demo-profiling/*.dmp
kmcp/*.dmp

# binary and html
kmcp/kmcp*
Expand Down
128 changes: 128 additions & 0 deletions demo-profiling/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Demo of taxonomic profiling

## Dataset

### References

# name.map: one line per reference file, "<id><TAB><name>", without a header.
# The name is the header of the first sequence in the file with its first
# space-delimited field removed.
# NOTE(review): {%..} is presumably rush's placeholder for the file basename
# with its last two extensions (.fa.gz) stripped — confirm against the rush docs.
ls refs/*.gz | rush 'echo -ne "{%..}\t"; seqkit head -n 1 {} | seqkit seq -n | cut -d " " -f 2-' > name.map

# Print a Markdown table summarizing the reference genomes: per-file
# statistics from `seqkit stats` joined with the names stored in name.map.
# The join key `id` is captured with the regular expression "(.+)\.fa"
# (presumably from the file column — confirm the default field of
# `csvtk mutate`); name.map has no header line, so `id,name` is added first.
csvtk join -t -f id \
<(seqkit stats -j 10 refs/*.gz -T -b \
| csvtk mutate -t -n id -p "(.+)\.fa") \
<(csvtk add-header -t -n id,name name.map) \
| csvtk cut -t -f file,num_seqs,sum_len,name \
| csvtk sort -t -k name \
| csvtk csv2md -t
|file |num_seqs|sum_len|name |
|:--------------------|:-------|:------|:---------------------------------------------------------------------------------|
|GCF_009759685.1.fa.gz|2 |3990388|Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome |
|GCF_000392875.1.fa.gz|3 |2881400|Enterococcus faecalis ATCC 19433 acAqW-supercont1.1, whole genome shotgun sequence|
|GCF_001544255.1.fa.gz|38 |2484851|Enterococcus faecium NBRC 100486, whole genome shotgun sequence |
|GCF_003697165.2.fa.gz|2 |5034834|Escherichia coli DSM 30083 = JCM 1649 = ATCC 11775 chromosome, complete genome |
|GCF_000742135.1.fa.gz|5 |5545784|Klebsiella pneumoniae strain ATCC 13883 scaffold1, whole genome shotgun sequence |
|GCF_000017205.1.fa.gz|1 |6588339|Pseudomonas aeruginosa PA7, complete genome |
|GCF_000006945.2.fa.gz|2 |4951383|Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome |
|GCF_002949675.1.fa.gz|2 |4578459|Shigella dysenteriae strain ATCC 13313 chromosome, complete genome |
|GCF_002950215.1.fa.gz|3 |4938295|Shigella flexneri 2a strain ATCC 29903 chromosome, complete genome |
|GCF_001027105.1.fa.gz|2 |2782562|Staphylococcus aureus subsp. aureus DSM 20231 chromosome, complete genome |
|GCF_006742205.1.fa.gz|2 |2427041|Staphylococcus epidermidis NBRC 100911 DNA, complete genome |
|GCF_000148585.2.fa.gz|1 |1868883|Streptococcus mitis NCTC 12261 chromosome, complete genome |
|GCF_001096185.1.fa.gz|24 |2117177|Streptococcus pneumoniae strain SMRU824, whole genome shotgun sequence |

### Taxonomy data

Please download and uncompress [taxdump.tar.gz](ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz),
and then copy `names.dmp`, `nodes.dmp`, `delnodes.dmp` and `merged.dmp` to directory `taxdump`.

Alternatively, create custom taxdump files from `taxonomy.tsv` using [taxonkit create-taxdump](https://bioinf.shenwei.me/taxonkit/usage/#create-taxdump) (v0.12.1 or later is required); the profiling step in this demo reads them from `taxdump-custom/`:

# NOTE(review): -A 1 presumably marks column 1 of taxonomy.tsv as the genome
# accession, which also produces the taxid.map file that `kmcp profile`
# is given via --taxid-map — confirm against the taxonkit documentation.
taxonkit create-taxdump -A 1 taxonomy.tsv -O taxdump-custom/

## Metagenomic Profiling

Building database:

# computing k-mers
# Reads the genomes in refs/ and writes the results for k = 21 to
# refs-k21-n10/ (--force is passed, presumably to overwrite an existing
# output directory).
# NOTE(review): --split-number 10 / --split-overlap 150 presumably split each
# genome into 10 chunks overlapping by 150 bases, and --seq-name-filter
# "plasmid" presumably skips sequences whose names match "plasmid" —
# confirm against `kmcp compute --help`.
kmcp compute \
--in-dir refs/ \
--ref-name-regexp "^([\w\.\_]+\.\d+)" \
--seq-name-filter "plasmid" \
--kmer 21 \
--split-number 10 \
--split-overlap 150 \
--out-dir refs-k21-n10 \
--force

# indexing k-mers
# Builds the database refs-k21-n10.kmcp from the k-mer files in refs-k21-n10/.
# Every continuation backslash is preceded by a space so that an argument can
# never be fused with the option on the following line, whatever the
# indentation of that line.
kmcp index \
--in-dir refs-k21-n10/ \
--num-hash 1 \
--false-positive-rate 0.3 \
--out-dir refs-k21-n10.kmcp \
--force

Generating a mock dataset:

# generating mock dataset
# Each entry is "<assembly accession>:<sampling proportion>". Every genome is
# cut into sliding windows (-W 150, -s 10), shuffled and subsampled with the
# given proportion; the pooled output is shuffled once more and written to
# mock.fastq.gz. Entries are processed in the listed order.
for entry in \
    GCF_003697165.2:0.2 \
    GCF_002949675.1:0.2 \
    GCF_002950215.1:0.2 \
    GCF_000742135.1:0.2 \
    GCF_000006945.2:0.2 \
    GCF_000392875.1:0.02 \
    GCF_001544255.1:0.02 \
    GCF_001027105.1:0.01 \
    GCF_006742205.1:0.01 \
    GCF_000148585.2:0.002 \
    GCF_001096185.1:0.002 \
    GCF_000017205.1:0.001 \
    GCF_009759685.1:0.001
do
    genome=${entry%%:*}
    fraction=${entry##*:}
    seqkit sliding -s 10 -W 150 "refs/${genome}.fa.gz" | seqkit shuffle | seqkit sample -p "${fraction}"
done \
| seqkit shuffle -o mock.fastq.gz

Searching

# searching
# Searches every *.fastq.gz file in the current directory against the
# database and writes the result next to the input as <input>.kmcp.gz.
# "$f" is quoted so that file names containing spaces or glob characters
# are passed to kmcp as a single, unmodified argument.
for f in *.fastq.gz; do
    kmcp search \
        --db-dir refs-k21-n10.kmcp/ \
        --min-query-cov 0.55 \
        "$f" \
        --out-file "$f.kmcp.gz"
done

Profiling

# profiling using mode 1 for low-coverage data
# Profiles every search result (*.kmcp.gz) with the custom taxonomy in
# taxdump-custom/ and writes, next to the input, the KMCP profile, the
# MetaPhlAn- and CAMI-format reports and the binning result.
# "$f" is quoted so that file names containing spaces or glob characters
# are passed to kmcp as a single, unmodified argument.
for f in *.kmcp.gz; do
    kmcp profile \
        --taxid-map taxdump-custom/taxid.map \
        --taxdump taxdump-custom/ \
        "$f" \
        --mode 1 \
        --out-prefix "$f.kmcp.profile" \
        --metaphlan-report "$f.metaphlan.profile" \
        --cami-report "$f.cami.profile" \
        --binning-result "$f.binning.gz"
done

# show the ref, percentage and taxname columns of the profile as a Markdown table
csvtk cut -t -f ref,percentage,taxname mock.fastq.gz.kmcp.gz.kmcp.profile \
| csvtk csv2md -t

|ref |percentage|taxname |
|:--------------|:---------|:-------------------------|
|GCF_003697165.2|23.974382 |Escherichia coli |
|GCF_000742135.1|20.155870 |Klebsiella pneumoniae |
|GCF_000006945.2|19.407535 |Salmonella enterica |
|GCF_002950215.1|15.583846 |Shigella flexneri |
|GCF_002949675.1|14.767599 |Shigella dysenteriae |
|GCF_001544255.1|1.852659 |Enterococcus faecium |
|GCF_000392875.1|1.844333 |Enterococcus faecalis |
|GCF_001027105.1|0.929976 |Staphylococcus aureus |
|GCF_006742205.1|0.923797 |Staphylococcus epidermidis|
|GCF_001096185.1|0.192491 |Streptococcus pneumoniae |
|GCF_000148585.2|0.186310 |Streptococcus mitis |
|GCF_000017205.1|0.091788 |Pseudomonas aeruginosa |
|GCF_009759685.1|0.089414 |Acinetobacter baumannii |
43 changes: 43 additions & 0 deletions demo-profiling/mock.fastq.gz.kmcp.gz.cami.profile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
@SampleID:
@Version:0.10.0
@Ranks:superkingdom|phylum|class|order|family|genus|species|strain
@TaxonomyID:
@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE
609216830 superkingdom 609216830 Bacteria 100.000000
3788559933 phylum 609216830|3788559933 Bacteria|Proteobacteria 94.070434
3642462009 phylum 609216830|3642462009 Bacteria|Firmicutes 5.929566
329474883 class 609216830|3788559933|329474883 Bacteria|Proteobacteria|Gammaproteobacteria 94.070434
1845768359 class 609216830|3642462009|1845768359 Bacteria|Firmicutes|Bacilli 5.929566
3160438580 order 609216830|3788559933|329474883|3160438580 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales 93.889233
185544332 order 609216830|3642462009|1845768359|185544332 Bacteria|Firmicutes|Bacilli|Lactobacillales 4.075793
813944714 order 609216830|3642462009|1845768359|813944714 Bacteria|Firmicutes|Bacilli|Bacillales 1.853773
86398254 order 609216830|3788559933|329474883|86398254 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales 0.091788
2185117029 order 609216830|3788559933|329474883|2185117029 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales 0.089414
2234733759 family 609216830|3788559933|329474883|3160438580|2234733759 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 93.889233
3209851916 family 609216830|3642462009|1845768359|185544332|3209851916 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae 3.696992
1997712377 family 609216830|3642462009|1845768359|813944714|1997712377 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae 1.853773
1255484345 family 609216830|3642462009|1845768359|185544332|1255484345 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae 0.378801
1478401337 family 609216830|3788559933|329474883|86398254|1478401337 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae 0.091788
943158193 family 609216830|3788559933|329474883|2185117029|943158193 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae 0.089414
2258433137 genus 609216830|3788559933|329474883|3160438580|2234733759|2258433137 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella 30.351445
3334977531 genus 609216830|3788559933|329474883|3160438580|2234733759|3334977531 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 23.974382
2440106587 genus 609216830|3788559933|329474883|3160438580|2234733759|2440106587 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Klebsiella 20.155870
794943543 genus 609216830|3788559933|329474883|3160438580|2234733759|794943543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Salmonella 19.407535
602175708 genus 609216830|3642462009|1845768359|185544332|3209851916|602175708 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus 3.696992
1824050977 genus 609216830|3642462009|1845768359|813944714|1997712377|1824050977 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus 1.853773
2394826844 genus 609216830|3642462009|1845768359|185544332|1255484345|2394826844 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus 0.378801
1616653803 genus 609216830|3788559933|329474883|86398254|1478401337|1616653803 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas 0.091788
568178587 genus 609216830|3788559933|329474883|2185117029|943158193|568178587 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae|Acinetobacter 0.089414
4093283224 species 609216830|3788559933|329474883|3160438580|2234733759|3334977531|4093283224 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 23.974382
3958205156 species 609216830|3788559933|329474883|3160438580|2234733759|2440106587|3958205156 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Klebsiella|Klebsiella pneumoniae 20.155870
1678121664 species 609216830|3788559933|329474883|3160438580|2234733759|794943543|1678121664 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Salmonella|Salmonella enterica 19.407535
2695851945 species 609216830|3788559933|329474883|3160438580|2234733759|2258433137|2695851945 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella|Shigella flexneri 15.583846
524994882 species 609216830|3788559933|329474883|3160438580|2234733759|2258433137|524994882 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella|Shigella dysenteriae 14.767599
4145431389 species 609216830|3642462009|1845768359|185544332|3209851916|602175708|4145431389 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus|Enterococcus faecium 1.852659
3809813362 species 609216830|3642462009|1845768359|185544332|3209851916|602175708|3809813362 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus|Enterococcus faecalis 1.844333
1569132721 species 609216830|3642462009|1845768359|813944714|1997712377|1824050977|1569132721 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus|Staphylococcus aureus 0.929976
1920251658 species 609216830|3642462009|1845768359|813944714|1997712377|1824050977|1920251658 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus|Staphylococcus epidermidis 0.923797
2983929374 species 609216830|3642462009|1845768359|185544332|1255484345|2394826844|2983929374 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus|Streptococcus pneumoniae 0.192491
1527235303 species 609216830|3642462009|1845768359|185544332|1255484345|2394826844|1527235303 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus|Streptococcus mitis 0.186310
3843752343 species 609216830|3788559933|329474883|86398254|1478401337|1616653803|3843752343 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas|Pseudomonas aeruginosa 0.091788
72054943 species 609216830|3788559933|329474883|2185117029|943158193|568178587|72054943 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae|Acinetobacter|Acinetobacter baumannii 0.089414
14 changes: 14 additions & 0 deletions demo-profiling/mock.fastq.gz.kmcp.gz.kmcp.profile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
ref percentage coverage score chunksFrac chunksRelDepth chunksRelDepthStd reads ureads hicureads refsize refname taxid rank taxname taxpath taxpathsn
GCF_003697165.2 23.974382 3.82 100.00 1.00 0.97;0.98;0.98;0.99;1.01;1.04;1.03;0.98;1.06;0.96 0.03 124850 28495 28481 4903501 4093283224 species Escherichia coli Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli 609216830;3788559933;329474883;3160438580;2234733759;3334977531;4093283224
GCF_000742135.1 20.155870 3.21 100.00 1.00 0.98;0.99;1.00;0.97;0.94;1.08;0.98;0.95;0.99;1.11 0.05 118715 108213 107840 5545864 3958205156 species Klebsiella pneumoniae Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Klebsiella;Klebsiella pneumoniae 609216830;3788559933;329474883;3160438580;2234733759;2440106587;3958205156
GCF_000006945.2 19.407535 3.09 100.00 1.00 1.00;0.98;0.99;0.99;0.99;1.01;0.98;1.04;1.00;1.03 0.02 100118 93635 93634 4857450 1678121664 species Salmonella enterica Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Salmonella;Salmonella enterica 609216830;3788559933;329474883;3160438580;2234733759;794943543;1678121664
GCF_002950215.1 15.583846 2.48 100.00 1.00 0.95;0.99;1.01;1.01;1.02;0.97;0.99;0.97;1.03;1.05 0.03 77116 15037 14921 4659463 2695851945 species Shigella flexneri Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Shigella;Shigella flexneri 609216830;3788559933;329474883;3160438580;2234733759;2258433137;2695851945
GCF_002949675.1 14.767599 2.35 100.00 1.00 1.00;0.93;1.05;1.01;0.97;1.00;0.98;0.99;1.04;1.02 0.03 68941 14176 14145 4395762 524994882 species Shigella dysenteriae Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Shigella;Shigella dysenteriae 609216830;3788559933;329474883;3160438580;2234733759;2258433137;524994882
GCF_001544255.1 1.852659 0.30 100.00 1.00 0.93;0.95;0.94;1.05;1.00;0.97;1.07;0.95;1.08;1.05 0.06 4891 4840 4840 2485591 4145431389 species Enterococcus faecium Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus;Enterococcus faecium 609216830;3642462009;1845768359;185544332;3209851916;602175708;4145431389
GCF_000392875.1 1.844333 0.29 100.00 1.00 0.95;0.92;0.95;1.09;0.99;1.08;0.94;1.08;0.99;1.01 0.06 5644 5575 5575 2881440 3809813362 species Enterococcus faecalis Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus;Enterococcus faecalis 609216830;3642462009;1845768359;185544332;3209851916;602175708;3809813362
GCF_001027105.1 0.929976 0.15 100.00 1.00 0.92;0.90;0.95;1.07;1.00;1.05;1.00;1.10;1.05;0.96 0.06 2721 2677 2677 2755072 1569132721 species Staphylococcus aureus Bacteria;Firmicutes;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus;Staphylococcus aureus 609216830;3642462009;1845768359;813944714;1997712377;1824050977;1569132721
GCF_006742205.1 0.923797 0.15 100.00 1.00 0.92;0.92;0.93;1.01;1.04;0.93;1.13;1.01;1.10;1.01 0.07 2377 2338 2338 2422602 1920251658 species Staphylococcus epidermidis Bacteria;Firmicutes;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus;Staphylococcus epidermidis 609216830;3642462009;1845768359;813944714;1997712377;1824050977;1920251658
GCF_001096185.1 0.192491 0.03 100.00 1.00 0.91;0.74;1.14;0.99;1.05;0.88;1.16;1.11;1.10;0.92 0.13 433 314 314 2117637 2983929374 species Streptococcus pneumoniae Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus;Streptococcus pneumoniae 609216830;3642462009;1845768359;185544332;1255484345;2394826844;2983929374
GCF_000148585.2 0.186310 0.03 100.00 1.00 0.83;0.88;1.00;1.16;0.86;1.01;0.83;1.15;1.31;0.98 0.15 370 268 266 1868883 1527235303 species Streptococcus mitis Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus;Streptococcus mitis 609216830;3642462009;1845768359;185544332;1255484345;2394826844;1527235303
GCF_000017205.1 0.091788 0.01 100.00 1.00 1.11;0.92;1.11;1.05;1.00;0.91;1.00;1.22;0.69;0.99 0.14 642 640 640 6588339 3843752343 species Pseudomonas aeruginosa Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas aeruginosa 609216830;3788559933;329474883;86398254;1478401337;1616653803;3843752343
GCF_009759685.1 0.089414 0.01 100.00 1.00 1.06;1.38;0.93;1.11;0.87;1.01;0.78;0.95;0.66;1.24 0.20 378 370 370 3980848 72054943 species Acinetobacter baumannii Bacteria;Proteobacteria;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter baumannii 609216830;3788559933;329474883;2185117029;943158193;568178587;72054943
Loading

0 comments on commit f8ec8c6

Please sign in to comment.