-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d3c62dd
commit f8ec8c6
Showing
40 changed files
with
320 additions
and
151 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Demo of taxonomic profiling | ||
|
||
## Dataset | ||
|
||
### References | ||
|
||
# name.map | ||
ls refs/*.gz | rush 'echo -ne "{%..}\t"; seqkit head -n 1 {} | seqkit seq -n | cut -d " " -f 2-' > name.map | ||
|
||
csvtk join -t -f id \ | ||
<(seqkit stats -j 10 refs/*.gz -T -b \ | ||
| csvtk mutate -t -n id -p "(.+)\.fa") \ | ||
<(csvtk add-header -t -n id,name name.map) \ | ||
| csvtk cut -t -f file,num_seqs,sum_len,name \ | ||
| csvtk sort -t -k name \ | ||
| csvtk csv2md -t | ||
|file |num_seqs|sum_len|name | | ||
|:--------------------|:-------|:------|:---------------------------------------------------------------------------------| | ||
|GCF_009759685.1.fa.gz|2 |3990388|Acinetobacter baumannii strain ATCC 19606 chromosome, complete genome | | ||
|GCF_000392875.1.fa.gz|3 |2881400|Enterococcus faecalis ATCC 19433 acAqW-supercont1.1, whole genome shotgun sequence| | ||
|GCF_001544255.1.fa.gz|38 |2484851|Enterococcus faecium NBRC 100486, whole genome shotgun sequence | | ||
|GCF_003697165.2.fa.gz|2 |5034834|Escherichia coli DSM 30083 = JCM 1649 = ATCC 11775 chromosome, complete genome | | ||
|GCF_000742135.1.fa.gz|5 |5545784|Klebsiella pneumoniae strain ATCC 13883 scaffold1, whole genome shotgun sequence | | ||
|GCF_000017205.1.fa.gz|1 |6588339|Pseudomonas aeruginosa PA7, complete genome | | ||
|GCF_000006945.2.fa.gz|2 |4951383|Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome | | ||
|GCF_002949675.1.fa.gz|2 |4578459|Shigella dysenteriae strain ATCC 13313 chromosome, complete genome | | ||
|GCF_002950215.1.fa.gz|3 |4938295|Shigella flexneri 2a strain ATCC 29903 chromosome, complete genome | | ||
|GCF_001027105.1.fa.gz|2 |2782562|Staphylococcus aureus subsp. aureus DSM 20231 chromosome, complete genome | | ||
|GCF_006742205.1.fa.gz|2 |2427041|Staphylococcus epidermidis NBRC 100911 DNA, complete genome | | ||
|GCF_000148585.2.fa.gz|1 |1868883|Streptococcus mitis NCTC 12261 chromosome, complete genome | | ||
|GCF_001096185.1.fa.gz|24 |2117177|Streptococcus pneumoniae strain SMRU824, whole genome shotgun sequence | | ||
|
||
### Taxonomy data | ||
|
||
Please download and uncompress [taxdump.tar.gz](ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz), | ||
then copy `names.dmp`, `nodes.dmp`, `delnodes.dmp` and `merged.dmp` to a directory named `taxdump`. | ||
|
||
Alternatively, create custom taxdump files from `taxonomy.tsv` using [taxonkit create-taxdump](https://bioinf.shenwei.me/taxonkit/usage/#create-taxdump) (v0.12.1 or later is required); the profiling step below reads them from `taxdump-custom/`: | ||
|
||
taxonkit create-taxdump -A 1 taxonomy.tsv -O taxdump-custom/ | ||
|
||
## Metagenomic Profiling | ||
|
||
Building database: | ||
|
||
# computing k-mers | ||
kmcp compute \ | ||
--in-dir refs/ \ | ||
--ref-name-regexp "^([\w\.\_]+\.\d+)" \ | ||
--seq-name-filter "plasmid" \ | ||
--kmer 21 \ | ||
--split-number 10 \ | ||
--split-overlap 150 \ | ||
--out-dir refs-k21-n10 \ | ||
--force | ||
|
||
# indexing k-mers | ||
kmcp index \ | ||
--in-dir refs-k21-n10/ \ | ||
--num-hash 1 \ | ||
--false-positive-rate 0.3 \ | ||
--out-dir refs-k21-n10.kmcp \ | ||
--force | ||
|
||
Generating a mock dataset: | ||
|
||
# generating mock dataset | ||
(seqkit sliding -s 10 -W 150 refs/GCF_003697165.2.fa.gz | seqkit shuffle | seqkit sample -p 0.2 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_002949675.1.fa.gz | seqkit shuffle | seqkit sample -p 0.2 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_002950215.1.fa.gz | seqkit shuffle | seqkit sample -p 0.2 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_000742135.1.fa.gz | seqkit shuffle | seqkit sample -p 0.2 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_000006945.2.fa.gz | seqkit shuffle | seqkit sample -p 0.2 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_000392875.1.fa.gz | seqkit shuffle | seqkit sample -p 0.02 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_001544255.1.fa.gz | seqkit shuffle | seqkit sample -p 0.02 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_001027105.1.fa.gz | seqkit shuffle | seqkit sample -p 0.01 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_006742205.1.fa.gz | seqkit shuffle | seqkit sample -p 0.01 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_000148585.2.fa.gz | seqkit shuffle | seqkit sample -p 0.002 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_001096185.1.fa.gz | seqkit shuffle | seqkit sample -p 0.002 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_000017205.1.fa.gz | seqkit shuffle | seqkit sample -p 0.001 ; \ | ||
seqkit sliding -s 10 -W 150 refs/GCF_009759685.1.fa.gz | seqkit shuffle | seqkit sample -p 0.001 ) \ | ||
| seqkit shuffle -o mock.fastq.gz | ||
|
||
Searching: | ||
|
||
# searching | ||
for f in *.fastq.gz; do | ||
kmcp search \ | ||
--db-dir refs-k21-n10.kmcp/ \ | ||
--min-query-cov 0.55 \ | ||
$f \ | ||
--out-file $f.kmcp.gz | ||
done | ||
|
||
Profiling: | ||
|
||
# profiling using mode 1 for low coverage data | ||
for f in *.kmcp.gz; do | ||
kmcp profile \ | ||
--taxid-map taxdump-custom/taxid.map \ | ||
--taxdump taxdump-custom/ \ | ||
$f \ | ||
--mode 1 \ | ||
--out-prefix $f.kmcp.profile \ | ||
--metaphlan-report $f.metaphlan.profile \ | ||
--cami-report $f.cami.profile \ | ||
--binning-result $f.binning.gz | ||
done | ||
|
||
cat mock.fastq.gz.kmcp.gz.kmcp.profile \ | ||
| csvtk cut -t -f ref,percentage,taxname \ | ||
| csvtk csv2md -t | ||
|
||
|ref |percentage|taxname | | ||
|:--------------|:---------|:-------------------------| | ||
|GCF_003697165.2|23.974382 |Escherichia coli | | ||
|GCF_000742135.1|20.155870 |Klebsiella pneumoniae | | ||
|GCF_000006945.2|19.407535 |Salmonella enterica | | ||
|GCF_002950215.1|15.583846 |Shigella flexneri | | ||
|GCF_002949675.1|14.767599 |Shigella dysenteriae | | ||
|GCF_001544255.1|1.852659 |Enterococcus faecium | | ||
|GCF_000392875.1|1.844333 |Enterococcus faecalis | | ||
|GCF_001027105.1|0.929976 |Staphylococcus aureus | | ||
|GCF_006742205.1|0.923797 |Staphylococcus epidermidis| | ||
|GCF_001096185.1|0.192491 |Streptococcus pneumoniae | | ||
|GCF_000148585.2|0.186310 |Streptococcus mitis | | ||
|GCF_000017205.1|0.091788 |Pseudomonas aeruginosa | | ||
|GCF_009759685.1|0.089414 |Acinetobacter baumannii | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
@SampleID: | ||
@Version:0.10.0 | ||
@Ranks:superkingdom|phylum|class|order|family|genus|species|strain | ||
@TaxonomyID: | ||
@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE | ||
609216830 superkingdom 609216830 Bacteria 100.000000 | ||
3788559933 phylum 609216830|3788559933 Bacteria|Proteobacteria 94.070434 | ||
3642462009 phylum 609216830|3642462009 Bacteria|Firmicutes 5.929566 | ||
329474883 class 609216830|3788559933|329474883 Bacteria|Proteobacteria|Gammaproteobacteria 94.070434 | ||
1845768359 class 609216830|3642462009|1845768359 Bacteria|Firmicutes|Bacilli 5.929566 | ||
3160438580 order 609216830|3788559933|329474883|3160438580 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales 93.889233 | ||
185544332 order 609216830|3642462009|1845768359|185544332 Bacteria|Firmicutes|Bacilli|Lactobacillales 4.075793 | ||
813944714 order 609216830|3642462009|1845768359|813944714 Bacteria|Firmicutes|Bacilli|Bacillales 1.853773 | ||
86398254 order 609216830|3788559933|329474883|86398254 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales 0.091788 | ||
2185117029 order 609216830|3788559933|329474883|2185117029 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales 0.089414 | ||
2234733759 family 609216830|3788559933|329474883|3160438580|2234733759 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 93.889233 | ||
3209851916 family 609216830|3642462009|1845768359|185544332|3209851916 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae 3.696992 | ||
1997712377 family 609216830|3642462009|1845768359|813944714|1997712377 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae 1.853773 | ||
1255484345 family 609216830|3642462009|1845768359|185544332|1255484345 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae 0.378801 | ||
1478401337 family 609216830|3788559933|329474883|86398254|1478401337 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae 0.091788 | ||
943158193 family 609216830|3788559933|329474883|2185117029|943158193 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae 0.089414 | ||
2258433137 genus 609216830|3788559933|329474883|3160438580|2234733759|2258433137 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella 30.351445 | ||
3334977531 genus 609216830|3788559933|329474883|3160438580|2234733759|3334977531 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 23.974382 | ||
2440106587 genus 609216830|3788559933|329474883|3160438580|2234733759|2440106587 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Klebsiella 20.155870 | ||
794943543 genus 609216830|3788559933|329474883|3160438580|2234733759|794943543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Salmonella 19.407535 | ||
602175708 genus 609216830|3642462009|1845768359|185544332|3209851916|602175708 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus 3.696992 | ||
1824050977 genus 609216830|3642462009|1845768359|813944714|1997712377|1824050977 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus 1.853773 | ||
2394826844 genus 609216830|3642462009|1845768359|185544332|1255484345|2394826844 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus 0.378801 | ||
1616653803 genus 609216830|3788559933|329474883|86398254|1478401337|1616653803 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas 0.091788 | ||
568178587 genus 609216830|3788559933|329474883|2185117029|943158193|568178587 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae|Acinetobacter 0.089414 | ||
4093283224 species 609216830|3788559933|329474883|3160438580|2234733759|3334977531|4093283224 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 23.974382 | ||
3958205156 species 609216830|3788559933|329474883|3160438580|2234733759|2440106587|3958205156 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Klebsiella|Klebsiella pneumoniae 20.155870 | ||
1678121664 species 609216830|3788559933|329474883|3160438580|2234733759|794943543|1678121664 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Salmonella|Salmonella enterica 19.407535 | ||
2695851945 species 609216830|3788559933|329474883|3160438580|2234733759|2258433137|2695851945 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella|Shigella flexneri 15.583846 | ||
524994882 species 609216830|3788559933|329474883|3160438580|2234733759|2258433137|524994882 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Shigella|Shigella dysenteriae 14.767599 | ||
4145431389 species 609216830|3642462009|1845768359|185544332|3209851916|602175708|4145431389 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus|Enterococcus faecium 1.852659 | ||
3809813362 species 609216830|3642462009|1845768359|185544332|3209851916|602175708|3809813362 Bacteria|Firmicutes|Bacilli|Lactobacillales|Enterococcaceae|Enterococcus|Enterococcus faecalis 1.844333 | ||
1569132721 species 609216830|3642462009|1845768359|813944714|1997712377|1824050977|1569132721 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus|Staphylococcus aureus 0.929976 | ||
1920251658 species 609216830|3642462009|1845768359|813944714|1997712377|1824050977|1920251658 Bacteria|Firmicutes|Bacilli|Bacillales|Staphylococcaceae|Staphylococcus|Staphylococcus epidermidis 0.923797 | ||
2983929374 species 609216830|3642462009|1845768359|185544332|1255484345|2394826844|2983929374 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus|Streptococcus pneumoniae 0.192491 | ||
1527235303 species 609216830|3642462009|1845768359|185544332|1255484345|2394826844|1527235303 Bacteria|Firmicutes|Bacilli|Lactobacillales|Streptococcaceae|Streptococcus|Streptococcus mitis 0.186310 | ||
3843752343 species 609216830|3788559933|329474883|86398254|1478401337|1616653803|3843752343 Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas|Pseudomonas aeruginosa 0.091788 | ||
72054943 species 609216830|3788559933|329474883|2185117029|943158193|568178587|72054943 Bacteria|Proteobacteria|Gammaproteobacteria|Moraxellales|Moraxellaceae|Acinetobacter|Acinetobacter baumannii 0.089414 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
ref percentage coverage score chunksFrac chunksRelDepth chunksRelDepthStd reads ureads hicureads refsize refname taxid rank taxname taxpath taxpathsn | ||
GCF_003697165.2 23.974382 3.82 100.00 1.00 0.97;0.98;0.98;0.99;1.01;1.04;1.03;0.98;1.06;0.96 0.03 124850 28495 28481 4903501 4093283224 species Escherichia coli Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli 609216830;3788559933;329474883;3160438580;2234733759;3334977531;4093283224 | ||
GCF_000742135.1 20.155870 3.21 100.00 1.00 0.98;0.99;1.00;0.97;0.94;1.08;0.98;0.95;0.99;1.11 0.05 118715 108213 107840 5545864 3958205156 species Klebsiella pneumoniae Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Klebsiella;Klebsiella pneumoniae 609216830;3788559933;329474883;3160438580;2234733759;2440106587;3958205156 | ||
GCF_000006945.2 19.407535 3.09 100.00 1.00 1.00;0.98;0.99;0.99;0.99;1.01;0.98;1.04;1.00;1.03 0.02 100118 93635 93634 4857450 1678121664 species Salmonella enterica Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Salmonella;Salmonella enterica 609216830;3788559933;329474883;3160438580;2234733759;794943543;1678121664 | ||
GCF_002950215.1 15.583846 2.48 100.00 1.00 0.95;0.99;1.01;1.01;1.02;0.97;0.99;0.97;1.03;1.05 0.03 77116 15037 14921 4659463 2695851945 species Shigella flexneri Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Shigella;Shigella flexneri 609216830;3788559933;329474883;3160438580;2234733759;2258433137;2695851945 | ||
GCF_002949675.1 14.767599 2.35 100.00 1.00 1.00;0.93;1.05;1.01;0.97;1.00;0.98;0.99;1.04;1.02 0.03 68941 14176 14145 4395762 524994882 species Shigella dysenteriae Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Shigella;Shigella dysenteriae 609216830;3788559933;329474883;3160438580;2234733759;2258433137;524994882 | ||
GCF_001544255.1 1.852659 0.30 100.00 1.00 0.93;0.95;0.94;1.05;1.00;0.97;1.07;0.95;1.08;1.05 0.06 4891 4840 4840 2485591 4145431389 species Enterococcus faecium Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus;Enterococcus faecium 609216830;3642462009;1845768359;185544332;3209851916;602175708;4145431389 | ||
GCF_000392875.1 1.844333 0.29 100.00 1.00 0.95;0.92;0.95;1.09;0.99;1.08;0.94;1.08;0.99;1.01 0.06 5644 5575 5575 2881440 3809813362 species Enterococcus faecalis Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus;Enterococcus faecalis 609216830;3642462009;1845768359;185544332;3209851916;602175708;3809813362 | ||
GCF_001027105.1 0.929976 0.15 100.00 1.00 0.92;0.90;0.95;1.07;1.00;1.05;1.00;1.10;1.05;0.96 0.06 2721 2677 2677 2755072 1569132721 species Staphylococcus aureus Bacteria;Firmicutes;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus;Staphylococcus aureus 609216830;3642462009;1845768359;813944714;1997712377;1824050977;1569132721 | ||
GCF_006742205.1 0.923797 0.15 100.00 1.00 0.92;0.92;0.93;1.01;1.04;0.93;1.13;1.01;1.10;1.01 0.07 2377 2338 2338 2422602 1920251658 species Staphylococcus epidermidis Bacteria;Firmicutes;Bacilli;Bacillales;Staphylococcaceae;Staphylococcus;Staphylococcus epidermidis 609216830;3642462009;1845768359;813944714;1997712377;1824050977;1920251658 | ||
GCF_001096185.1 0.192491 0.03 100.00 1.00 0.91;0.74;1.14;0.99;1.05;0.88;1.16;1.11;1.10;0.92 0.13 433 314 314 2117637 2983929374 species Streptococcus pneumoniae Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus;Streptococcus pneumoniae 609216830;3642462009;1845768359;185544332;1255484345;2394826844;2983929374 | ||
GCF_000148585.2 0.186310 0.03 100.00 1.00 0.83;0.88;1.00;1.16;0.86;1.01;0.83;1.15;1.31;0.98 0.15 370 268 266 1868883 1527235303 species Streptococcus mitis Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus;Streptococcus mitis 609216830;3642462009;1845768359;185544332;1255484345;2394826844;1527235303 | ||
GCF_000017205.1 0.091788 0.01 100.00 1.00 1.11;0.92;1.11;1.05;1.00;0.91;1.00;1.22;0.69;0.99 0.14 642 640 640 6588339 3843752343 species Pseudomonas aeruginosa Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas aeruginosa 609216830;3788559933;329474883;86398254;1478401337;1616653803;3843752343 | ||
GCF_009759685.1 0.089414 0.01 100.00 1.00 1.06;1.38;0.93;1.11;0.87;1.01;0.78;0.95;0.66;1.24 0.20 378 370 370 3980848 72054943 species Acinetobacter baumannii Bacteria;Proteobacteria;Gammaproteobacteria;Moraxellales;Moraxellaceae;Acinetobacter;Acinetobacter baumannii 609216830;3788559933;329474883;2185117029;943158193;568178587;72054943 |
Oops, something went wrong.