add paper draft

PapenfussLab · May 21, 2024 · 5a483dd · 5a483dd
1 parent ef6aacb
commit 5a483dd
Show file tree

Hide file tree

Showing 10 changed files with 467 additions and 2 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -7,3 +7,9 @@ jobs:
       - uses: nixbuild/nix-quick-install-action@v28
       - uses: actions/checkout@v4
       - run: nix build
+  paper:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: nixbuild/nix-quick-install-action@v28
+      - uses: actions/checkout@v4
+      - run: nix build .#paper
diff --git a/flake.lock b/flake.lock
diff --git a/flake.nix b/flake.nix
@@ -4,10 +4,16 @@
     url = "github:cloudflare/zlib";
     flake = false;
   };
+  inputs.inara = {
+    url = "github:openjournals/inara";
+    flake = false;
+  };
+
   outputs = {
     self,
     nixpkgs,
     cloudflareZlib,
+    inara,
   }: let
     system = "x86_64-linux";
     pkgs = import nixpkgs {
@@ -19,7 +25,13 @@
       ];
     };
   in {
-    packages.${system}.default = import ./. {inherit pkgs cloudflareZlib;};
-    devShells.${system}.default = self.packages.${system}.default.env;
+    packages.${system} = {
+      default = import ./. {inherit pkgs cloudflareZlib;};
+      paper = pkgs.callPackage ./paper/paper.nix {inherit inara;};
+    };
+    devShells.${system} = {
+      default = self.packages.${system}.default.env;
+      paper = self.packages.${system}.paper.env;
+    };
   };
 }
diff --git a/paper/paper.md b/paper/paper.md
@@ -0,0 +1,60 @@
+---
+title: 'Dedumi: approximate deduplication of unaligned reads using unique molecular identifiers'
+tags:
+   - Haskell
+   - bioinformatics
+   - genomics
+authors:
+   - name: Justin Bedő
+     affiliation: "1, 2"
+affiliations:
+   - name: The Walter and Eliza Hall Institute of Medical Research
+     index: 1
+   - name: The University of Melbourne
+     index: 2
+bibliography: references.bib
+---
+
+# Summary
+
+Dedumi is an application for approximately deduplicating unaligned reads with unique molecular identifiers (UMIs).
+Deduplication is performed using a Cuckoo filter, and hence the probability of incorrectly filtering a unique read is bounded.
+By virtue of operating on unaligned reads, there are significant computational savings as duplicate reads are removed prior to alignment.
+
+
+# Statement of Need
+
+Short read sequencing provides cost-effective DNA sequencing and forms a fundamental technique in many biological assays and studies.
+The technique relies on randomly sequencing short fragments of DNA forming a *library*, with much subsequent analysis depending on counts of how frequently certain sequences are observed.
+The formation of a library typically requires amplification of small amounts of DNA through polymerase chain reaction (PCR) prior to sequencing [@Garber2009].
+Unfortunately, PCR can preferentially amplify certain strands of DNA [@Aird2011], which leads to a bias during analysis.
+One method to reduce this bias is to introduce unique molecular identifiers (UMIs) prior to amplification to uniquely identify DNA fragments after amplification.
+These UMIs can then be used to remove fragments (deduplicate) that have been repeatedly sequenced, thus mitigating the bias introduced by PCR.
+
+
+There are several existing approaches to deduplication based on UMIs, most of which operate post-alignment: reads are first aligned to a reference genome and then duplicate reads mapping to the same position are collapsed to a consensus sequence.
+This is the approach taken by UMI-tools [@Smith2017], and many variants of the core approach exist [@Liu2019].
+The downside to an alignment approach is that there is no computational cost reduction at the alignment stage: all reads, regardless of whether they are duplicates, are aligned to the genome.
+
+In contrast, unaligned deduplication approaches reduce the computational cost of alignment as the duplicate reads can be removed prior to alignment.
+There are some existing methods that deduplicate unaligned reads based on UMIs [@UMIc]; however, these are resource intensive: as reported by @UMIc, 1M reads required 2.2 hours to process.
+These resource requirements prevent use in sequencing projects where many millions of reads need to be deduplicated.
+
+Dedumi takes an alternative approach and approximately deduplicates unaligned reads.
+UMIs are extracted from reads in a stream, with a Cuckoo filter [@Fan2014] used to detect duplicates.
+A similar approach has recently been implemented in fastp [@Chen2023]; however, unlike dedumi, fastp uses a Bloom filter [@Bloom1970], which does not guarantee an upper bound on the error rate.
+The error rate of the Cuckoo filter is bounded, with memory requirements determined by the chosen error rate.
+
+On synthetic 150 bp paired-end data generated with a duplication rate of 10% and a base substitution error rate of 0.087% to match the Illumina HiSeq X Ten [@Stoler2021],
+dedumi processes 1M reads in 6.852s^[all results are the median of 11 different simulated datasets] on an Intel i7-14700F using a single core.
+A total of 903,012 reads were retained (9.7% duplication rate), closely matching the expected 10% duplication rate.
+In comparison, fastp processes 1M reads in 4.259s (multiple threads, 10.667s user+sys) and retains 930,862 reads (6.9% duplication rate).
+Dedumi therefore uses 36% less CPU time than fastp (6.852s versus 10.667s user+sys) and more closely matches the expected duplication rate.
+
+Dedumi is open source and published under the BSD 3-Clause licence.
+
+# Acknowledgements
+
+We thank Anthony Papenfuss (A.T.P.) for supporting this work and providing feedback on the manuscript.
+J.B. was supported by the Stafford Fox Medical Research Foundation, Movember Foundation, and by funding to A.T.P. from a National Health and Medical Research Council (NHMRC) Investigator Grant (2026643).
+The research benefitted from support from the Victorian State Government Operational Infrastructure Support and Australian Government NHMRC Independent Research Institute Infrastructure Support.
diff --git a/paper/paper.nix b/paper/paper.nix
@@ -0,0 +1,50 @@
+{
+  stdenvNoCC,
+  inara,
+  pandoc,
+  texlive,
+  hack-font,
+}:
+stdenvNoCC.mkDerivation {
+  name = "paper";
+  src = inara;
+  buildInputs = [
+    pandoc
+    (texlive.combine {
+      inherit
+        (texlive)
+        scheme-basic
+        latexmk
+        marginnote
+        xcolor
+        preprint
+        etoolbox
+        titlesec
+        pgf
+        hyperxmp
+        ifmtarg
+        luacode
+        luatexbase
+        caption
+        orcidlink
+        tcolorbox
+        environ
+        seqsplit
+        xstring
+        float
+        fontspec
+        fontsetup
+        unicode-math
+        lualatex-math
+        newcomputermodern
+        selnolig
+        ;
+    })
+  ];
+  buildPhase = ''
+    export HOME=$(mktemp -d)
+    make ARTICLE=${./.}/paper.md
+  '';
+  installPhase = "cp -r publishing-artifacts $out";
+  OSFONTDIR = hack-font;
+}
diff --git a/paper/references.bib b/paper/references.bib
@@ -0,0 +1,118 @@
+@article{UMIc,
+   incitefulid = {W3171694691},
+   title = {UMIc: A Preprocessing Method for UMI Deduplication and Reads Correction},
+   url = {https://doi.org/10.3389/fgene.2021.660366},
+   DOI = {10.3389/fgene.2021.660366},
+   author = {Maria Tsagiopoulou and Maria Christina Maniou and Nikolaos Pechlivanis and Anastasis Togkousidis and Michaela Kotrová and Tobias Hutzenlaub and Ilias Kappas and Anastasia Chatzidimitriou and Fotis Psomopoulos},
+   year = {2021},
+   journal = {Frontiers in Genetics},
+   keyword = {inciteful.xyz}
+}
+@article{Liu2019,
+   incitefulid = {W2996575833},
+   title = {Algorithms for efficiently collapsing reads with Unique Molecular Identifiers},
+   url = {https://doi.org/10.7717/peerj.8275},
+   DOI = {10.7717/peerj.8275},
+   author = {Daniel Liu},
+   year = {2019},
+   journal = {PeerJ},
+   keyword = {inciteful.xyz}
+}
+@article{W4283398238,
+   incitefulid = {W4283398238},
+   title = {TrieDedup: A fast trie-based deduplication algorithm to handle ambiguous bases in high-throughput sequencing},
+   url = {https://doi.org/10.1101/2022.02.20.481170},
+   DOI = {10.1101/2022.02.20.481170},
+   author = {Jian Hu and Shunlong Luo and Ming Tian and Frederick W. Alt and Adam Yongxin Ye},
+   year = {2022},
+   journal = {bioRxiv (Cold Spring Harbor Laboratory)},
+   keyword = {inciteful.xyz}
+}
+@article{Smith2017,
+   incitefulid = {W2952109315},
+   title = {UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy},
+   url = {https://doi.org/10.1101/gr.209601.116},
+   DOI = {10.1101/gr.209601.116},
+   author = {Tom Smith and Andreas Heger and Ian Sudbery},
+   year = {2017},
+   journal = {Genome Research},
+   keyword = {inciteful.xyz}
+}
+@article{Chen2023,
+  title = {Ultrafast one‐pass FASTQ data preprocessing,  quality control,  and deduplication using fastp},
+  volume = {2},
+  ISSN = {2770-596X},
+  url = {http://dx.doi.org/10.1002/imt2.107},
+  DOI = {10.1002/imt2.107},
+  number = {2},
+  journal = {iMeta},
+  publisher = {Wiley},
+  author = {Chen,  Shifu},
+  year = {2023},
+  month = may 
+}
+@article{Stoler2021,
+  title = {Sequencing error profiles of Illumina sequencing instruments},
+  volume = {3},
+  ISSN = {2631-9268},
+  url = {http://dx.doi.org/10.1093/nargab/lqab019},
+  DOI = {10.1093/nargab/lqab019},
+  number = {1},
+  journal = {NAR Genomics and Bioinformatics},
+  publisher = {Oxford University Press (OUP)},
+  author = {Stoler,  Nicholas and Nekrutenko,  Anton},
+  year = {2021},
+  month = jan 
+}
+@article{Aird2011,
+  title = {Analyzing and minimizing PCR amplification bias in Illumina sequencing libraries},
+  volume = {12},
+  ISSN = {1465-6906},
+  url = {http://dx.doi.org/10.1186/gb-2011-12-2-r18},
+  DOI = {10.1186/gb-2011-12-2-r18},
+  number = {2},
+  journal = {Genome Biology},
+  publisher = {Springer Science and Business Media LLC},
+  author = {Aird,  Daniel and Ross,  Michael G and Chen,  Wei-Sheng and Danielsson,  Maxwell and Fennell,  Timothy and Russ,  Carsten and Jaffe,  David B and Nusbaum,  Chad and Gnirke,  Andreas},
+  year = {2011},
+  pages = {R18}
+}
+@article{Garber2009,
+  title = {Closing gaps in the human genome using sequencing by synthesis},
+  volume = {10},
+  ISSN = {1465-6906},
+  url = {http://dx.doi.org/10.1186/gb-2009-10-6-r60},
+  DOI = {10.1186/gb-2009-10-6-r60},
+  number = {6},
+  journal = {Genome Biology},
+  publisher = {Springer Science and Business Media LLC},
+  author = {Garber,  Manuel and Zody,  Michael C and Arachchi,  Harindra M and Berlin,  Aaron and Gnerre,  Sante and Green,  Lisa M and Lennon,  Niall and Nusbaum,  Chad},
+  year = {2009},
+  pages = {R60}
+}
+@inproceedings{Fan2014,
+  series = {CoNEXT ’14},
+  title = {Cuckoo Filter: Practically Better Than Bloom},
+  url = {http://dx.doi.org/10.1145/2674005.2674994},
+  DOI = {10.1145/2674005.2674994},
+  booktitle = {Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies},
+  publisher = {ACM},
+  author = {Fan,  Bin and Andersen,  Dave G. and Kaminsky,  Michael and Mitzenmacher,  Michael D.},
+  year = {2014},
+  month = dec,
+  collection = {CoNEXT ’14}
+}
+@article{Bloom1970,
+  title = {Space/time trade-offs in hash coding with allowable errors},
+  volume = {13},
+  ISSN = {1557-7317},
+  url = {http://dx.doi.org/10.1145/362686.362692},
+  DOI = {10.1145/362686.362692},
+  number = {7},
+  journal = {Communications of the ACM},
+  publisher = {Association for Computing Machinery (ACM)},
+  author = {Bloom,  Burton H.},
+  year = {1970},
+  month = jul,
+  pages = {422–426}
+}
diff --git a/paper/src/simulate-reads/flake.lock b/paper/src/simulate-reads/flake.lock
diff --git a/paper/src/simulate-reads/flake.nix b/paper/src/simulate-reads/flake.nix
@@ -0,0 +1,9 @@
+{
+  inputs.nixpkgs.url = "github:nixos/nixpkgs";
+  outputs = {self, nixpkgs}:
+    let system = "x86_64-linux";
+        pkgs = import nixpkgs {inherit system;};
+    in
+      {packages.${system}.default = pkgs.haskellPackages.callCabal2nix "simulate-reads" ./. {};
+       devShells.${system}.default = self.packages.${system}.default.env;} ;
+}
diff --git a/paper/src/simulate-reads/package.yaml b/paper/src/simulate-reads/package.yaml
@@ -0,0 +1,19 @@
+name: simulate-reads
+version: 0.1
+synopsis: Synthetic read generator with UMIs
+maintainer: Justin Bedo <[email protected]>
+license: MIT
+
+dependencies:
+  - base
+  - streamly-core
+  - random
+  - streaming-commons
+  - bytestring
+  - mtl
+
+ghc-options: [-O2, -Wall, -Wno-name-shadowing, -rtsopts, -prof, -fprof-late]
+
+executables:
+  simulate:
+    main-is: simulate.hs