diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index a0d7f87..207a337 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -7,3 +7,9 @@ jobs:
       - uses: nixbuild/nix-quick-install-action@v28
       - uses: actions/checkout@v4
       - run: nix build
+  paper:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: nixbuild/nix-quick-install-action@v28
+      - uses: actions/checkout@v4
+      - run: nix build .#paper
diff --git a/flake.lock b/flake.lock
index 4219465..301478a 100644
--- a/flake.lock
+++ b/flake.lock
@@ -16,6 +16,22 @@
         "type": "github"
       }
     },
+    "inara": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1694971498,
+        "narHash": "sha256-i2b//MXoVz13WNoIg8rBnQbCpMIAMAjSi09w/7B5W9I=",
+        "owner": "openjournals",
+        "repo": "inara",
+        "rev": "0739fb5f750a900ef917b973c18f239aafacb1fd",
+        "type": "github"
+      },
+      "original": {
+        "owner": "openjournals",
+        "repo": "inara",
+        "type": "github"
+      }
+    },
     "nixpkgs": {
       "locked": {
         "lastModified": 1690875999,
@@ -34,6 +50,7 @@
   "root": {
     "inputs": {
       "cloudflareZlib": "cloudflareZlib",
+      "inara": "inara",
       "nixpkgs": "nixpkgs"
     }
   }
diff --git a/flake.nix b/flake.nix
index 0d5bcb5..55e9212 100644
--- a/flake.nix
+++ b/flake.nix
@@ -4,10 +4,16 @@
     url = "github:cloudflare/zlib";
     flake = false;
   };
+  inputs.inara = {
+    url = "github:openjournals/inara";
+    flake = false;
+  };
+
   outputs = {
     self,
     nixpkgs,
     cloudflareZlib,
+    inara,
   }: let
     system = "x86_64-linux";
     pkgs = import nixpkgs {
@@ -19,7 +25,13 @@
     ];
   };
   in {
-    packages.${system}.default = import ./. {inherit pkgs cloudflareZlib;};
-    devShells.${system}.default = self.packages.${system}.default.env;
+    packages.${system} = {
+      default = import ./. {inherit pkgs cloudflareZlib;};
+      paper = pkgs.callPackage ./paper/paper.nix {inherit inara;};
+    };
+    devShells.${system} = {
+      default = self.packages.${system}.default.env;
+      paper = self.packages.${system}.paper.env;
+    };
   };
 }
diff --git a/paper/paper.md b/paper/paper.md
new file mode 100644
index 0000000..e7b80f2
--- /dev/null
+++ b/paper/paper.md
@@ -0,0 +1,60 @@
+---
+title: 'Dedumi: approximate deduplication of unaligned reads using unique molecular identifiers'
+tags:
+  - Haskell
+  - bioinformatics
+  - genomics
+authors:
+  - name: Justin Bedő
+    affiliation: "1, 2"
+affiliations:
+  - name: The Walter and Eliza Hall Institute of Medical Research
+    index: 1
+  - name: The University of Melbourne
+    index: 2
+bibliography: references.bib
+---
+
+# Summary
+
+Dedumi is an application for approximately deduplicating unaligned reads with unique molecular identifiers (UMIs).
+Deduplication is performed with a Cuckoo filter, and hence the probability of incorrectly filtering out a unique read is bounded.
+Because dedumi operates on unaligned reads, duplicates are removed prior to alignment, yielding significant computational savings.
+
+
+# Statement of Need
+
+Short-read sequencing provides cost-effective DNA sequencing and forms a fundamental technique in many biological assays and studies.
+The technique relies on randomly sequencing short fragments of DNA forming a *library*, with much subsequent analysis depending on counts of how frequently certain sequences are observed.
+The formation of a library typically requires amplification of small amounts of DNA through polymerase chain reaction (PCR) prior to sequencing [@Garber2009].
+Unfortunately, PCR can preferentially amplify certain strands of DNA [@Aird2011], which leads to a bias during analysis.
+One method to reduce this bias is to introduce unique molecular identifiers (UMIs) prior to amplification so that individual DNA fragments can be identified after amplification.
+These UMIs can then be used to remove (deduplicate) fragments that have been repeatedly sequenced, thus mitigating the bias introduced by PCR.
+
+
+There are several existing approaches to deduplication based on UMIs, most of which operate post-alignment: reads are first aligned to a reference genome, and duplicate reads mapping to the same position are then collapsed to a consensus sequence.
+This is the approach taken by UMI-tools [@Smith2017], and many variants of the core approach exist [@Liu2019].
+The downside of an alignment-based approach is that there is no computational saving at the alignment stage: all reads, regardless of whether they are duplicates, are aligned to the genome.
+
+In contrast, deduplicating unaligned reads reduces the computational cost of alignment, as duplicate reads are removed before alignment takes place.
+Some existing methods deduplicate unaligned reads based on UMIs [@UMIc]; however, these are resource-intensive: as reported by @UMIc, 1M reads required 2.2 hours to process.
+Such resource requirements prevent use in sequencing projects where many millions of reads need to be deduplicated.
+
+Dedumi takes an alternative approach and approximately deduplicates unaligned reads.
+UMIs are extracted from reads in a stream, with a Cuckoo filter [@Fan2014] used to detect duplicates.
+A similar approach has recently been implemented in fastp [@Chen2023]; however, unlike dedumi, fastp uses a Bloom filter [@Bloom1970], which does not guarantee an upper bound on the error rate.
+The error rate of the Cuckoo filter is bounded, with memory requirements determined by the chosen error rate.
+
+On synthetic 150 bp paired-end data generated with a duplication rate of 10% and a base substitution error rate of 0.087% to match the Illumina HiSeq X Ten [@Stoler2021],
+dedumi processes 1M reads in 6.852s^[all results are the median of 11 different simulated datasets] on an Intel i7-14700F using a single core.
+A total of 903,012 reads were retained (a 9.7% duplication rate), closely matching the expected 10% duplication rate.
+In comparison, fastp processes 1M reads in 4.259s (multiple threads, 10.667s user+sys) and retains 930,862 reads (a 6.9% duplication rate).
+Dedumi is therefore 36% faster on a single core and more closely matches the expected duplication rate.
+
+Dedumi is open source and published under the BSD 3-clause licence.
+
+# Acknowledgements
+
+We thank Anthony Papenfuss (A.T.P.) for supporting this work and providing feedback on the manuscript.
+J.B. was supported by the Stafford Fox Medical Research Foundation, the Movember Foundation, and by funding to A.T.P. from a National Health and Medical Research Council (NHMRC) Investigator Grant (2026643).
+The research benefitted from support provided by the Victorian State Government Operational Infrastructure Support and the Australian Government NHMRC Independent Research Institute Infrastructure Support schemes.
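The streaming deduplication scheme paper.md describes is simple enough to sketch. Below is a minimal illustration of the core loop, not dedumi's actual implementation: `Data.Set` stands in for the Cuckoo filter (membership is exact rather than approximate, so memory is not bounded), and the UMI is assumed to occupy a fixed-length prefix of each read; `umiLength = 8` is illustrative only.

```haskell
-- Minimal sketch of streaming UMI deduplication: keep a read only if its
-- UMI (here assumed to be a fixed-length read prefix) is unseen so far.
-- Data.Set stands in for the Cuckoo filter, so the membership test is
-- exact rather than approximate and memory is not bounded.
module DedupSketch (dedup) where

import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.Set as Set

umiLength :: Int
umiLength = 8 -- illustrative only

dedup :: [ByteString] -> [ByteString]
dedup = go Set.empty
  where
    go _ [] = []
    go seen (r : rs)
      | umi `Set.member` seen = go seen rs -- duplicate: drop
      | otherwise = r : go (Set.insert umi seen) rs -- first occurrence: keep
      where
        umi = B.take umiLength r
```

Swapping the set for a Cuckoo filter makes the membership test approximate; the false-positive rate, and hence the chance of wrongly discarding a unique read, is then bounded by the filter parameters, which is what fixes the memory/error trade-off the paper relies on.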
diff --git a/paper/paper.nix b/paper/paper.nix
new file mode 100644
index 0000000..9d2fe01
--- /dev/null
+++ b/paper/paper.nix
@@ -0,0 +1,50 @@
+{
+  stdenvNoCC,
+  inara,
+  pandoc,
+  texlive,
+  hack-font,
+}:
+stdenvNoCC.mkDerivation {
+  name = "paper";
+  src = inara;
+  buildInputs = [
+    pandoc
+    (texlive.combine {
+      inherit
+        (texlive)
+        scheme-basic
+        latexmk
+        marginnote
+        xcolor
+        preprint
+        etoolbox
+        titlesec
+        pgf
+        hyperxmp
+        ifmtarg
+        luacode
+        luatexbase
+        caption
+        orcidlink
+        tcolorbox
+        environ
+        seqsplit
+        xstring
+        float
+        fontspec
+        fontsetup
+        unicode-math
+        lualatex-math
+        newcomputermodern
+        selnolig
+        ;
+    })
+  ];
+  buildPhase = ''
+    export HOME=$(mktemp -d)
+    make ARTICLE=${./.}/paper.md
+  '';
+  installPhase = "cp -r publishing-artifacts $out";
+  OSFONTDIR = hack-font;
+}
diff --git a/paper/references.bib b/paper/references.bib
new file mode 100644
index 0000000..4bf2654
--- /dev/null
+++ b/paper/references.bib
@@ -0,0 +1,110 @@
+@article{UMIc,
+  title = {UMIc: A Preprocessing Method for UMI Deduplication and Reads Correction},
+  url = {http://dx.doi.org/10.3389/fgene.2021.660366},
+  DOI = {10.3389/fgene.2021.660366},
+  author = {Maria Tsagiopoulou and Maria Christina Maniou and Nikolaos Pechlivanis and Anastasis Togkousidis and Michaela Kotrová and Tobias Hutzenlaub and Ilias Kappas and Anastasia Chatzidimitriou and Fotis Psomopoulos},
+  year = {2021},
+  journal = {Frontiers in Genetics}
+}
+@article{Liu2019,
+  title = {Algorithms for efficiently collapsing reads with Unique Molecular Identifiers},
+  url = {http://dx.doi.org/10.7717/peerj.8275},
+  DOI = {10.7717/peerj.8275},
+  author = {Daniel Liu},
+  year = {2019},
+  journal = {PeerJ}
+}
+@article{W4283398238,
+  title = {TrieDedup: A fast trie-based deduplication algorithm to handle ambiguous bases in high-throughput sequencing},
+  url = {http://dx.doi.org/10.1101/2022.02.20.481170},
+  DOI = {10.1101/2022.02.20.481170},
+  author = {Jian Hu and Shunlong Luo and Ming Tian and Frederick W. Alt and Adam Yongxin Ye},
+  year = {2022},
+  journal = {bioRxiv (Cold Spring Harbor Laboratory)}
+}
+@article{Smith2017,
+  title = {UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy},
+  url = {http://dx.doi.org/10.1101/gr.209601.116},
+  DOI = {10.1101/gr.209601.116},
+  author = {Tom Smith and Andreas Heger and Ian Sudbery},
+  year = {2017},
+  journal = {Genome Research}
+}
+@article{Chen2023,
+  title = {Ultrafast one‐pass FASTQ data preprocessing, quality control, and deduplication using fastp},
+  volume = {2},
+  ISSN = {2770-596X},
+  url = {http://dx.doi.org/10.1002/imt2.107},
+  DOI = {10.1002/imt2.107},
+  number = {2},
+  journal = {iMeta},
+  publisher = {Wiley},
+  author = {Chen, Shifu},
+  year = {2023},
+  month = may
+}
+@article{Stoler2021,
+  title = {Sequencing error profiles of Illumina sequencing instruments},
+  volume = {3},
+  ISSN = {2631-9268},
+  url = {http://dx.doi.org/10.1093/nargab/lqab019},
+  DOI = {10.1093/nargab/lqab019},
+  number = {1},
+  journal = {NAR Genomics and Bioinformatics},
+  publisher = {Oxford University Press (OUP)},
+  author = {Stoler, Nicholas and Nekrutenko, Anton},
+  year = {2021},
+  month = jan
+}
+@article{Aird2011,
+  title = {Analyzing and minimizing PCR amplification bias in Illumina sequencing libraries},
+  volume = {12},
+  ISSN = {1465-6906},
+  url = {http://dx.doi.org/10.1186/gb-2011-12-2-r18},
+  DOI = {10.1186/gb-2011-12-2-r18},
+  number = {2},
+  journal = {Genome Biology},
+  publisher = {Springer Science and Business Media LLC},
+  author = {Aird, Daniel and Ross, Michael G and Chen, Wei-Sheng and Danielsson, Maxwell and Fennell, Timothy and Russ, Carsten and Jaffe, David B and Nusbaum, Chad and Gnirke, Andreas},
+  year = {2011},
+  pages = {R18}
+}
+@article{Garber2009,
+  title = {Closing gaps in the human genome using sequencing by synthesis},
+  volume = {10},
+  ISSN = {1465-6906},
+  url = {http://dx.doi.org/10.1186/gb-2009-10-6-r60},
+  DOI = {10.1186/gb-2009-10-6-r60},
+  number = {6},
+  journal = {Genome Biology},
+  publisher = {Springer Science and Business Media LLC},
+  author = {Garber, Manuel and Zody, Michael C and Arachchi, Harindra M and Berlin, Aaron and Gnerre, Sante and Green, Lisa M and Lennon, Niall and Nusbaum, Chad},
+  year = {2009},
+  pages = {R60}
+}
+@inproceedings{Fan2014,
+  series = {CoNEXT ’14},
+  title = {Cuckoo Filter: Practically Better Than Bloom},
+  url = {http://dx.doi.org/10.1145/2674005.2674994},
+  DOI = {10.1145/2674005.2674994},
+  booktitle = {Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies},
+  publisher = {ACM},
+  author = {Fan, Bin and Andersen, Dave G. and Kaminsky, Michael and Mitzenmacher, Michael D.},
+  year = {2014},
+  month = dec,
+  collection = {CoNEXT ’14}
+}
+@article{Bloom1970,
+  title = {Space/time trade-offs in hash coding with allowable errors},
+  volume = {13},
+  ISSN = {1557-7317},
+  url = {http://dx.doi.org/10.1145/362686.362692},
+  DOI = {10.1145/362686.362692},
+  number = {7},
+  journal = {Communications of the ACM},
+  publisher = {Association for Computing Machinery (ACM)},
+  author = {Bloom, Burton H.},
+  year = {1970},
+  month = jul,
+  pages = {422–426}
+}
\ No newline at end of file
diff --git a/paper/src/simulate-reads/flake.lock b/paper/src/simulate-reads/flake.lock
new file mode 100644
index 0000000..c100dfb
--- /dev/null
+++ b/paper/src/simulate-reads/flake.lock
@@ -0,0 +1,26 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1713312815,
+        "narHash": "sha256-vnvYlxrBWLPHbalKqEUEZPd5JF5/uhGWLm/LMagel9g=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "229626d4d005a5a93bcba8eb5ac5c4d93994a2e0",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/paper/src/simulate-reads/flake.nix b/paper/src/simulate-reads/flake.nix
new file mode 100644
index 0000000..2e84fd0
--- /dev/null
+++ b/paper/src/simulate-reads/flake.nix
@@ -0,0 +1,9 @@
+{
+  inputs.nixpkgs.url = "github:nixos/nixpkgs";
+  outputs = {self, nixpkgs}:
+    let system = "x86_64-linux";
+        pkgs = import nixpkgs {inherit system;};
+    in
+    {packages.${system}.default = pkgs.haskellPackages.callCabal2nix "simulate-reads" ./. {};
+     devShells.${system}.default = self.packages.${system}.default.env;};
+}
diff --git a/paper/src/simulate-reads/package.yaml b/paper/src/simulate-reads/package.yaml
new file mode 100644
index 0000000..17f9d4d
--- /dev/null
+++ b/paper/src/simulate-reads/package.yaml
@@ -0,0 +1,19 @@
+name: simulate-reads
+version: 0.1
+synopsis: Synthetic read generator with UMIs
+maintainer: Justin Bedo
+license: MIT
+
+dependencies:
+  - base
+  - streamly-core
+  - random
+  - streaming-commons
+  - bytestring
+  - mtl
+
+ghc-options: [-O2, -Wall, -Wno-name-shadowing, -rtsopts, -prof, -fprof-late]
+
+executables:
+  simulate:
+    main-is: simulate.hs
diff --git a/paper/src/simulate-reads/simulate.hs b/paper/src/simulate-reads/simulate.hs
new file mode 100644
index 0000000..77fa74a
--- /dev/null
+++ b/paper/src/simulate-reads/simulate.hs
@@ -0,0 +1,148 @@
+{-# LANGUAGE GHC2021 #-}
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Main where
+
+import Control.Monad
+import Control.Monad.Trans
+import Data.ByteString (ByteString)
+import Data.ByteString qualified as B
+import Data.ByteString.Char8 qualified as BC
+import Data.Function
+import Data.Streaming.Zlib
+import Streamly.Data.Fold qualified as F
+import Streamly.Data.Stream qualified as S
+import System.Environment
+import System.IO
+import System.Random hiding (uniform)
+import Prelude hiding (Read, read)
+
+-- simulation parameters
+readLength :: Int
+readLength = 150
+
+baseErrorRate :: Double
+baseErrorRate = 0.00087 -- HiSeq X Ten
+
+nreads :: Int
+nreads = 1_000_000
+
+-- Probability monad transformer
+newtype P m a = P {runP :: StdGen -> m (a, StdGen)}
+
+evalP :: (Monad m) => P m a -> StdGen -> m a
+evalP (P f) g = fst <$> f g
+
+instance (Monad m) => Monad (P m) where
+  P f >>= g = P \q -> do
+    (x, q') <- f q
+    runP (g x) q'
+
+instance (Monad m) => Functor (P m) where
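+  -- Functor and Applicative are recovered from the Monad instance (liftM/ap).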
+  fmap = liftM
+
+instance (Monad m) => Applicative (P m) where
+  pure x = P \g -> pure (x, g)
+  (<*>) = ap
+
+instance (MonadIO m) => MonadIO (P m) where
+  liftIO f = P \g -> (,g) <$> liftIO f
+
+uniform :: Applicative m => P m Double
+uniform = P (pure . random)
+
+poisson :: Monad m => Double -> P m Int
+poisson lambda = go 0 0
+  where
+    go i s
+      | s >= 1 = pure (i - 1)
+      | otherwise = do
+          u <- uniform
+          go (i + 1) (s - log u / lambda)
+
+
+-- Reads/nucleotides and sampling them (with errors)
+data Nuc = A | C | G | T
+  deriving (Eq, Ord, Enum, Show, Bounded)
+
+nuc :: (Monad m) => P m Nuc
+nuc = toEnum . round . (* fromIntegral (fromEnum (maxBound :: Nuc))) <$> uniform
+
+type Read = [Nuc]
+
+data Pair a = Pair !a !a
+  deriving (Eq, Ord, Show, Functor)
+
+read :: (Monad m) => P m Read
+read = replicateM readLength nuc
+
+readPair :: (Monad m) => P m (Pair Read)
+readPair = Pair <$> read <*> read
+
+dup :: (Monad m) => Double -> Pair Read -> S.Stream (P m) (Pair Read)
+dup p r = S.concatEffect do
+  n <- poisson $ 1 / (1 - p) - 1
+  pure $ S.replicate (1 + n) r
+
+baseError :: (Monad m) => Double -> Read -> P m Read
+baseError p = mapM error'
+  where
+    error' b = do
+      q <- uniform
+      if q < p
+        then nuc
+        else pure b
+
+-- FastQ writer
+data Entry = Entry ByteString (Pair Read) deriving (Eq, Ord, Show)
+
+unparse :: (MonadIO m) => FilePath -> FilePath -> S.Stream m Entry -> m ()
+unparse l r str = do
+  lh <- liftIO $ openFile l WriteMode
+  rh <- liftIO $ openFile r WriteMode
+  ld <- liftIO $ initDeflate 0 gzipWindow
+  rd <- liftIO $ initDeflate 0 gzipWindow
+  fmap unparse' str & S.fold (F.drainMapM $ writeFiles lh rh ld rd)
+  liftIO $ do
+    flush rd rh
+    flush ld lh
+    hClose rh
+    hClose lh
+  where
+    gzipWindow = WindowBits 31
+    flush d h = finishDeflate d & writePopper h
+
+    writeFiles l r ld rd (Pair a b) = do
+      putCompressed ld l a
+      putCompressed rd r b
+
+    putCompressed d h chunk = liftIO $ do
+      popper <- feedDeflate d chunk
+      writePopper h popper
+
+    writePopper h p =
+      p >>= \case
+        PRNext str -> do
+          B.hPut h str
+          writePopper h p
+        PRDone -> pure ()
+        PRError _ -> error "unparse: compression error"
+
+    unparse' :: Entry -> Pair ByteString
+    unparse' (Entry hdr (Pair l r)) = Pair (BC.unlines ["@" <> hdr, BC.pack $ concatMap show l, "+", qual]) (BC.unlines ["@" <> hdr, BC.pack $ concatMap show r, "+", qual])
+
+    qual = BC.pack $ replicate readLength '~'
+
+main :: IO ()
+main = do
+  [out1, out2] <- getArgs
+  q <- getStdGen
+  S.repeatM readPair -- Generate random reads
+    & S.concatMap (dup 0.1) -- Duplicate reads
+    & S.mapM (\(Pair l r) -> Pair <$> baseError baseErrorRate l <*> baseError baseErrorRate r) -- Introduce base errors
+    & S.take nreads
+    & S.zipWith (\i rp -> Entry (BC.pack $ show i) rp) (S.fromList [0 :: Int ..])
+    & unparse out1 out2
+    & flip evalP q
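A note on the duplication model in simulate.hs: `dup` emits each read pair 1 + n times with n drawn from Poisson(1/(1-p) - 1), so the expected copy count is 1/(1-p) and a fraction p of the emitted reads are duplicates, matching the 10% target used in the benchmark. The following is a standalone sanity check of that parameterisation, a sketch that mirrors the inversion sampler in simulate.hs using only System.Random; the printed mean is approximate.

```haskell
import System.Random

-- Knuth-style inversion sampler, mirroring `poisson` in simulate.hs:
-- count exponential(lambda) arrivals until their running sum exceeds 1.
poisson :: Double -> StdGen -> (Int, StdGen)
poisson lambda = go 0 0
  where
    go i s g
      | s >= 1 = (i - 1, g)
      | otherwise =
          let (u, g') = random g :: (Double, StdGen)
           in go (i + 1) (s - log u / lambda) g'

main :: IO ()
main = do
  let p = 0.1 -- target duplication rate
      lambda = 1 / (1 - p) - 1
      step (acc, g) _ = let (n, g') = poisson lambda g in (acc + 1 + n, g')
      (copies, _) = foldl step (0 :: Int, mkStdGen 42) [1 .. 100000 :: Int]
  -- mean copies per fragment should approach 1/(1-p), i.e. about 1.111
  print (fromIntegral copies / 100000 :: Double)
```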