From c9660d7c36b068978a5c892b9bed44c2215fe252 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 13 Aug 2024 02:06:11 +0200 Subject: [PATCH 1/3] doc: list installation options --- docs/index.md | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/docs/index.md b/docs/index.md index db14513..12ce106 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,17 +1,47 @@ # oarfish: transcript quantification from long-read RNA-seq data -### Basic usage +`oarfish` is a program, written in Rust (https://www.rust-lang.org/), for quantifying transcript-level expression from long-read (i.e. Oxford nanopore cDNA and direct RNA and PacBio) sequencing technologies. `oarfish` requires a sample of sequencing reads aligned to the _transcriptome_ (currently not to the genome). It handles multi-mapping reads through the use of probabilistic allocation via an expectation-maximization (EM) algorithm. -`oarfish` is a program, written in [`rust`](https://www.rust-lang.org/), for quantifying transcript-level expression from long-read (i.e. Oxford nanopore cDNA and direct RNA and PacBio) sequencing technologies. `oarfish` requires a sample of sequencing reads aligned to the *transcriptome* (currntly not to the genome). It handles multi-mapping reads through the use of probabilistic allocation via an expectation-maximization (EM) algorithm. +It optionally employs many filters to help discard alignments that may reduce quantification accuracy. Currently, the set of filters applied in `oarfish` are directly derived from the [`NanoCount`](https://github.com/a-slide/NanoCount)[^Gleeson] tool; both the filters that exist, and the way their values are set (with the exception of the `--three-prime-clip` filter, which is not set by default in `oarfish` but is in `NanoCount`). -It optionally employs many filters to help discard alignments that may reduce quantification accuracy. 
Currently, the set of filters applied in `oarfish` are directly derived from the [`NanoCount`](https://github.com/a-slide/NanoCount)[^Gleeson] tool; both the filters that exist, and the way their values are set (with the exception of the `--three-prime-clip` filter, which is not set by default in `oarfish` but is in `NanoCount`). +Additionally, `oarfish` provides options to make use of coverage profiles derived from the aligned reads to improve quantification accuracy. The use of this coverage model is enabled with the `--model-coverage` flag. You can read more about `oarfish`[^preprint] in the [preprint](https://www.biorxiv.org/content/10.1101/2024.02.28.582591v1). Please cite the preprint if you use `oarfish` in your work or analysis. -Additionally, `oarfish` provides options to make use of coverage profiles derived from the aligned reads to improve quantification accuracy. The use of this coverage model is enabled with the `--model-coverage` flag. You can read more about `oarfish`[^preprint] in the [preprint](https://www.biorxiv.org/content/10.1101/2024.02.28.582591v1). Please cite the preprint if you use `oarfish` in your work or analysis. +## Installation -Also, please note that `oarfish` is scientific software in active development. Therefore, please check the [GitHub Release](https://github.com/COMBINE-lab/oarfish/releases) page to make sure that you are using the latest version -(also, the `dev` branch should compile from source at all times so feel free to use it, but let us know if you run into any issues). +`oarfish` can be installed in a variety of ways. + +### Precompiled binaries + +Binaries are available via [GitHub Releases](https://github.com/COMBINE-lab/oarfish/releases). 
+ +You can quickly install the latest release using the following helper script: + +```sh +curl --proto '=https' --tlsv1.2 -LsSf https://github.com/COMBINE-lab/oarfish/releases/latest/download/oarfish-installer.sh | sh +``` + +### Using `cargo` + +If you have `cargo` installed, you can install `oarfish` directly from the source code: + +```sh +cargo install oarfish +``` + +You can find the crate on [crates.io](https://crates.io/crates/oarfish). + +### Bioconda + +`oarfish` is available via [Bioconda](https://anaconda.org/bioconda/oarfish): + +```sh +conda install -c bioconda oarfish +``` + +## Basic usage The usage can be provided by passing `-h` at the command line. + ``` A fast, accurate and versatile tool for long-read transcript quantification. @@ -57,7 +87,7 @@ The input should be a `bam` format file, with reads aligned using [`minimap2`](h `-d fw` will allow only alignments in the forward orientation and `-d rc` will allow only alignments in the reverse-complement orientation and `-d both` (the default) will allow both. The `-d` filter, if explicitly provided, overrides the orientation filter in any provided "filter group" so e.g. passing `--filter-group no-filters -d fw` will disable other filters, but will still only admit alignments in the forward orientation. -### Choosing `minimap2` alignment options +## Choosing `minimap2` alignment options Since the purpose of `oarfish` is to estimate transcript abundance from a collection of alignments to the target transcriptome, it is important that the alignments are generated in a fashion that is compatible with this goal. Primarily, this means that the aligner should be configured to report as many optimal (and near-optimal) alignments as exist, so that @@ -73,11 +103,11 @@ accuracy of `oarfish`, but it may make alignment take longer and produce a large **Note (2)**: For very high quality PacBio data, it may be most appropriate to use the `-ax map-hifi` flag in place of `-ax pacbio`. 
We are currently evaluating the effect of this option, and also welcome feedback if you have experiences to share on the use of data aligned with these different flags with `oarfish`. -### Inferential Replicates +## Inferential Replicates `oarfish` has the ability to compute [_inferential replicates_](https://academic.oup.com/nar/article/47/18/e105/5542870) of its quantification estimates. This is performed by bootstrap sampling of the original read mappings, and subsequently performing inference under each resampling. These inferential replicates allow assessing the variance of the point estimate of transcript abundance, and can lead to improved differential analysis at the transcript level, if using a differential testing tool that takes advantage of this information. The generation of inferential replicates is controlled by the `--num-bootstraps` argument to `oarfish`. The default value is `0`, meaning that no inferential replicates are generated. If you set this to some value greater than `0`, the the requested number of inferential replicates will be generated. It is recommended, if generating inferential replicates, to run `oarfish` with multiple threads, since replicate generation is highly-parallelized. Finally, if replicates are generated, they are written to a [`Parquet`](https://parquet.apache.org/), starting with the specified output stem and ending with `infreps.pq`. -### Output +## Output The `--output` option passed to `oarfish` corresponds to a path prefix (this prefix can contain the path separator character and if it refers to a directory that does not yeat exist, that directory will be created). Based on this path prefix, say `P`, `oarfish` will create 2 files: @@ -85,7 +115,7 @@ The `--output` option passed to `oarfish` corresponds to a path prefix (this pre * `P.quant` - a tab separated file listing the quantified targets, as well as information about their length and other metadata. 
The `num_reads` column provides the estimate of the number of reads originating from each target. * `P.infreps.pq` - a [`Parquet`](https://parquet.apache.org/) table where each row is a transcript and each column is an inferential replicate, containing the estimated counts for each transcript under each computed inferential replicate. -### References +## References [^Gleeson]: Josie Gleeson, Adrien Leger, Yair D J Prawer, Tracy A Lane, Paul J Harrison, Wilfried Haerty, Michael B Clark, Accurate expression quantification from nanopore direct RNA sequencing with NanoCount, Nucleic Acids Research, Volume 50, Issue 4, 28 February 2022, Page e19, [https://doi.org/10.1093/nar/gkab1129](https://doi.org/10.1093/nar/gkab1129) From 17c09e2f87057c267550b4b04fc7f64603a3febc Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 13 Aug 2024 02:26:51 +0200 Subject: [PATCH 2/3] chore: Improve Github README by replacing with symlink to docs/index.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) mode change 100644 => 120000 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 2c90a21..0000000 --- a/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# oarfish: Accurate, fast, and versatile transcript quantification from long-read RNA-seq data - -Briefly, `oarfish` is a program, written in `rust`, for quantifying transcript-level expression from long-read (i.e. Oxford nanopore cDNA and direct RNA and PacBio) sequencing technologies. -For more details on `oarfish`, please refer to the [`oarfish` documentation](https://combine-lab.github.io/oarfish). 
diff --git a/README.md b/README.md new file mode 120000 index 0000000..e892330 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +docs/index.md \ No newline at end of file From 178074e8ca8d7978c3525b3c29d2f1b9bb524f60 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Mon, 26 Aug 2024 15:00:16 -0400 Subject: [PATCH 3/3] [do_tag] tag oarfish v0.6.0 --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index e730861..7dc59da 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ sprs = "0.11.1" minimap2-sys = { version = "0.1.19" } # rely on minimap2-temp until upstream version is pushed minimap2-temp = { version = "0.1.30" } +# alternative sources for dev #git = "https://github.com/rob-p/minimap2-rs.git", branch = "alignment-score" } #git = "https://github.com/jguhlin/minimap2-rs.git", branch = "alignment-score" } needletail = "0.5.1"