From b1e4f73947cb691089bf9fefde110c90e3ff291c Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sat, 2 Mar 2024 21:48:13 -0500 Subject: [PATCH 1/2] Minor iter fix, better meta_info, simplify * Fix off-by-one error in len() method of InMemoryAlignmentStore this was previously returning 1 too many elements. Practically, this is unlikely to result in any difference, but the implementation now should return the proper lenght. Also, `ExactSizeIterator` has been implemented in the idiomatic way (providing and exact `size_hint()` in the `Iterator` implementation and just declaring the iterator as an `ExactSizeIterator`). * Output more information in the `meta_info.json` file produced by `oarfish`. * Simplify the implementation of the SampleWithReplacementIterator; removing the old iterator and replacing it with a new, simpler iterator with direct access to the `InMemoryAlignmentStore`. --- src/bootstrap.rs | 46 ----------------------------- src/em.rs | 8 +----- src/main.rs | 49 ++++++++++++++++++++++--------- src/util/oarfish_types.rs | 59 ++++++++++++++++++++++++++++++++++---- src/util/write_function.rs | 21 ++------------ 5 files changed, 92 insertions(+), 91 deletions(-) diff --git a/src/bootstrap.rs b/src/bootstrap.rs index 8348f5a..6384d7f 100644 --- a/src/bootstrap.rs +++ b/src/bootstrap.rs @@ -1,52 +1,6 @@ use rand::distributions::{Distribution, Uniform}; use rand::Rng; -/// Wraps an iterator `I` to produce a new iterator that samples the original -/// with replacement. -pub struct SampleWithReplaceIter<'b, I: ExactSizeIterator> { - pub inds: &'b [usize], - pub iter: I, - pub idx: usize, - pub e_ptr: usize, - pub prev: Option, -} - -impl<'b, I: ExactSizeIterator> Iterator for SampleWithReplaceIter<'b, I> -where - I::Item: Copy, -{ - type Item = I::Item; - - #[inline] - fn next(&mut self) -> Option { - if let Some(next_ind) = self.inds.get(self.idx) { - // always makes sure we have a previous - // element. This will get the first - // element but wont yet increment - // the pointer (e_ptr) - if self.prev.is_none() { - self.prev = self.iter.next(); - } - // next element is not a duplicate - if self.e_ptr != *next_ind { - // otherwise, call next until we have the - // correct element - loop { - self.prev = self.iter.next(); - self.e_ptr += 1; - if self.e_ptr == *next_ind { - break; - } - } - } - self.idx += 1; - self.prev - } else { - None - } - } -} - pub fn get_sample_inds(n: usize, rng: &mut R) -> Vec { let dist = Uniform::new(0, n); let mut inds: Vec = dist.sample_iter(rng).take(n).collect(); diff --git a/src/em.rs b/src/em.rs index ddf2d4f..f4b83ff 100644 --- a/src/em.rs +++ b/src/em.rs @@ -237,13 +237,7 @@ pub fn do_bootstrap(em_info: &EMInfo) -> Vec { // function that produces an iterator over the // scored alignments on demand - let make_iter = || bootstrap::SampleWithReplaceIter { - inds: &inds, - iter: em_info.eq_map.iter(), - idx: 0, - e_ptr: 0, - prev: None, - }; + let make_iter = || em_info.eq_map.random_sampling_iter(&inds); do_em(em_info, make_iter, false) } diff --git a/src/main.rs b/src/main.rs index a63c1c4..03b0dae 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,8 @@ use std::{ }; use num_format::{Locale, ToFormattedString}; +use serde::Serialize; +use serde_json::json; use tracing::info; use tracing_subscriber::{filter::LevelFilter, fmt, prelude::*, EnvFilter}; @@ -30,14 +32,14 @@ use crate::util::write_function::{write_infrep_file, write_output}; /// These represent different "meta-options", specific settings /// for all of the different filters that should be applied in /// different cases. -#[derive(Clone, Debug, clap::ValueEnum)] +#[derive(Clone, Debug, clap::ValueEnum, Serialize)] enum FilterGroup { NoFilters, NanocountFilters, } /// accurate transcript quantification from long-read RNA-seq data -#[derive(Parser, Debug)] +#[derive(Parser, Debug, Serialize)] #[clap(author, version, about, long_about = None)] struct Args { /// be quiet (i.e. don't output log messages that aren't at least warnings) @@ -168,6 +170,31 @@ fn get_filter_opts(args: &Args) -> AlignmentFilters { } } +fn get_json_info(args: &Args, emi: &EMInfo) -> serde_json::Value { + let prob = if args.model_coverage { + "binomial" + } else { + "no_coverage" + }; + + json!({ + "prob_model" : prob, + "num_bins" : args.bins, + "filter_options" : &emi.eq_map.filter_opts, + "discard_table" : &emi.eq_map.discard_table, + "alignments": &args.alignments, + "output": &args.output, + "verbose": &args.verbose, + "quiet": &args.quiet, + "em_max_iter": &args.max_em_iter, + "em_convergence_thresh": &args.convergence_thresh, + "threads": &args.threads, + "filter_group": &args.filter_group, + "short_quant": &args.short_quant, + "num_bootstraps": &args.num_bootstraps + }) +} + fn main() -> anyhow::Result<()> { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) @@ -254,8 +281,8 @@ fn main() -> anyhow::Result<()> { // if we are seeding the quantification estimates with short read // abundances, then read those in here. - let init_abundances = args.short_quant.map(|sr_path| { - read_short_quant_vec(&sr_path, &txps_name).unwrap_or_else(|e| panic!("{}", e)) + let init_abundances = args.short_quant.as_ref().map(|sr_path| { + read_short_quant_vec(sr_path, &txps_name).unwrap_or_else(|e| panic!("{}", e)) }); // wrap up all of the relevant information we need for estimation @@ -274,16 +301,12 @@ fn main() -> anyhow::Result<()> { em::em(&emi, args.threads) }; + // prepare the JSON object we'll write + // to meta_info.json + let json_info = get_json_info(&args, &emi); + // write the output - write_output( - &args.output, - &args.model_coverage, - args.bins, - args.num_bootstraps, - &emi, - &header, - &counts, - )?; + write_output(&args.output, json_info, &header, &counts)?; // if the user requested bootstrap replicates, // compute and write those out now. diff --git a/src/util/oarfish_types.rs b/src/util/oarfish_types.rs index 01a7314..4c79376 100644 --- a/src/util/oarfish_types.rs +++ b/src/util/oarfish_types.rs @@ -240,10 +240,44 @@ pub struct InMemoryAlignmentStore<'h> { impl<'h> InMemoryAlignmentStore<'h> { #[inline] pub fn len(&self) -> usize { - self.boundaries.len() + self.boundaries.len().saturating_sub(1) } } +pub struct InMemoryAlignmentStoreSamplingWithReplacementIter<'a, 'h, 'b> { + pub store: &'a InMemoryAlignmentStore<'h>, + pub rand_inds: std::slice::Iter<'b, usize>, +} + +impl<'a, 'b, 'h> Iterator for InMemoryAlignmentStoreSamplingWithReplacementIter<'a, 'b, 'h> { + type Item = (&'a [AlnInfo], &'a [f32], &'a [f64]); + + #[inline] + fn next(&mut self) -> Option { + if let Some(next_ind) = self.rand_inds.next() { + let start = self.store.boundaries[*next_ind]; + let end = self.store.boundaries[*next_ind + 1]; + Some(( + &self.store.alignments[start..end], + &self.store.as_probabilities[start..end], + &self.store.coverage_probabilities[start..end], + )) + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (self.rand_inds.len(), Some(self.rand_inds.len())) + } +} + +impl<'a, 'b, 'h> ExactSizeIterator + for InMemoryAlignmentStoreSamplingWithReplacementIter<'a, 'b, 'h> +{ +} + pub struct InMemoryAlignmentStoreIter<'a, 'h> { pub store: &'a InMemoryAlignmentStore<'h>, pub idx: usize, @@ -267,15 +301,15 @@ impl<'a, 'h> Iterator for InMemoryAlignmentStoreIter<'a, 'h> { )) } } -} -impl<'a, 'h> ExactSizeIterator for InMemoryAlignmentStoreIter<'a, 'h> { #[inline] - fn len(&self) -> usize { - self.store.boundaries.len() + fn size_hint(&self) -> (usize, Option) { + (self.store.len(), Some(self.store.len())) } } +impl<'a, 'h> ExactSizeIterator for InMemoryAlignmentStoreIter<'a, 'h> {} + impl<'h> InMemoryAlignmentStore<'h> { pub fn new(fo: AlignmentFilters, header: &'h Header) -> Self { InMemoryAlignmentStore { @@ -296,6 +330,19 @@ impl<'h> InMemoryAlignmentStore<'h> { } } + pub fn random_sampling_iter<'a, 'b>( + &'a self, + inds: &'b [usize], + ) -> InMemoryAlignmentStoreSamplingWithReplacementIter + where + 'b: 'a, + { + InMemoryAlignmentStoreSamplingWithReplacementIter { + store: self, + rand_inds: inds.iter(), + } + } + pub fn add_group( &mut self, txps: &mut [TranscriptInfo], @@ -326,7 +373,7 @@ impl<'h> InMemoryAlignmentStore<'h> { pub fn num_aligned_reads(&self) -> usize { if !self.boundaries.is_empty() { - self.boundaries.len() - 1 + self.len() } else { 0 } diff --git a/src/util/write_function.rs b/src/util/write_function.rs index 547d13c..fe77b0b 100644 --- a/src/util/write_function.rs +++ b/src/util/write_function.rs @@ -7,7 +7,7 @@ use arrow2::{ datatypes::{Field, Schema}, }; use path_tools::WithAdditionalExtension; -use serde_json::json; + use std::path::{Path, PathBuf}; use std::{ fs, @@ -20,10 +20,7 @@ use std::{ //this part is taken from dev branch pub fn write_output( output: &PathBuf, - prob_flag: &bool, - bins: u32, - num_bootstraps: u32, - em_info: &EMInfo, + info: serde_json::Value, header: &noodles_sam::header::Header, counts: &[f64], ) -> io::Result<()> { @@ -37,21 +34,7 @@ pub fn write_output( } } - let prob = if *prob_flag { - "binomial" - } else { - "no_coverage" - }; - { - let info = json!({ - "prob_model" : prob, - "num_bins" : bins, - "num_bootstraps" : num_bootstraps, - "filter_options" : em_info.eq_map.filter_opts, - "discard_table" : em_info.eq_map.discard_table - }); - let info_path = output.with_additional_extension(".meta_info.json"); let write = OpenOptions::new() .write(true) From 96883b26e646d81c1caa47b43e4169466e3fcede Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sat, 2 Mar 2024 22:10:12 -0500 Subject: [PATCH 2/2] fix: bump deps --- Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f4cc2d2..0e2cfaa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "oarfish" -version = "0.3.0" +version = "0.3.1" edition = "2021" authors = [ "Zahra Zare Jousheghani ", @@ -34,9 +34,9 @@ anyhow = "1.0.80" bio-types = "1.0.1" clap = { version = "4.5.1", features = ["derive"] } coitrees = "0.4.0" -noodles-bam = "0.55.0" +noodles-bam = "0.56.0" noodles-gtf = "0.23.0" -noodles-sam = "0.52.0" +noodles-sam = "0.53.0" num-format = "0.4.4" tabled = "0.15.0" tracing = "0.1.40"