stjude-rust-labs · a-frantz · Dec 3, 2023 · Dec 4, 2023 · Dec 4, 2023 · Dec 5, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -19,6 +19,7 @@ git-testament = "0.2.1"
 indexmap = "1.9.1"
 indicatif = "0.16.2"
 itertools = "0.10.5"
+lazy_static = "1.4.0"
 noodles = { version = "0.34.0", features = [
     "async",
     "bam",

diff --git a/src/convert/bam.rs b/src/convert/bam.rs
@@ -45,7 +45,7 @@ pub async fn to_sam_async(
         .await
         .with_context(|| "writing SAM header")?;
 
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut record = Record::default();
 
     // (4) Write each record in the BAM file to the SAM file.
@@ -131,7 +131,7 @@ pub async fn to_cram_async(
         .await
         .with_context(|| "writing CRAM file header")?;
 
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut record = Record::default();
 
     // (6) Write each record in the BAM file to the CRAM file.

diff --git a/src/convert/command.rs b/src/convert/command.rs
@@ -31,8 +31,13 @@ pub struct ConvertArgs {
     to: PathBuf,
 
     /// Number of records to process before exiting the conversion.
-    #[arg(short = 'n', long, value_name = "USIZE")]
-    num_records: Option<usize>,
+    #[arg(
+        short,
+        long,
+        default_value_t,
+        value_name = "'all' or a positive, non-zero integer"
+    )]
+    num_records: NumberOfRecords,
 
     /// If available, the FASTA reference file used to generate the file.
     #[arg(short, long)]
@@ -91,7 +96,7 @@ pub fn convert(args: ConvertArgs) -> anyhow::Result<()> {
     // Number of Records //
     //===================//
 
-    let max_records = NumberOfRecords::from(args.num_records);
+    let max_records = args.num_records;
 
     //==========================//
     // Bioinformatics File Pair //

diff --git a/src/convert/cram.rs b/src/convert/cram.rs
@@ -52,7 +52,7 @@ pub async fn to_sam_async(
         .with_context(|| "writing SAM header")?;
 
     // (5) Write each record in the CRAM file to the SAM file.
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut records = reader.records(&repository, &header.parsed);
 
     while let Some(record) = records
@@ -125,7 +125,7 @@ pub async fn to_bam_async(
         .with_context(|| "writing BAM reference sequences")?;
 
     // (6) Write each record in the CRAM file to the BAM file.
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut records = reader.records(&repository, &header.parsed);
 
     while let Some(record) = records

diff --git a/src/convert/sam.rs b/src/convert/sam.rs
@@ -58,7 +58,7 @@ pub async fn to_bam_async(
         .await
         .with_context(|| "writing BAM reference sequences")?;
 
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut record = Record::default();
 
     // (5) Write each record in the BAM file to the SAM file.
@@ -151,7 +151,7 @@ pub async fn to_cram_async(
         .await
         .with_context(|| "writing CRAM file header")?;
 
-    let mut counter = RecordCounter::new();
+    let mut counter = RecordCounter::default();
     let mut record = Record::default();
 
     // (6) Write each record in the SAM file to the CRAM file.

diff --git a/src/derive.rs b/src/derive.rs
@@ -1,4 +1,9 @@
 //! Functionality related to the `ngs derive` subcommand.
 
 pub mod command;
+pub mod encoding;
+pub mod endedness;
 pub mod instrument;
+pub mod junction_annotation;
+pub mod readlen;
+pub mod strandedness;
diff --git a/src/derive/command.rs b/src/derive/command.rs
@@ -1,6 +1,11 @@
 //! Functionality related to the `ngs derive` subcommand itself.
 
+pub mod encoding;
+pub mod endedness;
 pub mod instrument;
+pub mod junction_annotation;
+pub mod readlen;
+pub mod strandedness;
 
 use clap::Args;
 use clap::Subcommand;
@@ -20,6 +25,23 @@ pub struct DeriveArgs {
 /// All possible subcommands for `ngs derive`.
 #[derive(Subcommand)]
 pub enum DeriveSubcommand {
+    /// Derives the quality score encoding used to produce the file.
+    Encoding(self::encoding::DeriveEncodingArgs),
+
+    /// Derives the endedness of the file.
+    Endedness(self::endedness::DeriveEndednessArgs),
+
     /// Derives the instrument used to produce the file.
     Instrument(self::instrument::DeriveInstrumentArgs),
+
+    /// Derives the read length of the file.
+    Readlen(self::readlen::DeriveReadlenArgs),
+
+    /// Derives the strandedness of the RNA-Seq file.
+    Strandedness(self::strandedness::DeriveStrandednessArgs),
+
+    /// Annotates junctions in the file.
+    /// Note that, technically, this command doesn't derive anything—it lives here for
+    /// convenience and will be moved in the future to a better home.
+    JunctionAnnotation(self::junction_annotation::JunctionAnnotationArgs),
 }
diff --git a/src/derive/command/encoding.rs b/src/derive/command/encoding.rs
@@ -0,0 +1,89 @@
+//! Functionality relating to the `ngs derive encoding` subcommand itself.
+
+use anyhow::{Context, Ok};
+use clap::Args;
+use noodles::bam;
+use num_format::{Locale, ToFormattedString};
+use std::collections::HashSet;
+use std::io::BufReader;
+use std::path::PathBuf;
+use tracing::info;
+
+use crate::derive::encoding::compute;
+use crate::utils::args::NumberOfRecords;
+use crate::utils::display::RecordCounter;
+
+/// Clap arguments for the `ngs derive encoding` subcommand.
+#[derive(Args)]
+pub struct DeriveEncodingArgs {
+    /// Source BAM.
+    #[arg(value_name = "BAM")]
+    src: PathBuf,
+
+    /// Examine the first `n` records in the file.
+    #[arg(
+        short,
+        long,
+        default_value_t,
+        value_name = "'all' or a positive, non-zero integer"
+    )]
+    num_records: NumberOfRecords,
+}
+
+/// Main function for the `ngs derive encoding` subcommand.
+///
+/// Reads the BAM at `args.src`, gathers the distinct quality scores seen in the
+/// examined records, hands that set to `compute::predict`, and prints the
+/// prediction to stdout as pretty-printed JSON.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, if the header, reference
+/// sequences, or any record cannot be read, if `compute::predict` fails, or if
+/// the result cannot be serialized to JSON.
+pub fn derive(args: DeriveEncodingArgs) -> anyhow::Result<()> {
+    info!("Starting derive encoding subcommand.");
+
+    let file = std::fs::File::open(args.src);
+    let reader = file
+        .map(BufReader::new)
+        .with_context(|| "opening BAM file")?;
+    let mut reader = bam::Reader::new(reader);
+    // The parsed header and the reference sequences are not used below;
+    // presumably they must still be read before the records. TODO confirm.
+    let _header: String = reader.read_header()?.parse()?;
+    reader.read_reference_sequences()?;
+
+    let mut score_set: HashSet<u8> = HashSet::new();
+
+    // (1) Collect quality scores from reads within the
+    // file. Support for sampling only a portion of the reads is provided.
+    let mut counter = RecordCounter::default();
+    for result in reader.lazy_records() {
+        let record = result?;
+
+        // The set deduplicates, so every score of the record is added in one pass.
+        score_set.extend(record.quality_scores().as_ref().iter().copied());
+
+        counter.inc();
+        if counter.time_to_break(&args.num_records) {
+            break;
+        }
+    }
+
+    info!(
+        "Processed {} records.",
+        counter.get().to_formatted_string(&Locale::en)
+    );
+
+    // (2) Derive encoding from the observed quality scores
+    let result = compute::predict(score_set)?;
+
+    // (3) Print the output to stdout as JSON (more support for different output
+    // types may be added in the future, but for now, only JSON).
+    let output = serde_json::to_string_pretty(&result)
+        .with_context(|| "serializing result to JSON")?;
+    println!("{}", output);
+
+    Ok(())
+}
diff --git a/src/derive/command/endedness.rs b/src/derive/command/endedness.rs
@@ -0,0 +1,166 @@
+//! Functionality relating to the `ngs derive endedness` subcommand itself.
+
+use anyhow::Context;
+use clap::Args;
+use num_format::{Locale, ToFormattedString};
+use std::collections::{HashMap, HashSet};
+use std::path::PathBuf;
+use std::sync::Arc;
+use tracing::{info, trace};
+
+use crate::derive::endedness::compute;
+use crate::derive::endedness::compute::OrderingFlagsCounts;
+use crate::utils::args::arg_in_range as deviance_in_range;
+use crate::utils::args::NumberOfRecords;
+use crate::utils::display::RecordCounter;
+use crate::utils::formats::bam::ParsedBAMFile;
+use crate::utils::formats::utils::IndexCheck;
+use crate::utils::read_groups::{get_read_group, validate_read_group_info, ReadGroupPtr};
+
+/// Clap arguments for the `ngs derive endedness` subcommand.
+#[derive(Args)]
+pub struct DeriveEndednessArgs {
+    /// Source BAM.
+    #[arg(value_name = "BAM")]
+    src: PathBuf,
+
+    /// Examine the first `n` records in the file.
+    #[arg(
+        short,
+        long,
+        default_value_t,
+        value_name = "'all' or a positive, non-zero integer"
+    )]
+    num_records: NumberOfRecords,
+
+    /// Distance from 0.5 split between number of f+l- reads and f-l+ reads
+    /// allowed to be called 'Paired-End'. The default value of `0.0` is only appropriate
+    /// if the whole file is being processed.
+    #[arg(long, value_name = "F64", default_value = "0.0")]
+    paired_deviance: f64,
+
+    /// Calculate and output Reads-Per-Template. This will produce a more
+    /// sophisticated estimate for endedness, but uses substantially more memory.
+    #[arg(long, default_value = "false")]
+    calculate_reads_per_template: bool,
+
+    /// Round Reads-Per-Template (RPT) to the nearest INT before comparing to expected values.
+    /// Appropriate if using `-n` to examine only part of the file. Unrounded value is reported in output.
+    #[arg(long, default_value = "false")]
+    round_reads_per_template: bool,
+}
+
+/// Main function for the `ngs derive endedness` subcommand.
+///
+/// Tallies, per read group, how many examined records are unsegmented, first
+/// segments, last segments, both, or neither; optionally records which read
+/// groups each QNAME was seen in; then passes those tallies to
+/// `compute::predict` and prints the result to stdout as pretty-printed JSON.
+///
+/// # Errors
+///
+/// Returns an error if `args.paired_deviance` is outside `0.0..=0.5`, if
+/// `open_and_parse` fails on `args.src`, if a record cannot be read, or if the
+/// result cannot be serialized to JSON.
+pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> {
+    // (0) Parse arguments needed for subcommand.
+    let paired_deviance = deviance_in_range(args.paired_deviance, 0.0..=0.5)
+        .with_context(|| "Paired deviance is not within acceptable range")?;
+
+    info!("Starting derive endedness subcommand.");
+
+    let mut found_rgs = HashSet::new();
+
+    let mut ordering_flags: HashMap<ReadGroupPtr, OrderingFlagsCounts> = HashMap::new();
+
+    // Only populated if `args.calculate_reads_per_template` is true.
+    let mut read_names: Option<HashMap<String, Vec<ReadGroupPtr>>> = None;
+
+    let ParsedBAMFile {
+        mut reader, header, ..
+    } = crate::utils::formats::bam::open_and_parse(args.src, IndexCheck::None)?;
+
+    // (1) Collect ordering flags (and QNAMEs) from reads within the
+    // file. Support for sampling only a portion of the reads is provided.
+    let mut counter = RecordCounter::default();
+    for result in reader.records(&header.parsed) {
+        let record = result?;
+
+        // Only count primary alignments and unmapped reads.
+        if (record.flags().is_secondary() || record.flags().is_supplementary())
+            && !record.flags().is_unmapped()
+        {
+            continue;
+        }
+
+        let read_group = get_read_group(&record, Some(&mut found_rgs));
+
+        if args.calculate_reads_per_template {
+            let read_name_map = read_names.get_or_insert_with(HashMap::new);
+            match record.read_name() {
+                Some(rn) => {
+                    // One lookup via the entry API: append to the existing
+                    // list for this QNAME, or start a new one.
+                    read_name_map
+                        .entry(rn.to_string())
+                        .or_default()
+                        .push(Arc::clone(&read_group));
+                }
+                None => {
+                    // The record is skipped entirely: it is neither tallied
+                    // below nor counted towards `num_records`.
+                    trace!("Could not parse a QNAME from a read in the file.");
+                    trace!("Skipping this read and proceeding.");
+                    continue;
+                }
+            }
+        }
+
+        // Look up this read group's tally once, then bump the matching field.
+        let counts = ordering_flags.entry(read_group).or_default();
+        match (
+            record.flags().is_segmented(),
+            record.flags().is_first_segment(),
+            record.flags().is_last_segment(),
+        ) {
+            (false, _, _) => counts.unsegmented += 1,
+            (true, true, false) => counts.first += 1,
+            (true, false, true) => counts.last += 1,
+            (true, true, true) => counts.both += 1,
+            (true, false, false) => counts.neither += 1,
+        }
+
+        counter.inc();
+        if counter.time_to_break(&args.num_records) {
+            break;
+        }
+    }
+
+    info!(
+        "Processed {} records.",
+        counter.get().to_formatted_string(&Locale::en)
+    );
+
+    // (2) Validate the read group information. Read groups reported as present
+    // in the header but absent from the records get a fresh tally.
+    let rgs_in_header_not_records = validate_read_group_info(&found_rgs, &header.parsed);
+    for rg_id in rgs_in_header_not_records {
+        ordering_flags.insert(Arc::new(rg_id), OrderingFlagsCounts::new());
+    }
+
+    // (3) Derive the endedness based on the ordering flags gathered.
+    let result = compute::predict(
+        ordering_flags,
+        read_names,
+        paired_deviance,
+        args.round_reads_per_template,
+    );
+
+    // (4) Print the output to stdout as JSON (more support for different output
+    // types may be added in the future, but for now, only JSON).
+    let output = serde_json::to_string_pretty(&result)
+        .with_context(|| "serializing result to JSON")?;
+    println!("{}", output);
+
+    anyhow::Ok(())
+}