From c1a590595b6d69a85cf28e5bb9b83dbf6c40fb5b Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Fri, 3 Jan 2025 22:28:37 -0500 Subject: [PATCH] trim-adapters --- Cargo.toml | 1 + src/bin/fasten_kmer.rs | 64 +++++++++++- src/bin/fasten_normalize.rs | 2 - src/bin/fasten_trim.rs | 190 ++++++++++++++++++++++++++++++++---- src/lib.rs | 27 +++++ 5 files changed, 260 insertions(+), 24 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 47f911f2..088ed8c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -104,6 +104,7 @@ path = "src/bin/fasten_normalize.rs" [dependencies] regex = "1.10" +fancy-regex = "0.13" getopts = "0.2.21" statistical = "1.0" multiqueue = "0.3.2" diff --git a/src/bin/fasten_kmer.rs b/src/bin/fasten_kmer.rs index d5c0a470..bb512f5d 100644 --- a/src/bin/fasten_kmer.rs +++ b/src/bin/fasten_kmer.rs @@ -48,16 +48,22 @@ extern crate fasten; extern crate statistical; extern crate getopts; +extern crate rand; +extern crate fancy_regex; use std::io::BufReader; use std::io::BufRead; use std::io::stdin; use std::io::Stdin; +use rand::Rng; +//use rand::seq::SliceRandom; use fasten::fasten_base_options; use fasten::fasten_base_options_matches; use fasten::logmsg; +use fancy_regex::Regex; + use std::collections::HashMap; /// Glues together paired end reads internally and is a @@ -149,6 +155,11 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo // keep track of which sequences start with which kmers let mut kmer_to_seqs :HashMap> = HashMap::new(); + + // Some randomness + let mut rng = rand::thread_rng(); + // Regular expression to find uncomplex kmers + let low_complexity = Regex::new(r"N|(.)\1{2,}|(.)(.)(\2\3){1,}").unwrap(); // read the file let my_buffer=BufReader::new(stdin); @@ -170,6 +181,7 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo or_insert(0); *kmer_count += value; } + let kmer_keys: Vec<&String> = entry_kmers.keys().collect(); // If this is paired end and if we're saving the second pair's // read, then reserve a declaired variable here for the string. @@ -201,7 +213,24 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo // Remember the read that initiated this if remember_reads { - let init_kmer = String::from(&seq[0..kmer_length]); + // Get the kmer substring + // let init_kmer = String::from(&seq[0..kmer_length]); + // Get the minimizer as the initial kmer + //let init_kmer = calculate_minimizer(&seq, kmer_length); + + //let init_kmer_pos = rand::thread_rng().gen_range(0 .. seq.len() - kmer_length + 1); + //let init_kmer = seq[init_kmer_pos .. init_kmer_pos+kmer_length].to_string(); + let mut random_index = rng.gen_range(0..kmer_keys.len()); + let mut init_kmer = kmer_keys[random_index].to_string(); + let mut kmer_tries :u16 = 0; + while kmer_tries < 1000 && low_complexity.is_match(&init_kmer).unwrap() { + random_index = rng.gen_range(0..kmer_keys.len()); + init_kmer = kmer_keys[random_index].to_string(); + kmer_tries += 1; + //fasten::logmsg(format!("skipping kmer {}, try {}",&init_kmer, kmer_tries)); + } + + // Get the vector of sequences for this kmer, or else initialize an empty vector let init_kmer_vec = kmer_to_seqs.entry(init_kmer).or_insert(vec![]); // get the formatted entry @@ -276,6 +305,39 @@ fn kmers_in_str (seq:&str, kmer_length:usize, should_revcomp:bool) -> HashMap String { + // Ensure the sequence length is greater than or equal to k + if sequence.len() < k { + panic!("Sequence length is less than k"); + } + + // Initialize variables to keep track of the minimum k-mer and its hash + let mut min_kmer = &sequence[0..k]; + let mut min_hash = hash_kmer(min_kmer); + + // Iterate over the sequence to find the minimum k-mer and its hash + for i in 1..=(sequence.len() - k) { + let current_kmer = &sequence[i..(i + k)]; + let current_hash = hash_kmer(current_kmer); + + // Update the minimum k-mer and its hash if a smaller hash is found + if current_hash < min_hash { + min_kmer = current_kmer; + min_hash = current_hash; + } + } + + // Return the minimizer k-mer + min_kmer.to_string() +} + +fn hash_kmer(kmer: &str) -> u64 { + // Simple hash function that converts each nucleotide to its ASCII value and sums them up + kmer.bytes().map(|b| b as u64).sum() +} +*/ + /// reverse-complement a dna sequence // Thanks Henk for supplying these functions. fn revcomp(dna: &str) -> String { diff --git a/src/bin/fasten_normalize.rs b/src/bin/fasten_normalize.rs index 037cdc74..4087c375 100644 --- a/src/bin/fasten_normalize.rs +++ b/src/bin/fasten_normalize.rs @@ -129,8 +129,6 @@ fn normalize_coverage (stdin:Stdin, target_depth:u32, paired_end:bool) { // number of reads to keep is the target depth / kmer coverage * number of reads present let mut num_reads_to_keep :usize = min( - //(target_depth as f32 / count as f32 * f.len() as f32).ceil() as usize, - //(target_depth as f32 / num_reads_orig as f32).ceil() as usize, target_depth, num_reads_orig as u32 ) as usize; diff --git a/src/bin/fasten_trim.rs b/src/bin/fasten_trim.rs index d52d440e..eaf352da 100644 --- a/src/bin/fasten_trim.rs +++ b/src/bin/fasten_trim.rs @@ -1,17 +1,39 @@ -//! Blunt-end trims using 0-based coordinates +//! Trims reads using 0-based coordinates //! //! # Examples //! -//! ## Trim five bases from the right side +//! ## Adapters +//! +//! ### Download the adapter files +//! +//! ```bash +//! mkdir -pv $HOME/db +//! pushd $HOME/db # step into the db directory +//! git clone https://github.com/lskatz/adapterseqs +//! ADAPTERS=$(find $HOME/db/adapterseqs -name '*.fa') +//! popd # return to the original directory +//! ``` +//! +//! ### Trim the adapters +//! +//! ```bash +//! cat file.fastq | fasten_trim +//! ``` +//! +//! ## Blunt-end trim five bases from the right side +//! //! ```bash //! cat file.fastq | fasten_trim -l -5 > trimmed.fastq //! ``` //! -//! ## Keep a maximum of 100bp +//! ## Keep a maximum of 100bp with blunt-end trimming on the right side +//! //! ```bash //! cat file.fastq | fasten_trim -l 99 > trimmed.fastq //! ``` -//! ## Trim 5bp from the left side +//! +//! ## Blunt-end trim 5bp from the left side +//! //! ```bash //! cat file.fastq | fasten_trim -f 4 > trimmed.fastq //! ``` @@ -28,9 +50,22 @@ //! -v, --verbose Print more status messages //! -f, --first-base INT //! The first base to keep (default: 0) -//! -l, --last-base INT The last base to keep. If negative, counts from the -//! right. (default: 0) +//! -l, --last-base INT The last base to keep. (default: 0) //! ``` +//! +//! # Notes +//! +//! The algorithm is as follows: +//! +//! 1. marks the first and last bases for trimming as 0 and the last base, respectively +//! 2. if an adapter is found at the beginning of the sequence, then move the marker for where it will be trimmed +//! 3. Compare the blunt end suggested trimming against where an adapter might be found and move the marker as the most inward possible +//! 4. Trim the sequence and quality strings +//! +//! # Output +//! +//! The deflines will be altered with a description of the trimming in brackets, e.g., +//! [trimmed_adapter_rev=TT] [trimmed_left=0] [trimmed_right=250] extern crate fasten; extern crate statistical; @@ -39,11 +74,16 @@ extern crate threadpool; use std::fs::File; use std::io::BufReader; -use std::cmp::min; +use std::cmp::{min,max}; +use std::process::exit; + +use std::collections::HashMap; +use std::io::BufRead; use fasten::fasten_base_options; use fasten::fasten_base_options_matches; use fasten::logmsg; +use fasten::reverse_complement; use fasten::io::fastq; use fasten::io::seq::Seq; @@ -53,9 +93,42 @@ fn main(){ // script-specific options opts.optopt("f","first-base","The first base to keep (default: 0)","INT"); opts.optopt("l","last-base","The last base to keep (default: 0)","INT"); + opts.optopt("a","adapterseqs","fasta file of adapters","path/to/file.fa"); let matches = fasten_base_options_matches("Blunt-end trims using 0-based coordinates", opts); + let adapterseqs:String = { + if matches.opt_present("adapterseqs") { + matches.opt_str("adapterseqs") + .expect("ERROR: could not understand parameter --adapterseqs") + } else { + "".to_string() + } + }; + + // store the adapter sequences as a vector of strings + let mut adapters:Vec = Vec::new(); + if matches.opt_present("adapterseqs") && adapterseqs.len() > 0 { + // check that the file path exists + // if not, exit with an error + if !std::path::Path::new(&adapterseqs).exists() { + logmsg(format!("ERROR: adapter file {} does not exist", &adapterseqs)); + exit(1); + } + + // read the adapter sequences from the fasta file + adapters = read_fasta(&adapterseqs) + .values() + .map(|x| x.to_string()) + .collect(); + } + + //if matches.opt_present("verbose") { + // //logmsg(&adapters); + // eprintln!("Adapters: {:?}", adapters); + // exit(3); + //} + let first_base:usize ={ if matches.opt_present("first-base") { matches.opt_str("first-base") @@ -100,32 +173,107 @@ fn main(){ let fastq_iter = fastq_reader.into_iter(); for seq in fastq_iter { - let trimmed:String = trim_worker(seq, first_base, last_base); + let trimmed:String = trim_worker(seq, first_base, last_base, &adapters); println!("{}", trimmed); } } /// Trim a set of fastq entries and send it to a channel -fn trim_worker(seq:Seq, first_base:usize, last_base:usize ) -> String { +fn trim_worker(seq:Seq, suggested_first_base:usize, suggested_last_base:usize, adapters:&Vec ) -> String { + // In this function, keep track of where the first and + // last base would be trimmed with a simple marker. + // Most instances of the word "trimming" in this function is just moving first_base and last_base. + let mut first_base = 0; // The last position is either the last_base parameter // or the last position in the string, whichever is less. - let last_base_tmp = match last_base { - // But if the position is not specified, then it is the seq length - 0 => { - // zero based - seq.seq.len()-1 - }, - _ => { - min(seq.seq.len()-1, last_base) + let mut last_base = seq.seq.len()-1; + + // Make note of what is trimmed + let mut description = String::new(); + + // First, run the adapter trimming, before any blunt end trimming + + // First, detect if there are any adapters in the sequence + // If there are, then trim the sequence at the adapter + for adapter in adapters { + let adapter_length = adapter.len(); + + // If the adapter is longer than the sequence, skip it: it won't exist in the sequence as a whole adapter. + if adapter_length > seq.seq.len() { + continue; + } + + // Check if the adapter is at the beginning of the sequence + if &seq.seq[0..adapter_length] == adapter { + first_base = adapter_length; + description.push_str(&format!(" [trimmed_adapter_fwd={}]", &adapter)); + } + + // Check if the revcom is at the end of the sequence + let revcom = reverse_complement(&adapter); + let end_slice: &str = &seq.seq[&seq.seq.len()-1 - adapter_length..].trim(); + if end_slice == revcom { + last_base = seq.seq.len() - adapter_length; + description.push_str(&format!(" [trimmed_adapter_rev={}]", &revcom)); + } + } + + // Next, run the blunt end trimming. + // Take the maximum between the suggested left trim and the current left trim. + // If the left trim is longer than the sequence length, then omit a warning and do not trim. + first_base = max(first_base, suggested_first_base); + if first_base >= seq.seq.len() { + logmsg("Warning: the left trim is longer than the sequence length. Skipping."); + first_base = 0; + } + + // Take the minimum between the suggested right trim and the current right trim. + // If the last base is less than 1, then omit a warning and do not trim. + last_base = { + if suggested_last_base == 0 { + last_base + } else { + min(last_base, suggested_last_base) } }; + if last_base < 1 { + logmsg("Warning: the right trim is longer than the sequence length. Skipping."); + last_base = seq.seq.len()-1; + } - let sequence = &seq.seq[first_base..last_base_tmp]; - let quality = &seq.qual[first_base..last_base_tmp]; + description.push_str(&format!(" [trimmed_left={}] [trimmed_right={}]", first_base, last_base)); - let trimmed = format!("{}\n{}\n+\n{}", seq.id, sequence, quality); + let sequence = &seq.seq[first_base..last_base]; + let quality = &seq.qual[first_base..last_base]; + + let trimmed = format!("{}{}\n{}\n+\n{}", seq.id, description, sequence, quality); return trimmed; } - + +// Taken from https://medium.com/bioinformatics-with-rust/how-to-read-a-fasta-file-9472b77589f7 +/// Read a fasta file and return a HashMap of the sequences +fn read_fasta(file_path: &str) -> HashMap { + let mut data = HashMap::new(); + let file = File::open(file_path).expect("Invalid filepath"); + let reader = BufReader::new(file); + + let mut seq_id = String::new(); + + for line in reader.lines() { + let line = line.unwrap(); + + // Check if the line starts with '>' (indicating a sequence ID or header) + if line.starts_with('>') { + seq_id = line.trim_start_matches('>').to_string(); + } else { + // If it's a DNA sequence line, insert or update the HashMap entry + // If seq_id is not present, insert a new entry with an empty String + // Then append the current line to the existing DNA sequence + data.entry(seq_id.clone()).or_insert_with(String::new).push_str(&line); + } + } + + data +} diff --git a/src/lib.rs b/src/lib.rs index 30b8d08c..4809e9a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,7 @@ extern crate statistical; extern crate getopts; use std::env; use std::path::Path; +use std::collections::HashMap; use getopts::Options; use getopts::Matches; @@ -162,3 +163,29 @@ pub fn logmsg>(stringlike: S) { eprintln!("{}: {}", &program, str_ref); } +/// Reverse complement a DNA sequence. +/// Take into account lowercase vs uppercase. +/// Ambiguity codes are also handled. +pub fn reverse_complement(dna: &str) -> String { + // Create a mapping for complement bases, including ambiguity codes. + let complement_map: HashMap = [ + ('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'), + ('R', 'Y'), ('Y', 'R'), ('S', 'S'), ('W', 'W'), + ('K', 'M'), ('M', 'K'), ('B', 'V'), ('V', 'B'), + ('D', 'H'), ('H', 'D'), ('N', 'N'), + ('a', 't'), ('t', 'a'), ('g', 'c'), ('c', 'g'), + ('r', 'y'), ('y', 'r'), ('s', 's'), ('w', 'w'), + ('k', 'm'), ('m', 'k'), ('b', 'v'), ('v', 'b'), + ('d', 'h'), ('h', 'd'), ('n', 'n'), + ] + .iter() + .cloned() + .collect(); + + // Generate the reverse complement. + dna.chars() + .rev() + .map(|base| complement_map.get(&base).cloned().unwrap_or('N')) // Default to 'N' for unknown bases. + .collect() +} +