From c1a590595b6d69a85cf28e5bb9b83dbf6c40fb5b Mon Sep 17 00:00:00 2001
From: Lee Katz - Aspen <gzu2@cdc.gov>
Date: Fri, 3 Jan 2025 22:28:37 -0500
Subject: [PATCH] trim-adapters

---
 Cargo.toml                  |   1 +
 src/bin/fasten_kmer.rs      |  64 +++++++++++-
 src/bin/fasten_normalize.rs |   2 -
 src/bin/fasten_trim.rs      | 190 ++++++++++++++++++++++++++++++++----
 src/lib.rs                  |  27 +++++
 5 files changed, 260 insertions(+), 24 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 47f911f2..088ed8c5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -104,6 +104,7 @@ path = "src/bin/fasten_normalize.rs"
 
 [dependencies]
 regex        = "1.10"
+fancy-regex  = "0.13"
 getopts      = "0.2.21"
 statistical  = "1.0"
 multiqueue   = "0.3.2"
diff --git a/src/bin/fasten_kmer.rs b/src/bin/fasten_kmer.rs
index d5c0a470..bb512f5d 100644
--- a/src/bin/fasten_kmer.rs
+++ b/src/bin/fasten_kmer.rs
@@ -48,16 +48,22 @@
 extern crate fasten;
 extern crate statistical;
 extern crate getopts;
+extern crate rand;
+extern crate fancy_regex;
 
 use std::io::BufReader;
 use std::io::BufRead;
 use std::io::stdin;
 use std::io::Stdin;
+use rand::Rng;
+//use rand::seq::SliceRandom;
 
 use fasten::fasten_base_options;
 use fasten::fasten_base_options_matches;
 use fasten::logmsg;
 
+use fancy_regex::Regex;
+
 use std::collections::HashMap;
 
 /// Glues together paired end reads internally and is a
@@ -149,6 +155,11 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo
 
     // keep track of which sequences start with which kmers
     let mut kmer_to_seqs :HashMap<String, Vec<String>> = HashMap::new();
+
+    // Some randomness
+    let mut rng = rand::thread_rng();
+    // Regular expression to find low-complexity kmers: any N, a base repeated 3+ times, or a repeated dinucleotide
+    let low_complexity = Regex::new(r"N|(.)\1{2,}|(.)(.)(\2\3){1,}").unwrap();
     
     // read the file
     let my_buffer=BufReader::new(stdin);
@@ -170,6 +181,7 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo
                 or_insert(0);
             *kmer_count += value;
         }
+        let kmer_keys: Vec<&String> = entry_kmers.keys().collect();
 
         // If this is paired end and if we're saving the second pair's
         // read, then reserve a declaired variable here for the string.
@@ -201,7 +213,24 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo
 
         // Remember the read that initiated this
         if remember_reads {
-            let init_kmer = String::from(&seq[0..kmer_length]);
+            // Get the kmer substring
+            // let init_kmer = String::from(&seq[0..kmer_length]);
+            // Get the minimizer as the initial kmer
+            //let init_kmer = calculate_minimizer(&seq, kmer_length);
+
+            //let init_kmer_pos = rand::thread_rng().gen_range(0 .. seq.len() - kmer_length + 1);
+            //let init_kmer = seq[init_kmer_pos .. init_kmer_pos+kmer_length].to_string();
+            let mut random_index = rng.gen_range(0..kmer_keys.len());
+            let mut init_kmer    = kmer_keys[random_index].to_string();
+            let mut kmer_tries :u16 = 0;
+            while kmer_tries < 1000 && low_complexity.is_match(&init_kmer).unwrap() {
+                random_index = rng.gen_range(0..kmer_keys.len());
+                init_kmer    = kmer_keys[random_index].to_string();
+                kmer_tries  += 1;
+                //fasten::logmsg(format!("skipping kmer {}, try {}",&init_kmer, kmer_tries));
+            }
+
+            // Get the vector of sequences for this kmer, or else initialize an empty vector
             let init_kmer_vec = kmer_to_seqs.entry(init_kmer).or_insert(vec![]);
 
             // get the formatted entry
@@ -276,6 +305,39 @@ fn kmers_in_str (seq:&str, kmer_length:usize, should_revcomp:bool) -> HashMap<St
     return kmer_hash;
 }
 
+/*
+fn calculate_minimizer(sequence: &str, k: usize) -> String {
+    // Ensure the sequence length is greater than or equal to k
+    if sequence.len() < k {
+        panic!("Sequence length is less than k");
+    }
+
+    // Initialize variables to keep track of the minimum k-mer and its hash
+    let mut min_kmer = &sequence[0..k];
+    let mut min_hash = hash_kmer(min_kmer);
+
+    // Iterate over the sequence to find the minimum k-mer and its hash
+    for i in 1..=(sequence.len() - k) {
+        let current_kmer = &sequence[i..(i + k)];
+        let current_hash = hash_kmer(current_kmer);
+
+        // Update the minimum k-mer and its hash if a smaller hash is found
+        if current_hash < min_hash {
+            min_kmer = current_kmer;
+            min_hash = current_hash;
+        }
+    }
+
+    // Return the minimizer k-mer
+    min_kmer.to_string()
+}
+
+fn hash_kmer(kmer: &str) -> u64 {
+    // Simple hash function that converts each nucleotide to its ASCII value and sums them up
+    kmer.bytes().map(|b| b as u64).sum()
+}
+*/
+
 /// reverse-complement a dna sequence
 // Thanks Henk for supplying these functions.
 fn revcomp(dna: &str) -> String {
diff --git a/src/bin/fasten_normalize.rs b/src/bin/fasten_normalize.rs
index 037cdc74..4087c375 100644
--- a/src/bin/fasten_normalize.rs
+++ b/src/bin/fasten_normalize.rs
@@ -129,8 +129,6 @@ fn normalize_coverage (stdin:Stdin, target_depth:u32, paired_end:bool) {
 
         // number of reads to keep is the target depth / kmer coverage * number of reads present
         let mut num_reads_to_keep :usize = min(
-            //(target_depth as f32 / count as f32 * f.len() as f32).ceil() as usize,
-            //(target_depth as f32 / num_reads_orig as f32).ceil() as usize,
             target_depth,
             num_reads_orig as u32
         ) as usize;
diff --git a/src/bin/fasten_trim.rs b/src/bin/fasten_trim.rs
index d52d440e..eaf352da 100644
--- a/src/bin/fasten_trim.rs
+++ b/src/bin/fasten_trim.rs
@@ -1,17 +1,39 @@
-//! Blunt-end trims using 0-based coordinates
+//! Trims reads using 0-based coordinates
 //! 
 //! # Examples
 //! 
-//! ## Trim five bases from the right side
+//! ## Adapters
+//! 
+//! ### Download the adapter files
+//! 
+//! ```bash
+//! mkdir -pv $HOME/db
+//! pushd $HOME/db # step into the db directory
+//! git clone https://github.com/lskatz/adapterseqs
+//! ADAPTERS=$(find $HOME/db/adapterseqs -name '*.fa')
+//! popd # return to the original directory
+//! ```
+//! 
+//! ### Trim the adapters
+//! 
+//! ```bash
+//! cat file.fastq | fasten_trim --adapterseqs $ADAPTERS > trimmed.fastq
+//! ```
+//! 
+//! ## Blunt-end trim five bases from the right side
+//! 
 //! ```bash
 //! cat file.fastq | fasten_trim -l -5 > trimmed.fastq
 //! ```
 //!
-//! ## Keep a maximum of 100bp
+//! ## Keep a maximum of 100bp with blunt-end trimming on the right side
+//! 
 //! ```bash
 //! cat file.fastq | fasten_trim -l 99 > trimmed.fastq
 //! ```
-//! ## Trim 5bp from the left side
+//! 
+//! ## Blunt-end trim 5bp from the left side
+//! 
 //! ```bash
 //! cat file.fastq | fasten_trim -f 4  > trimmed.fastq
 //! ```
@@ -28,9 +50,22 @@
 //!     -v, --verbose       Print more status messages
 //!     -f, --first-base INT
 //!                         The first base to keep (default: 0)
-//!     -l, --last-base INT The last base to keep. If negative, counts from the
-//!                         right. (default: 0)
+//!     -l, --last-base INT The last base to keep. (default: 0)
 //! ```
+//! 
+//! # Notes
+//! 
+//! The algorithm is as follows:
+//! 
+//! 1. Mark the first and last bases to keep as 0 and the last base of the read, respectively
+//! 2. If an adapter is found at the beginning of the read, or its reverse complement at the end, move the corresponding marker inward past it
+//! 3. Compare the blunt-end trimming requested with `-f`/`-l` against the adapter markers and keep whichever is most inward
+//! 4. Trim the sequence and quality strings
+//! 
+//! # Output
+//! 
+//! The deflines will be altered with a description of the trimming in brackets, e.g.,
+//! [trimmed_adapter_rev=TT] [trimmed_left=0] [trimmed_right=250]
 
 extern crate fasten;
 extern crate statistical;
@@ -39,11 +74,16 @@ extern crate threadpool;
 
 use std::fs::File;
 use std::io::BufReader;
-use std::cmp::min;
+use std::cmp::{min,max};
+use std::process::exit;
+
+use std::collections::HashMap;
+use std::io::BufRead;
 
 use fasten::fasten_base_options;
 use fasten::fasten_base_options_matches;
 use fasten::logmsg;
+use fasten::reverse_complement;
 use fasten::io::fastq;
 use fasten::io::seq::Seq;
 
@@ -53,9 +93,42 @@ fn main(){
     // script-specific options
     opts.optopt("f","first-base","The first base to keep (default: 0)","INT");
     opts.optopt("l","last-base","The last base to keep (default: 0)","INT");
+    opts.optopt("a","adapterseqs","fasta file of adapters","path/to/file.fa");
 
     let matches = fasten_base_options_matches("Blunt-end trims using 0-based coordinates", opts);
 
+    let adapterseqs:String = {
+        if matches.opt_present("adapterseqs") {
+            matches.opt_str("adapterseqs")
+                .expect("ERROR: could not understand parameter --adapterseqs")
+        } else {
+            "".to_string()
+        }
+    };
+
+    // store the adapter sequences as a vector of strings
+    let mut adapters:Vec<String> = Vec::new();
+    if matches.opt_present("adapterseqs") && adapterseqs.len() > 0 {
+        // check that the file path exists
+        // if not, exit with an error
+        if !std::path::Path::new(&adapterseqs).exists() {
+            logmsg(format!("ERROR: adapter file {} does not exist", &adapterseqs));
+            exit(1);
+        }
+
+        // read the adapter sequences from the fasta file
+        adapters = read_fasta(&adapterseqs)
+            .values()
+            .map(|x| x.to_string())
+            .collect();
+    }
+    
+    //if matches.opt_present("verbose") { 
+    //    //logmsg(&adapters); 
+    //    eprintln!("Adapters: {:?}", adapters);
+    //    exit(3); 
+    //}
+
     let first_base:usize ={
         if matches.opt_present("first-base") {
             matches.opt_str("first-base")
@@ -100,32 +173,107 @@ fn main(){
     let fastq_iter  = fastq_reader.into_iter();
     for seq in fastq_iter {
 
-        let trimmed:String = trim_worker(seq, first_base, last_base);
+        let trimmed:String = trim_worker(seq, first_base, last_base, &adapters);
         println!("{}", trimmed);
     }
 }
 
 /// Trim a set of fastq entries and send it to a channel
-fn trim_worker(seq:Seq, first_base:usize, last_base:usize ) -> String {
+fn trim_worker(seq:Seq, suggested_first_base:usize, suggested_last_base:usize, adapters:&[String] ) -> String {
 
+    // Purpose: trim one fastq entry by adapters and by blunt-end coordinates, returning
+    // the formatted entry with a description of what was trimmed appended to its defline.
+    // "Trimming" below only moves the first_base/last_base markers; the slicing happens once at the end.
+    let mut first_base = 0;
     // The last position is either the last_base parameter
     // or the last position in the string, whichever is less.
-    let last_base_tmp = match last_base {
-        // But if the position is not specified, then it is the seq length
-        0 => {
-            // zero based
-            seq.seq.len()-1
-        },
-        _ => {
-            min(seq.seq.len()-1, last_base)
+    let mut last_base = seq.seq.len().saturating_sub(1); // saturating: an empty read must not underflow
+
+    // Make note of what is trimmed
+    let mut description = String::new();
+
+    // First, run the adapter trimming, before any blunt end trimming
+
+    // First, detect if there are any adapters in the sequence
+    // If there are, then trim the sequence at the adapter
+    for adapter in adapters {
+        let adapter_length = adapter.len();
+        
+        // An empty adapter would match every read and one longer than the read cannot occur in it: skip both.
+        if adapter_length == 0 || adapter_length > seq.seq.len() {
+            continue;
+        }
+        
+        // Check if the adapter is at the beginning of the sequence
+        if seq.seq.starts_with(adapter.as_str()) {
+            first_base = adapter_length;
+            description.push_str(&format!(" [trimmed_adapter_fwd={}]", &adapter));
+        }
+        
+        // Check if the revcom is at the end; the window is one char wider than the adapter, then trimmed (NOTE(review): presumably for trailing whitespace in seq.seq - confirm)
+        let revcom = reverse_complement(&adapter);
+        let end_slice: &str = seq.seq[(seq.seq.len()-1).saturating_sub(adapter_length)..].trim();
+        if end_slice == revcom {
+            last_base = seq.seq.len() - adapter_length;
+            description.push_str(&format!(" [trimmed_adapter_rev={}]", &revcom));
+        }
+    }
+
+    // Next, run the blunt end trimming.
+    // Take the maximum between the suggested left trim and the current left trim.
+    // If the left trim is longer than the sequence length, then emit a warning and do not trim.
+    first_base = max(first_base, suggested_first_base);
+    if first_base >= seq.seq.len() {
+        logmsg("Warning: the left trim is longer than the sequence length.  Skipping.");
+        first_base = 0;
+    }
+
+    // Take the minimum between the suggested right trim and the current right trim.
+    // If the last base is less than 1 or lands left of first_base, then emit a warning and do not trim the right side.
+    last_base = {
+        if suggested_last_base == 0 {
+            last_base
+        } else {
+            min(last_base, suggested_last_base)
         }
     };
+    if last_base < 1 || last_base < first_base {
+        logmsg("Warning: the right trim is longer than the sequence length.  Skipping.");
+        last_base = seq.seq.len().saturating_sub(1);
+    }
 
-    let sequence = &seq.seq[first_base..last_base_tmp];
-    let quality  = &seq.qual[first_base..last_base_tmp];
+    description.push_str(&format!(" [trimmed_left={}] [trimmed_right={}]", first_base, last_base));
 
-    let trimmed = format!("{}\n{}\n+\n{}", seq.id, sequence, quality);
+    let sequence = &seq.seq[first_base..last_base];
+    let quality  = &seq.qual[first_base..last_base];
+
+    let trimmed = format!("{}{}\n{}\n+\n{}", seq.id, description, sequence, quality);
     return trimmed;
 }
-  
+
+// Taken from https://medium.com/bioinformatics-with-rust/how-to-read-a-fasta-file-9472b77589f7
+/// Read a fasta file and return a HashMap of defline (minus the leading '>') => sequence.
+/// Sequences wrapped over several lines are joined; whitespace around each line is
+/// stripped and blank lines are skipped so that no empty sequence is ever stored.
+///
+/// # Panics
+/// Panics if the file cannot be opened or if a line cannot be read from it.
+fn read_fasta(file_path: &str) -> HashMap<String, String> {
+    let mut data: HashMap<String, String> = HashMap::new();
+    let file = File::open(file_path).expect("Invalid filepath");
+    let reader = BufReader::new(file);
+
+    // Defline of the record currently being read; empty until the first '>' line
+    let mut seq_id = String::new();
+    for line in reader.lines() {
+        let line = line.expect("ERROR: could not read a line from the fasta file");
+        let line = line.trim();
+        if line.starts_with('>') {
+            seq_id = line.trim_start_matches('>').to_string();
+        } else if !line.is_empty() {
+            data.entry(seq_id.clone()).or_default().push_str(line);
+        }
+    }
+    data
+}
 
diff --git a/src/lib.rs b/src/lib.rs
index 30b8d08c..4809e9a1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -65,6 +65,7 @@ extern crate statistical;
 extern crate getopts;
 use std::env;
 use std::path::Path;
+use std::collections::HashMap;
 
 use getopts::Options;
 use getopts::Matches;
@@ -162,3 +163,29 @@ pub fn logmsg<S: AsRef<str>>(stringlike: S) {
     eprintln!("{}: {}", &program, str_ref);
 }
 
+/// Return the reverse complement of a DNA sequence.
+/// Case is preserved and IUPAC ambiguity codes are complemented as well;
+/// any character that is not a known base code is emitted as `N`.
+pub fn reverse_complement(dna: &str) -> String {
+    // Uppercase bases paired with their complements, ambiguity codes included.
+    const UPPERCASE_PAIRS: [(char, char); 15] = [
+        ('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'),
+        ('R', 'Y'), ('Y', 'R'), ('S', 'S'), ('W', 'W'),
+        ('K', 'M'), ('M', 'K'), ('B', 'V'), ('V', 'B'),
+        ('D', 'H'), ('H', 'D'), ('N', 'N'),
+    ];
+    // Build the lookup table, adding a lowercase twin for every pair.
+    let mut complement_of: HashMap<char, char> = HashMap::new();
+    for &(base, partner) in UPPERCASE_PAIRS.iter() {
+        complement_of.insert(base, partner);
+        complement_of.insert(base.to_ascii_lowercase(), partner.to_ascii_lowercase());
+    }
+
+    // Walk the sequence back to front; unrecognized characters become 'N'.
+    let mut revcomp = String::with_capacity(dna.len());
+    for base in dna.chars().rev() {
+        revcomp.push(complement_of.get(&base).copied().unwrap_or('N'));
+    }
+    revcomp
+}
+