Commit

trim-adapters
lskatz committed Jan 4, 2025
1 parent 93e7ae9 commit c1a5905
Showing 5 changed files with 260 additions and 24 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -104,6 +104,7 @@ path = "src/bin/fasten_normalize.rs"

[dependencies]
regex = "1.10"
fancy-regex = "0.13"
getopts = "0.2.21"
statistical = "1.0"
multiqueue = "0.3.2"
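A minimal sketch (not part of the commit) of why `fancy-regex` joins the dependencies: the low-complexity pattern added in `fasten_kmer.rs` below uses backreferences, which the plain `regex` crate rejects at pattern-compile time, while `fancy-regex` 0.13 compiles and matches it.

```rust
use fancy_regex::Regex;

fn main() {
    // Same pattern as in fasten_kmer.rs: N, homopolymer runs, or dinucleotide repeats.
    let low_complexity = Regex::new(r"N|(.)\1{2,}|(.)(.)(\2\3){1,}").unwrap();
    // fancy-regex returns a Result from is_match(), hence the unwrap() calls.
    assert!(low_complexity.is_match("GGAAAATC").unwrap());  // homopolymer run "AAAA"
    assert!(low_complexity.is_match("ATATATGC").unwrap());  // dinucleotide repeat "ATATAT"
    assert!(!low_complexity.is_match("ACGTGCAG").unwrap()); // complex enough to pass
}
```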
64 changes: 63 additions & 1 deletion src/bin/fasten_kmer.rs
@@ -48,16 +48,22 @@
extern crate fasten;
extern crate statistical;
extern crate getopts;
extern crate rand;
extern crate fancy_regex;

use std::io::BufReader;
use std::io::BufRead;
use std::io::stdin;
use std::io::Stdin;
use rand::Rng;
//use rand::seq::SliceRandom;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;

use fancy_regex::Regex;

use std::collections::HashMap;

/// Glues together paired end reads internally and is a
@@ -149,6 +155,11 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo

// keep track of which sequences start with which kmers
let mut kmer_to_seqs :HashMap<String, Vec<String>> = HashMap::new();

// Some randomness
let mut rng = rand::thread_rng();
// Regular expression to find low-complexity kmers
let low_complexity = Regex::new(r"N|(.)\1{2,}|(.)(.)(\2\3){1,}").unwrap();
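// The three alternatives above flag a kmer as low-complexity if it contains an N,
// a run of three or more identical bases (e.g. "AAA"), or a repeated dinucleotide
// (e.g. "ATAT"); the backreferences (\1, \2, \3) are why fancy-regex is used
// instead of the plain regex crate.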

// read the file
let my_buffer=BufReader::new(stdin);
@@ -170,6 +181,7 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo
or_insert(0);
*kmer_count += value;
}
let kmer_keys: Vec<&String> = entry_kmers.keys().collect();

// If this is paired end and we're saving the second pair's read,
// then declare a variable here for the string.
@@ -201,7 +213,24 @@ fn count_kmers (stdin:Stdin, kmer_length:usize, revcomp:bool, remember_reads:boo

// Remember the read that initiated this
if remember_reads {
let init_kmer = String::from(&seq[0..kmer_length]);
// Get the kmer substring
// let init_kmer = String::from(&seq[0..kmer_length]);
// Get the minimizer as the initial kmer
//let init_kmer = calculate_minimizer(&seq, kmer_length);

//let init_kmer_pos = rand::thread_rng().gen_range(0 .. seq.len() - kmer_length + 1);
//let init_kmer = seq[init_kmer_pos .. init_kmer_pos+kmer_length].to_string();
let mut random_index = rng.gen_range(0..kmer_keys.len());
let mut init_kmer = kmer_keys[random_index].to_string();
let mut kmer_tries :u16 = 0;
while kmer_tries < 1000 && low_complexity.is_match(&init_kmer).unwrap() {
random_index = rng.gen_range(0..kmer_keys.len());
init_kmer = kmer_keys[random_index].to_string();
kmer_tries += 1;
//fasten::logmsg(format!("skipping kmer {}, try {}",&init_kmer, kmer_tries));
}
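// Note: if every randomly picked kmer within the 1000 tries is low-complexity,
// the last pick is kept as init_kmer anyway.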

// Get the vector of sequences for this kmer, or else initialize an empty vector
let init_kmer_vec = kmer_to_seqs.entry(init_kmer).or_insert(vec![]);

// get the formatted entry
@@ -276,6 +305,39 @@ fn kmers_in_str (seq:&str, kmer_length:usize, should_revcomp:bool) -> HashMap<St
return kmer_hash;
}

/*
fn calculate_minimizer(sequence: &str, k: usize) -> String {
// Ensure the sequence length is greater than or equal to k
if sequence.len() < k {
panic!("Sequence length is less than k");
}
// Initialize variables to keep track of the minimum k-mer and its hash
let mut min_kmer = &sequence[0..k];
let mut min_hash = hash_kmer(min_kmer);
// Iterate over the sequence to find the minimum k-mer and its hash
for i in 1..=(sequence.len() - k) {
let current_kmer = &sequence[i..(i + k)];
let current_hash = hash_kmer(current_kmer);
// Update the minimum k-mer and its hash if a smaller hash is found
if current_hash < min_hash {
min_kmer = current_kmer;
min_hash = current_hash;
}
}
// Return the minimizer k-mer
min_kmer.to_string()
}
fn hash_kmer(kmer: &str) -> u64 {
// Simple hash function that converts each nucleotide to its ASCII value and sums them up
kmer.bytes().map(|b| b as u64).sum()
}
*/

/// reverse-complement a dna sequence
// Thanks Henk for supplying these functions.
fn revcomp(dna: &str) -> String {
2 changes: 0 additions & 2 deletions src/bin/fasten_normalize.rs
@@ -129,8 +129,6 @@ fn normalize_coverage (stdin:Stdin, target_depth:u32, paired_end:bool) {

// number of reads to keep is the target depth / kmer coverage * number of reads present
let mut num_reads_to_keep :usize = min(
//(target_depth as f32 / count as f32 * f.len() as f32).ceil() as usize,
//(target_depth as f32 / num_reads_orig as f32).ceil() as usize,
target_depth,
num_reads_orig as u32
) as usize;
190 changes: 169 additions & 21 deletions src/bin/fasten_trim.rs
@@ -1,17 +1,39 @@
//! Blunt-end trims using 0-based coordinates
//! Trims reads using 0-based coordinates
//!
//! # Examples
//!
//! ## Trim five bases from the right side
//! ## Adapters
//!
//! ### Download the adapter files
//!
//! ```bash
//! mkdir -pv $HOME/db
//! pushd $HOME/db # step into the db directory
//! git clone https://github.com/lskatz/adapterseqs
//! ADAPTERS=$(find $HOME/db/adapterseqs -name '*.fa')
//! popd # return to the original directory
//! ```
//!
//! ### Trim the adapters
//!
//! ```bash
//! cat file.fastq | fasten_trim -a $ADAPTERS > trimmed.fastq
//! ```
//!
//! ## Blunt-end trim five bases from the right side
//!
//! ```bash
//! cat file.fastq | fasten_trim -l -5 > trimmed.fastq
//! ```
//!
//! ## Keep a maximum of 100bp
//! ## Keep a maximum of 100bp with blunt-end trimming on the right side
//!
//! ```bash
//! cat file.fastq | fasten_trim -l 99 > trimmed.fastq
//! ```
//! ## Trim 5bp from the left side
//!
//! ## Blunt-end trim 5bp from the left side
//!
//! ```bash
//! cat file.fastq | fasten_trim -f 4 > trimmed.fastq
//! ```
@@ -28,9 +50,22 @@
//! -v, --verbose Print more status messages
//! -f, --first-base INT
//! The first base to keep (default: 0)
//! -l, --last-base INT The last base to keep. If negative, counts from the
//! right. (default: 0)
//! -l, --last-base INT The last base to keep. (default: 0)
//! ```
//!
//! # Notes
//!
//! The algorithm is as follows:
//!
//! 1. Mark the first and last bases to keep as position 0 and the last base of the read, respectively
//! 2. If an adapter is found at the beginning of the sequence, move the first-base marker to just past the adapter
//! 3. Compare the suggested blunt-end trim positions against the adapter-based markers and keep whichever markers are farther inward
//! 4. Trim the sequence and quality strings
//!
//! # Output
//!
//! The deflines will be altered with a description of the trimming in brackets, e.g.,
//! [trimmed_adapter_rev=TT] [trimmed_left=0] [trimmed_right=250]
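//!
//! As an illustration (the read name, adapter, and lengths here are hypothetical, not taken from the commit),
//! a 250 bp read whose first 8 bases match the adapter ACGTACGT would be emitted with a defline like:
//!
//! ```text
//! @read1 [trimmed_adapter_fwd=ACGTACGT] [trimmed_left=8] [trimmed_right=249]
//! ```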
extern crate fasten;
extern crate statistical;
@@ -39,11 +74,16 @@ extern crate threadpool;

use std::fs::File;
use std::io::BufReader;
use std::cmp::min;
use std::cmp::{min,max};
use std::process::exit;

use std::collections::HashMap;
use std::io::BufRead;

use fasten::fasten_base_options;
use fasten::fasten_base_options_matches;
use fasten::logmsg;
use fasten::reverse_complement;
use fasten::io::fastq;
use fasten::io::seq::Seq;

@@ -53,9 +93,42 @@ fn main(){
// script-specific options
opts.optopt("f","first-base","The first base to keep (default: 0)","INT");
opts.optopt("l","last-base","The last base to keep (default: 0)","INT");
opts.optopt("a","adapterseqs","fasta file of adapters","path/to/file.fa");

let matches = fasten_base_options_matches("Blunt-end trims using 0-based coordinates", opts);

let adapterseqs:String = {
if matches.opt_present("adapterseqs") {
matches.opt_str("adapterseqs")
.expect("ERROR: could not understand parameter --adapterseqs")
} else {
"".to_string()
}
};

// store the adapter sequences as a vector of strings
let mut adapters:Vec<String> = Vec::new();
if matches.opt_present("adapterseqs") && adapterseqs.len() > 0 {
// check that the file path exists
// if not, exit with an error
if !std::path::Path::new(&adapterseqs).exists() {
logmsg(format!("ERROR: adapter file {} does not exist", &adapterseqs));
exit(1);
}

// read the adapter sequences from the fasta file
adapters = read_fasta(&adapterseqs)
.values()
.map(|x| x.to_string())
.collect();
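// Only the sequence strings (the values of the fasta map) are kept here;
// the fasta headers are discarded.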
}

//if matches.opt_present("verbose") {
// //logmsg(&adapters);
// eprintln!("Adapters: {:?}", adapters);
// exit(3);
//}

let first_base:usize ={
if matches.opt_present("first-base") {
matches.opt_str("first-base")
@@ -100,32 +173,107 @@ fn main(){
let fastq_iter = fastq_reader.into_iter();
for seq in fastq_iter {

let trimmed:String = trim_worker(seq, first_base, last_base);
let trimmed:String = trim_worker(seq, first_base, last_base, &adapters);
println!("{}", trimmed);
}
}

/// Trim a set of fastq entries and send it to a channel
fn trim_worker(seq:Seq, first_base:usize, last_base:usize ) -> String {
fn trim_worker(seq:Seq, suggested_first_base:usize, suggested_last_base:usize, adapters:&Vec<String> ) -> String {

// In this function, keep track of where the first and
// last base would be trimmed with a simple marker.
// Most instances of the word "trimming" in this function are just moving first_base and last_base.
let mut first_base = 0;
// The last position is either the last_base parameter
// or the last position in the string, whichever is less.
let last_base_tmp = match last_base {
// But if the position is not specified, then it is the seq length
0 => {
// zero based
seq.seq.len()-1
},
_ => {
min(seq.seq.len()-1, last_base)
let mut last_base = seq.seq.len()-1;

// Make note of what is trimmed
let mut description = String::new();

// First, run the adapter trimming, before any blunt end trimming

// First, detect if there are any adapters in the sequence
// If there are, then trim the sequence at the adapter
for adapter in adapters {
let adapter_length = adapter.len();

// If the adapter is longer than the sequence, skip it: it won't exist in the sequence as a whole adapter.
if adapter_length > seq.seq.len() {
continue;
}

// Check if the adapter is at the beginning of the sequence
if &seq.seq[0..adapter_length] == adapter {
first_base = adapter_length;
description.push_str(&format!(" [trimmed_adapter_fwd={}]", &adapter));
}

// Check if the revcom is at the end of the sequence
let revcom = reverse_complement(&adapter);
let end_slice: &str = &seq.seq[&seq.seq.len()-1 - adapter_length..].trim();
if end_slice == revcom {
last_base = seq.seq.len() - adapter_length;
description.push_str(&format!(" [trimmed_adapter_rev={}]", &revcom));
}
}

// Next, run the blunt end trimming.
// Take the maximum between the suggested left trim and the current left trim.
// If the left trim is longer than the sequence length, then emit a warning and do not trim.
first_base = max(first_base, suggested_first_base);
if first_base >= seq.seq.len() {
logmsg("Warning: the left trim is longer than the sequence length. Skipping.");
first_base = 0;
}

// Take the minimum between the suggested right trim and the current right trim.
// If the last base is less than 1, then emit a warning and do not trim.
last_base = {
if suggested_last_base == 0 {
last_base
} else {
min(last_base, suggested_last_base)
}
};
if last_base < 1 {
logmsg("Warning: the right trim is longer than the sequence length. Skipping.");
last_base = seq.seq.len()-1;
}

let sequence = &seq.seq[first_base..last_base_tmp];
let quality = &seq.qual[first_base..last_base_tmp];
description.push_str(&format!(" [trimmed_left={}] [trimmed_right={}]", first_base, last_base));

let trimmed = format!("{}\n{}\n+\n{}", seq.id, sequence, quality);
let sequence = &seq.seq[first_base..last_base];
let quality = &seq.qual[first_base..last_base];

let trimmed = format!("{}{}\n{}\n+\n{}", seq.id, description, sequence, quality);
return trimmed;
}


// Taken from https://medium.com/bioinformatics-with-rust/how-to-read-a-fasta-file-9472b77589f7
/// Read a fasta file and return a HashMap of the sequences
fn read_fasta(file_path: &str) -> HashMap<String, String> {
let mut data = HashMap::new();
let file = File::open(file_path).expect("Invalid filepath");
let reader = BufReader::new(file);

let mut seq_id = String::new();

for line in reader.lines() {
let line = line.unwrap();

// Check if the line starts with '>' (indicating a sequence ID or header)
if line.starts_with('>') {
seq_id = line.trim_start_matches('>').to_string();
} else {
// If it's a DNA sequence line, insert or update the HashMap entry
// If seq_id is not present, insert a new entry with an empty String
// Then append the current line to the existing DNA sequence
data.entry(seq_id.clone()).or_insert_with(String::new).push_str(&line);
}
}

data
}

