Skip to content

Commit

Permalink
Merge pull request #66 from databio/dev_64
Browse files Browse the repository at this point in the history
Fixes for #64 and #65
  • Loading branch information
donaldcampbelljr authored Jan 10, 2025
2 parents 33d4851 + bb5bc89 commit ce0967a
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
# - {os: windows-latest, r: 'release', rust-version: 'stable-msvc', rust-target: 'x86_64-pc-windows-gnu'}
- {os: macOS-latest, r: 'release', rust-version: 'stable'}
- {os: ubuntu-latest, r: 'release', rust-version: 'stable'}
- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'}
#- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
steps:
Expand Down
1 change: 1 addition & 0 deletions gtars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ base64-url = "2.0.0"
sha2 = "0.10.7"
md-5 = "0.10.5"
seq_io = "0.3.2"
serde_json = "1.0.135"


[dev-dependencies]
Expand Down
5 changes: 2 additions & 3 deletions gtars/src/common/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
///
/// - file_path: path to the file to read, or '-' for stdin
///
/// # Returns
///
/// # Returns
///
/// A `BufReader` object for a given file path or stdin.
pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
if file_path_str == "-" {
Expand All @@ -51,7 +51,6 @@ pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<d
}
}

Check warning on line 52 in gtars/src/common/utils.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/common/utils.rs#L52

Added line #L52 was not covered by tests


///
/// Create a region-to-id hash-map from a list of regions
///
Expand Down
18 changes: 9 additions & 9 deletions gtars/src/digests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,21 @@
//! # Usage
//!
//! The `sha512t24u` function can be used to compute the GA4GH sha512t24 checksum of a string.
//!
//!
//! ```rust
//! use gtars::digests::sha512t24u;
//!
//! let digest = sha512t24u("hello world");
//! ```
use std::io::prelude::{Read, Write};
use std::io;
use std::fs::File;
use std::io;
use std::io::prelude::{Read, Write};
use std::path::Path;

use anyhow::Result;
use md5::Md5;
use seq_io::fasta::{Reader, Record, RefRecord};
use sha2::{Digest, Sha512};
use seq_io::fasta::{Reader, RefRecord, Record};

use crate::common::utils::get_dynamic_reader;

Expand Down Expand Up @@ -75,7 +75,6 @@ pub fn md5(string: &str) -> String {
format!("{:x}", result)
}


/// Processes a FASTA file to compute the digests of each sequence in the file.
///
/// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence,
Expand Down Expand Up @@ -103,7 +102,8 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
let file_reader = get_dynamic_reader(&path)?;
let mut fasta_reader = Reader::new(file_reader);
let mut results = Vec::new();
while let Some(record) = fasta_reader.next() { // returns a RefRecord object
while let Some(record) = fasta_reader.next() {
// returns a RefRecord object
let record = record.expect("Error found when retrieving next record.");
let id = record.id().expect("No ID found for the FASTA record");
let mut sha512_hasher = Sha512::new();
Expand All @@ -123,7 +123,7 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
id: id.to_string(),
length: length,
sha512t24u: sha512,
md5: md5
md5: md5,
});
}
Ok(results)
Expand Down Expand Up @@ -169,10 +169,10 @@ mod tests {
assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw");
assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945");
}

#[test]
fn bogus_fasta_file() {
let result = digest_fasta("tests/data/bogus.fa");
assert!(result.is_err(), "Expected an error for a bogus fasta file");
}
}
}
12 changes: 4 additions & 8 deletions gtars/src/uniwig/counting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ pub fn start_end_counts(
chrom_size: i32,
smoothsize: i32,
stepsize: i32,
shift: i32,
) -> (Vec<u32>, Vec<i32>) {
//let vin_iter = starts_vector.iter();

let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535

Expand All @@ -55,7 +52,7 @@ pub fn start_end_counts(

adjusted_start_site = starts_vector[0]; // get first coordinate position

adjusted_start_site.0 = adjusted_start_site.0 - smoothsize + shift;
adjusted_start_site.0 = adjusted_start_site.0 - smoothsize;

current_end_site = adjusted_start_site;
current_end_site.0 = adjusted_start_site.0 + 1 + smoothsize * 2;
Expand All @@ -74,7 +71,7 @@ pub fn start_end_counts(
coordinate_value = *coord;

adjusted_start_site = coordinate_value;
adjusted_start_site.0 = coordinate_value.0 - smoothsize + shift;
adjusted_start_site.0 = coordinate_value.0 - smoothsize;

let current_score = adjusted_start_site.1;

Expand Down Expand Up @@ -164,7 +161,6 @@ pub fn core_counts(
ends_vector: &[(i32, i32)],
chrom_size: i32,
stepsize: i32,
shift: i32,
) -> (Vec<u32>, Vec<i32>) {
let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535
Expand All @@ -184,7 +180,7 @@ pub fn core_counts(
current_start_site = starts_vector[0]; // get first coordinate position
current_end_site = ends_vector[0];

current_start_site.0 = current_start_site.0 + shift;
current_start_site.0 = current_start_site.0;

if current_start_site.0 < 1 {
current_start_site.0 = 1;
Expand All @@ -201,7 +197,7 @@ pub fn core_counts(

current_start_site = coordinate_value;

current_start_site.0 = current_start_site.0 + shift;
current_start_site.0 = current_start_site.0;

let current_score = current_start_site.1;
count += current_score;
Expand Down
103 changes: 84 additions & 19 deletions gtars/src/uniwig/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use indicatif::ProgressBar;

use rayon::prelude::*;
use std::error::Error;
use std::fs::File;
use std::fs::{remove_file, File};
use std::io::{BufRead, BufReader, BufWriter, Write};

use crate::uniwig::counting::{
Expand Down Expand Up @@ -191,8 +191,8 @@ pub fn run_uniwig(matches: &ArgMatches) {
}

/// Ensures that the start position is at a minimum equal to `1`
fn clamped_start_position(start: i32, smoothsize: i32) -> i32 {
std::cmp::max(1, start - smoothsize)
fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 {
std::cmp::max(1, start - smoothsize + wig_shift)
}
/// Ensure that the start position is at a minimum equal to `0`
fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 {
Expand Down Expand Up @@ -222,8 +222,6 @@ pub fn uniwig_main(
.build()
.unwrap();

let mut wig_shift: i32 = 0; // This will be set to 1 when writing to wiggle files, else always set to 0

// Determine Input File Type
let input_filetype = FileType::from_str(filetype.to_lowercase().as_str());
// Set up output file names
Expand All @@ -238,6 +236,8 @@ pub fn uniwig_main(
meta_data_file_names[1] = format!("{}{}.{}", bwfileheader, "end", "meta");
meta_data_file_names[2] = format!("{}{}.{}", bwfileheader, "core", "meta");

let mut npy_meta_data_map: HashMap<String, HashMap<String, i32>> = HashMap::new();

let chrom_sizes = match read_chromosome_sizes(chromsizerefpath) {
// original program gets chromosome size from a .sizes file, e.g. chr1 248956422
// the original program simply pushes 0's until the end of the chromosome length and writes these to file.
Expand All @@ -252,19 +252,16 @@ pub fn uniwig_main(
match input_filetype {
//BED AND NARROWPEAK WORKFLOW
Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => {
// Pare down chromosomes if necessary
let mut final_chromosomes =
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);

// Some housekeeping depending on output type
let og_output_type = output_type; // need this later for conversion
let mut output_type = output_type;
if output_type == "bedgraph" || output_type == "bw" || output_type == "bigwig" {
output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files
}
if output_type == "wig" {
wig_shift = 1;
}

// Pare down chromosomes if necessary
let mut final_chromosomes =
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);

let bar = ProgressBar::new(final_chromosomes.len() as u64);

Expand Down Expand Up @@ -299,7 +296,6 @@ pub fn uniwig_main(
current_chrom_size,
smoothsize,
stepsize,
wig_shift,
);

match output_type {
Expand All @@ -322,6 +318,7 @@ pub fn uniwig_main(
clamped_start_position(
primary_start.0,
smoothsize,
1, //must shift wiggle starts and core by 1 since it is 1 based
),
stepsize,
current_chrom_size,
Expand Down Expand Up @@ -390,7 +387,6 @@ pub fn uniwig_main(
current_chrom_size,
smoothsize,
stepsize,
wig_shift,
);
match output_type {
"file" => {
Expand All @@ -411,6 +407,7 @@ pub fn uniwig_main(
clamped_start_position(
primary_end.0,
smoothsize,
0,
),
);
write_to_bed_graph_file(
Expand All @@ -432,6 +429,7 @@ pub fn uniwig_main(
clamped_start_position(
primary_end.0,
smoothsize,
0, // ends already 1 based, do not shift further
),
stepsize,
current_chrom_size,
Expand All @@ -450,6 +448,7 @@ pub fn uniwig_main(
clamped_start_position(
primary_end.0,
smoothsize,
0,
),
stepsize,
meta_data_file_names[1].clone(),
Expand All @@ -468,6 +467,7 @@ pub fn uniwig_main(
clamped_start_position(
primary_end.0,

Check warning on line 468 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L468

Added line #L468 was not covered by tests
smoothsize,
0,

Check warning on line 470 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L470

Added line #L470 was not covered by tests
),
stepsize,
meta_data_file_names[1].clone(),
Expand All @@ -481,7 +481,6 @@ pub fn uniwig_main(
&chromosome.ends,
current_chrom_size,
stepsize,
wig_shift,
);
match output_type {
"file" => {
Expand All @@ -499,7 +498,10 @@ pub fn uniwig_main(
let count_info: (Vec<u32>, Vec<u32>, Vec<u32>) =
compress_counts(
&mut core_results,
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),
);
write_to_bed_graph_file(
&count_info,
Expand All @@ -517,7 +519,7 @@ pub fn uniwig_main(
&core_results.0,
file_name.clone(),
chrom_name.clone(),
clamped_start_position(primary_start.0, 0),
clamped_start_position(primary_start.0, 0, 1), //starts are 1 based must be shifted by 1
stepsize,
current_chrom_size,
);
Expand All @@ -531,7 +533,10 @@ pub fn uniwig_main(
&core_results.0,
file_name.clone(),
chrom_name.clone(),
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),
stepsize,
meta_data_file_names[2].clone(),
);
Expand All @@ -546,7 +551,10 @@ pub fn uniwig_main(
&core_results.0,
file_name.clone(),
chrom_name.clone(),
primary_start.0,
clamped_start_position_zero_pos(
primary_start.0,
0,
),

Check warning on line 557 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L554-L557

Added lines #L554 - L557 were not covered by tests
stepsize,
meta_data_file_names[2].clone(),
);
Expand Down Expand Up @@ -580,6 +588,63 @@ pub fn uniwig_main(
);
}
}
"npy" => {
// populate hashmap for the npy meta data
for chromosome in final_chromosomes.iter() {
let chr_name = chromosome.chrom.clone();
let current_chrom_size =
*chrom_sizes.get(&chromosome.chrom).unwrap() as i32;
npy_meta_data_map.insert(
chr_name,
HashMap::from([
("stepsize".to_string(), stepsize),
("reported_chrom_size".to_string(), current_chrom_size),
]),
);
}

for location in vec_count_type.iter() {
let temp_meta_file_name =
format!("{}{}.{}", bwfileheader, *location, "meta");

if let Ok(file) = File::open(&temp_meta_file_name) {
let reader = BufReader::new(file);

for line in reader.lines() {
let line = line.unwrap();
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 3 {
let chrom = parts[1].split('=').nth(1).expect(
"Processing npy metadata file: Missing chromosome in line",
);
let start_str = parts[2].split('=')
.nth(1)
.expect("Processing npy metadata file: Missing start position in line");
let starting_position: i32 = start_str.parse().expect(
"Processing npy metadata file: Invalid start position",
);

if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom)
{
current_chr_data.insert(
(*location.to_string()).parse().unwrap(),
starting_position,
);
}
}

Check warning on line 634 in gtars/src/uniwig/mod.rs

View check run for this annotation

Codecov / codecov/patch

gtars/src/uniwig/mod.rs#L634

Added line #L634 was not covered by tests
}
// Remove the file after it is used.
let path = std::path::Path::new(&temp_meta_file_name);
let _ = remove_file(path).unwrap();
}
}
//write combined metadata as json
let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap();
let combined_npy_meta_file_path =
format!("{}{}.{}", bwfileheader, "npy_meta", "json");
let mut file = File::create(combined_npy_meta_file_path).unwrap();
file.write_all(json_string.as_bytes()).unwrap();
}
_ => {}
}
bar.finish();
Expand Down
Loading

0 comments on commit ce0967a

Please sign in to comment.