Commit

update tokenizer config

nleroy917 committed Jun 14, 2024
1 parent 7f0b654 commit b912538
Showing 7 changed files with 112 additions and 15 deletions.
21 changes: 19 additions & 2 deletions gtars/src/tokenizers/config.rs
@@ -1,8 +1,25 @@
+use std::fs::read_to_string;
+use std::path::Path;
+
+use anyhow::Result;
 use serde::{Deserialize, Serialize};
 
 #[derive(Deserialize, Serialize, Debug, PartialEq)]
 pub struct TokenizerConfig {
-    pub universe: String,
-    pub hierarchical_universes: Option<Vec<String>>,
+    pub universes: Vec<String>,
     pub exclude_ranges: Option<String>,
 }
+
+impl TokenizerConfig {
+    ///
+    /// Create a new tokenizer config.
+    ///
+    /// # Arguments
+    /// - path: Path to the config file (a .toml file).
+    pub fn new(path: &Path) -> Result<TokenizerConfig> {
+        let toml_str = read_to_string(path)?;
+        let config: TokenizerConfig = toml::from_str(&toml_str)?;
+
+        Ok(config)
+    }
+}
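A minimal sketch of using the new loader, assuming the crate exposes the type at gtars::tokenizers::config (the module path and the repo-relative test path are assumptions, not part of this commit):

use std::path::Path;

use anyhow::Result;
// NOTE: module path assumed from the file layout (gtars/src/tokenizers/config.rs).
use gtars::tokenizers::config::TokenizerConfig;

fn main() -> Result<()> {
    // Parse the TOML config; the first entry in `universes` is treated as
    // the primary universe, any further entries as hierarchical universes.
    let config = TokenizerConfig::new(Path::new("gtars/tests/data/tokenizer.toml"))?;

    println!("primary universe: {}", config.universes[0]);
    println!("exclude ranges: {:?}", config.exclude_ranges);

    Ok(())
}

Centralizing parsing in TokenizerConfig::new also lets callers like TreeTokenizer::try_from attach context to a single fallible call instead of two.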
10 changes: 10 additions & 0 deletions gtars/src/tokenizers/meta_tokenizer.rs
@@ -0,0 +1,10 @@
+use std::collections::HashMap;
+
+use rust_lapper::{Lapper, Interval};
+
+use crate::common::models::Universe;
+
+pub struct MetaTokenizer {
+    pub universe: Universe,
+    tree: HashMap<String, Lapper<u32,u32>>
+}
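The new MetaTokenizer keeps one interval tree per chromosome, the same layout TreeTokenizer uses. A self-contained sketch of that shape with rust_lapper (the regions and token ids are made up for illustration):

use std::collections::HashMap;

use rust_lapper::{Interval, Lapper};

fn main() {
    // One Lapper per chromosome, keyed by name, with a region's token id
    // stored in `val` -- the same shape as MetaTokenizer's `tree` field.
    let mut tree: HashMap<String, Lapper<u32, u32>> = HashMap::new();

    let chr1_regions = vec![
        Interval { start: 100, stop: 200, val: 0 },
        Interval { start: 500, stop: 750, val: 1 },
    ];
    tree.insert("chr1".to_string(), Lapper::new(chr1_regions));

    // Which universe regions overlap chr1:150-160?
    if let Some(lapper) = tree.get("chr1") {
        for iv in lapper.find(150, 160) {
            println!("overlaps region with token id {}", iv.val);
        }
    }
}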
34 changes: 34 additions & 0 deletions gtars/src/tokenizers/mod.rs
@@ -8,6 +8,7 @@ pub mod soft_tokenizer;
 pub mod special_tokens;
 pub mod traits;
 pub mod tree_tokenizer;
+pub mod meta_tokenizer;
 
 /// constants for the tokenizer module.
 pub mod consts {
@@ -26,6 +27,7 @@ pub use tree_tokenizer::TreeTokenizer;
 mod tests {
 
     use crate::common::models::{Region, RegionSet};
+    use crate::tokenizers::traits::SpecialTokens;
     use std::path::Path;
 
     use super::*;
@@ -42,6 +44,11 @@ mod tests {
         "tests/data/tokenizer.toml"
     }
 
+    #[fixture]
+    fn path_to_bad_config_file() -> &'static str {
+        "tests/data/tokenizer_bad.toml"
+    }
+
     #[fixture]
     fn path_to_tokenize_bed_file() -> &'static str {
         "tests/data/to_tokenize.bed"
@@ -59,6 +66,33 @@ mod tests {
         assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
     }
 
+    #[rstest]
+    #[should_panic]
+    fn test_bad_config_file(path_to_bad_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
+        let _tokenizer = tokenizer.unwrap();
+    }
+
+    #[rstest]
+    fn test_get_special_token_ids(path_to_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        let unk_id = tokenizer.unknown_token_id();
+        let pad_id = tokenizer.padding_token_id();
+        let mask_id = tokenizer.mask_token_id();
+        let eos_id = tokenizer.eos_token_id();
+        let bos_id = tokenizer.bos_token_id();
+        let cls_id = tokenizer.cls_token_id();
+        let sep_id = tokenizer.sep_token_id();
+
+        assert_eq!(unk_id, 25);
+        assert_eq!(pad_id, 26);
+        assert_eq!(mask_id, 27);
+        assert_eq!(eos_id, 28);
+        assert_eq!(bos_id, 29);
+        assert_eq!(cls_id, 30);
+        assert_eq!(sep_id, 31);
+    }
+
     #[rstest]
     fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
         let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
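The magic numbers in test_get_special_token_ids follow from the fixture: the BED universe holds 25 regions (ids 0..=24), so the 7 special tokens take ids 25..=31, and the hierarchical test's vocab_size is 25 + 24 + 7 = 56. A standalone sketch of that numbering scheme, inferred from the asserts rather than taken from the crate (the token names are illustrative):

fn special_token_ids(universe_len: u32) -> Vec<(&'static str, u32)> {
    // Special tokens are appended after the universe regions, so their ids
    // start right where the region ids stop.
    ["unk", "pad", "mask", "eos", "bos", "cls", "sep"]
        .into_iter()
        .zip(universe_len..)
        .collect()
}

fn main() {
    for (token, id) in special_token_ids(25) {
        println!("{token} -> {id}"); // unk -> 25, pad -> 26, ... sep -> 31
    }
}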
29 changes: 19 additions & 10 deletions gtars/src/tokenizers/tree_tokenizer.rs
@@ -1,8 +1,7 @@
 use std::collections::HashMap;
-use std::fs::read_to_string;
 use std::path::Path;
 
-use anyhow::Result;
+use anyhow::{Context, Result};
 use rust_lapper::{Interval, Lapper};
 
 use crate::common::consts::special_tokens::*;
@@ -15,7 +14,6 @@ pub struct TreeTokenizer {
     pub universe: Universe,
     tree: HashMap<String, Lapper<u32, u32>>,
     secondary_trees: Option<Vec<HashMap<String, Lapper<u32, u32>>>>,
-    exclude_ranges: Option<HashMap<String, Lapper<u32, u32>>>,
 }
 
 impl TryFrom<&Path> for TreeTokenizer {
@@ -34,30 +32,42 @@ impl TryFrom<&Path> for TreeTokenizer {
         // and allows for the new way of creating tokenizers from toml files
         let file_extension = value.extension().unwrap().to_str().unwrap();
 
-        let (mut universe, tree, secondary_trees, exclude_ranges) = match file_extension {
+        let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension {
             // parse config file
             "toml" => {
-                let toml_str = read_to_string(value)?;
-                let config: TokenizerConfig = toml::from_str(&toml_str)?;
+                let config = TokenizerConfig::new(value)
+                    .with_context(|| {
+                        format!("Invalid tokenizer configuration found for file: {}", value.to_str().unwrap())
+                    })?;
 
+                if config.universes.is_empty() {
+                    anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
+                }
+
+                let primary_universe = &config.universes[0];
+                let other_universes = match config.universes.len() {
+                    1 => None,
+                    _ => Some(&config.universes[1..])
+                };
+
                 // universe path is relative to the config file
-                let universe_path = value.parent().unwrap().join(&config.universe);
+                let universe_path = value.parent().unwrap().join(primary_universe);
 
                 // create initial universe from the *required* universe field
                 let mut universe = Universe::try_from(Path::new(&universe_path))?;
 
                 let tree = create_interval_tree_from_universe(&universe);
 
                 // create secondary trees if they exist
-                let secondary_trees = match config.hierarchical_universes {
+                let secondary_trees = match other_universes {
                     Some(hierarchical_universes) => {
                         let mut secondary_trees = Vec::new();
                         for hierarchical_universe in hierarchical_universes {
                             let mut hierarchical_tree: HashMap<String, Lapper<u32, u32>> =
                                 HashMap::new();
 
                             let hierarchical_universe_path =
-                                value.parent().unwrap().join(&hierarchical_universe);
+                                value.parent().unwrap().join(hierarchical_universe);
 
                             let hierarchical_universe_regions =
                                 extract_regions_from_bed_file(&hierarchical_universe_path)?;
@@ -175,7 +185,6 @@ impl TryFrom<&Path> for TreeTokenizer {
             universe,
             tree,
             secondary_trees,
-            exclude_ranges,
         })
     }
 }
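The primary/secondary split above replaces the old universe + hierarchical_universes pair with one ordered list. A condensed, standalone restatement of that rule (not code from the repo):

use anyhow::Result;

// First entry is the primary universe; the remainder, if any, become the
// hierarchical/secondary universes.
fn split_universes(universes: &[String]) -> Result<(&String, Option<&[String]>)> {
    if universes.is_empty() {
        anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
    }
    let others = if universes.len() > 1 {
        Some(&universes[1..])
    } else {
        None
    };
    Ok((&universes[0], others))
}

fn main() -> Result<()> {
    // Mirrors tests/data/tokenizer.toml: primary first, hierarchical after.
    let universes = vec!["peaks.bed.gz".to_string(), "chroms.bed".to_string()];
    let (primary, others) = split_universes(&universes)?;
    println!("primary: {primary}, hierarchical: {others:?}");
    Ok(())
}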
25 changes: 25 additions & 0 deletions gtars/tests/data/peaks.meta.bed
@@ -0,0 +1,25 @@
+chr17 7915738 7915777 0
+chr6 157381091 157381200 0
+chr2 168247745 168247800 0
+chr4 16270164 16270220 1
+chr6 7313181 7313245 1
+chr10 70576200 70576231 2
+chr1 151399431 151399527 2
+chr2 203871200 203871375 2
+chr2 203871387 203871588 2
+chr12 54220192 54220409 2
+chr9 3526071 3526165 3
+chr9 3526183 3526269 3
+chr7 1044556 1044591 3
+chr8 65841729 65841752 4
+chr8 65841823 65841921 4
+chr2 206713923 206713976 5
+chr19 48260083 48260280 5
+chr15 28095897 28095963 5
+chr17 78759156 78759193 5
+chr17 78759222 78759311 5
+chr12 121129062 121129088 6
+chr1 110202920 110203109 6
+chr13 74550022 74550411 6
+chr15 49155856 49155887 7
+chr15 49155935 49156182 8
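The fourth column of this new fixture looks like a meta-token id that collapses several regions into a single token; that reading is an assumption based on the file name and the new MetaTokenizer, not something the commit states. A sketch of grouping rows by that column:

use std::collections::HashMap;

fn main() {
    // A few rows from peaks.meta.bed, tab-separated as in a real BED file.
    let bed = "chr17\t7915738\t7915777\t0\n\
               chr6\t157381091\t157381200\t0\n\
               chr4\t16270164\t16270220\t1";

    // meta id -> regions that would collapse into that one token
    let mut clusters: HashMap<u32, Vec<(String, u32, u32)>> = HashMap::new();
    for line in bed.lines() {
        let f: Vec<&str> = line.split('\t').collect();
        clusters
            .entry(f[3].parse().unwrap())
            .or_default()
            .push((f[0].to_string(), f[1].parse().unwrap(), f[2].parse().unwrap()));
    }

    println!("{:?}", clusters.get(&0)); // two regions share meta id 0
}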
5 changes: 2 additions & 3 deletions gtars/tests/data/tokenizer.toml
@@ -1,3 +1,2 @@
-universe = "peaks.bed.gz"
-exclude_ranges = "excluderanges.bed.gz"
-hierarchical_universes = ["chroms.bed"]
+universes = ["peaks.bed.gz", "chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
3 changes: 3 additions & 0 deletions gtars/tests/data/tokenizer_bad.toml
@@ -0,0 +1,3 @@
+universes = "peaks.bed.gz"
+hieracrhical_universes = ["chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
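(The type mismatch on universes, a string where TokenizerConfig expects an array, is what makes deserialization fail in the new should_panic test; the misspelled hieracrhical_universes key would, on its own, most likely just be ignored by serde unless unknown fields are denied.)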
