From b912538b4fe3702d8a0c8596c9e4078f7ec788ba Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Fri, 14 Jun 2024 15:27:48 -0400
Subject: [PATCH] update tokenizer config

---
 gtars/src/tokenizers/config.rs         | 21 ++++++++++++++--
 gtars/src/tokenizers/meta_tokenizer.rs | 10 ++++++++
 gtars/src/tokenizers/mod.rs            | 34 ++++++++++++++++++++++++++
 gtars/src/tokenizers/tree_tokenizer.rs | 29 ++++++++++++++--------
 gtars/tests/data/peaks.meta.bed        | 25 +++++++++++++++++++
 gtars/tests/data/tokenizer.toml        |  5 ++--
 gtars/tests/data/tokenizer_bad.toml    |  3 +++
 7 files changed, 112 insertions(+), 15 deletions(-)
 create mode 100644 gtars/src/tokenizers/meta_tokenizer.rs
 create mode 100755 gtars/tests/data/peaks.meta.bed
 create mode 100644 gtars/tests/data/tokenizer_bad.toml

diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs
index 5f2b54a0..935605b8 100644
--- a/gtars/src/tokenizers/config.rs
+++ b/gtars/src/tokenizers/config.rs
@@ -1,8 +1,25 @@
+use std::fs::read_to_string;
+use std::path::Path;
+
+use anyhow::Result;
 use serde::{Deserialize, Serialize};
 
 #[derive(Deserialize, Serialize, Debug, PartialEq)]
 pub struct TokenizerConfig {
-    pub universe: String,
-    pub hierarchical_universes: Option<Vec<String>>,
+    pub universes: Vec<String>,
     pub exclude_ranges: Option<String>,
 }
+
+impl TokenizerConfig {
+    ///
+    /// Create a new tokenizer config.
+    ///
+    /// # Arguments
+    /// - path: Path to the config file (a .toml) file.
+    pub fn new(path: &Path) -> Result<TokenizerConfig> {
+        let toml_str = read_to_string(path)?;
+        let config: TokenizerConfig = toml::from_str(&toml_str)?;
+
+        Ok(config)
+    }
+}
diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs
new file mode 100644
index 00000000..e0f34291
--- /dev/null
+++ b/gtars/src/tokenizers/meta_tokenizer.rs
@@ -0,0 +1,10 @@
+use std::collections::HashMap;
+
+use rust_lapper::{Lapper, Interval};
+
+use crate::common::models::Universe;
+
+pub struct MetaTokenizer {
+    pub universe: Universe,
+    tree: HashMap<String, Lapper<u32, u32>>
+}
\ No newline at end of file
diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs
index 3cb67abd..331b25a2 100644
--- a/gtars/src/tokenizers/mod.rs
+++ b/gtars/src/tokenizers/mod.rs
@@ -8,6 +8,7 @@ pub mod soft_tokenizer;
 pub mod special_tokens;
 pub mod traits;
 pub mod tree_tokenizer;
+pub mod meta_tokenizer;
 
 /// constants for the tokenizer module.
 pub mod consts {
@@ -26,6 +27,7 @@ pub use tree_tokenizer::TreeTokenizer;
 mod tests {
     use crate::common::models::{Region, RegionSet};
+    use crate::tokenizers::traits::SpecialTokens;
     use std::path::Path;
 
     use super::*;
 
@@ -42,6 +44,11 @@
         "tests/data/tokenizer.toml"
     }
+    #[fixture]
+    fn path_to_bad_config_file() -> &'static str {
+        "tests/data/tokenizer_bad.toml"
+    }
+
     #[fixture]
     fn path_to_tokenize_bed_file() -> &'static str {
         "tests/data/to_tokenize.bed"
     }
@@ -59,6 +66,33 @@
         assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
     }
 
+    #[rstest]
+    #[should_panic]
+    fn test_bad_config_file(path_to_bad_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
+        let _tokenizer = tokenizer.unwrap();
+    }
+
+    #[rstest]
+    fn test_get_special_token_ids(path_to_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        let unk_id = tokenizer.unknown_token_id();
+        let pad_id = tokenizer.padding_token_id();
+        let mask_id = tokenizer.mask_token_id();
+        let eos_id = tokenizer.eos_token_id();
+        let bos_id = tokenizer.bos_token_id();
+        let cls_id = tokenizer.cls_token_id();
+        let sep_id = tokenizer.sep_token_id();
+
+        assert_eq!(unk_id, 25);
+        assert_eq!(pad_id, 26);
+        assert_eq!(mask_id, 27);
+        assert_eq!(eos_id, 28);
+        assert_eq!(bos_id, 29);
+        assert_eq!(cls_id, 30);
+        assert_eq!(sep_id, 31);
+    }
+
     #[rstest]
     fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
         let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs
index 25a0d601..4f2fc8f3 100644
--- a/gtars/src/tokenizers/tree_tokenizer.rs
+++ b/gtars/src/tokenizers/tree_tokenizer.rs
@@ -1,8 +1,7 @@
 use std::collections::HashMap;
-use std::fs::read_to_string;
 use std::path::Path;
 
-use anyhow::Result;
+use anyhow::{Context, Result};
 use rust_lapper::{Interval, Lapper};
 
 use crate::common::consts::special_tokens::*;
@@ -15,7 +14,6 @@ pub struct TreeTokenizer {
     pub universe: Universe,
     tree: HashMap<String, Lapper<u32, u32>>,
     secondary_trees: Option<Vec<HashMap<String, Lapper<u32, u32>>>>,
-    exclude_ranges: Option<HashMap<String, Lapper<u32, u32>>>,
 }
 
 impl TryFrom<&Path> for TreeTokenizer {
@@ -34,14 +32,26 @@
         // and allows for the new way of creating tokenizers from toml files
         let file_extension = value.extension().unwrap().to_str().unwrap();
 
-        let (mut universe, tree, secondary_trees, exclude_ranges) = match file_extension {
+        let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension {
             // parse config file
             "toml" => {
-                let toml_str = read_to_string(value)?;
-                let config: TokenizerConfig = toml::from_str(&toml_str)?;
+                let config = TokenizerConfig::new(value)
+                    .with_context(|| {
+                        format!("Invalid tokenizer configuration found for file: {}", value.to_str().unwrap())
+                    })?;
+
+                if config.universes.is_empty() {
+                    anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
+                }
+
+                let primary_universe = &config.universes[0];
+                let other_universes = match config.universes.len() {
+                    1 => None,
+                    _ => Some(&config.universes[1..])
+                };
 
                 // universe path is relative to the config file
-                let universe_path = value.parent().unwrap().join(&config.universe);
+                let universe_path = value.parent().unwrap().join(primary_universe);
 
                 // create initial universe from the *required* universe field
                 let mut universe = Universe::try_from(Path::new(&universe_path))?;
@@ -49,7 +59,7 @@
                 let tree = create_interval_tree_from_universe(&universe);
 
                 // create secondary trees if they exist
-                let secondary_trees = match config.hierarchical_universes {
+                let secondary_trees = match other_universes {
                     Some(hierarchical_universes) => {
                         let mut secondary_trees = Vec::new();
                         for hierarchical_universe in hierarchical_universes {
@@ -57,7 +67,7 @@
                                 HashMap::new();
 
                             let hierarchical_universe_path =
-                                value.parent().unwrap().join(&hierarchical_universe);
+                                value.parent().unwrap().join(hierarchical_universe);
 
                             let hierarchical_universe_regions =
                                 extract_regions_from_bed_file(&hierarchical_universe_path)?;
@@ -175,7 +185,6 @@
             universe,
             tree,
             secondary_trees,
-            exclude_ranges,
         })
     }
 }
diff --git a/gtars/tests/data/peaks.meta.bed b/gtars/tests/data/peaks.meta.bed
new file mode 100755
index 00000000..7303b6cc
--- /dev/null
+++ b/gtars/tests/data/peaks.meta.bed
@@ -0,0 +1,25 @@
+chr17	7915738	7915777	0
+chr6	157381091	157381200	0
+chr2	168247745	168247800	0
+chr4	16270164	16270220	1
+chr6	7313181	7313245	1
+chr10	70576200	70576231	2
+chr1	151399431	151399527	2
+chr2	203871200	203871375	2
+chr2	203871387	203871588	2
+chr12	54220192	54220409	2
+chr9	3526071	3526165	3
+chr9	3526183	3526269	3
+chr7	1044556	1044591	3
+chr8	65841729	65841752	4
+chr8	65841823	65841921	4
+chr2	206713923	206713976	5
+chr19	48260083	48260280	5
+chr15	28095897	28095963	5
+chr17	78759156	78759193	5
+chr17	78759222	78759311	5
+chr12	121129062	121129088	6
+chr1	110202920	110203109	6
+chr13	74550022	74550411	6
+chr15	49155856	49155887	7
+chr15	49155935	49156182	8
\ No newline at end of file
diff --git a/gtars/tests/data/tokenizer.toml b/gtars/tests/data/tokenizer.toml
index eb969b65..648deaa3 100644
--- a/gtars/tests/data/tokenizer.toml
+++ b/gtars/tests/data/tokenizer.toml
@@ -1,3 +1,2 @@
-universe = "peaks.bed.gz"
-exclude_ranges = "excluderanges.bed.gz"
-hierarchical_universes = ["chroms.bed"]
\ No newline at end of file
+universes = ["peaks.bed.gz", "chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
\ No newline at end of file
diff --git a/gtars/tests/data/tokenizer_bad.toml b/gtars/tests/data/tokenizer_bad.toml
new file mode 100644
index 00000000..f59134a4
--- /dev/null
+++ b/gtars/tests/data/tokenizer_bad.toml
@@ -0,0 +1,3 @@
+universes = "peaks.bed.gz"
+hieracrhical_universes = ["chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
\ No newline at end of file
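
Note (reviewer sketch, not part of the patch): the config rework above replaces
the universe / hierarchical_universes pair with a single ordered universes list,
where entry 0 is the primary universe and any remaining entries become the
hierarchical (secondary) universes. Below is a minimal, self-contained sketch of
consuming the new format. The import path gtars::tokenizers::config::TokenizerConfig
and the test-data path are illustrative assumptions, not confirmed public API:

    use std::path::Path;

    use anyhow::Result;

    // Assumed import path; the patch defines TokenizerConfig in
    // gtars/src/tokenizers/config.rs.
    use gtars::tokenizers::config::TokenizerConfig;

    fn main() -> Result<()> {
        // tokenizer.toml now reads:
        //   universes = ["peaks.bed.gz", "chroms.bed"]
        //   exclude_ranges = "excluderanges.bed.gz"
        let config = TokenizerConfig::new(Path::new("gtars/tests/data/tokenizer.toml"))?;

        // Mirror the TreeTokenizer logic introduced in this patch: reject an
        // empty list, then split the first universe from the rest.
        if config.universes.is_empty() {
            anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
        }
        let primary = &config.universes[0];
        let hierarchical = &config.universes[1..];

        println!("primary universe: {primary}");
        println!("hierarchical universes: {hierarchical:?}");
        Ok(())
    }

Keeping one ordered list means the empty-check and the primary/secondary split
live in a single place, which is exactly what the TreeTokenizer hunk does.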
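
The new test_get_special_token_ids test also pins down the vocabulary layout:
region IDs come first, then the seven special tokens in the order unk, pad,
mask, eos, bos, cls, sep. A tiny sketch of that arithmetic follows; the
special_token_ids helper is hypothetical, not a gtars function:

    // Hypothetical helper illustrating the ID layout the tests assert:
    // special-token IDs start immediately after the universe regions.
    fn special_token_ids(universe_len: u32) -> [u32; 7] {
        // order: unk, pad, mask, eos, bos, cls, sep
        [0, 1, 2, 3, 4, 5, 6].map(|offset| universe_len + offset)
    }

    fn main() {
        // peaks.bed holds 25 regions, so unk lands on 25 and sep on 31,
        // matching the assert_eq! calls in test_get_special_token_ids.
        assert_eq!(special_token_ids(25), [25, 26, 27, 28, 29, 30, 31]);

        // And the vocab_size assertion: 25 main + 24 hierarchical + 7 special.
        assert_eq!(25 + 24 + 7, 56);
    }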