Commit

update tokenizer config

nleroy917 committed Jun 14, 2024
1 parent 7f0b654 commit b912538
Showing 7 changed files with 112 additions and 15 deletions.
21 changes: 19 additions & 2 deletions gtars/src/tokenizers/config.rs
@@ -1,8 +1,25 @@
+use std::fs::read_to_string;
+use std::path::Path;
+
+use anyhow::Result;
 use serde::{Deserialize, Serialize};
 
 #[derive(Deserialize, Serialize, Debug, PartialEq)]
 pub struct TokenizerConfig {
-    pub universe: String,
-    pub hierarchical_universes: Option<Vec<String>>,
+    pub universes: Vec<String>,
     pub exclude_ranges: Option<String>,
 }
+
+impl TokenizerConfig {
+    ///
+    /// Create a new tokenizer config.
+    ///
+    /// # Arguments
+    /// - path: Path to the config file (a .toml file).
+    pub fn new(path: &Path) -> Result<TokenizerConfig> {
+        let toml_str = read_to_string(path)?;
+        let config: TokenizerConfig = toml::from_str(&toml_str)?;
+
+        Ok(config)
+    }
+}
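A minimal sketch of using the new loader, assuming the crate exposes the type at gtars::tokenizers::config (the module path and the repo-relative test path are assumptions, not part of this commit):

use std::path::Path;

use anyhow::Result;
// NOTE: module path assumed from the file layout (gtars/src/tokenizers/config.rs).
use gtars::tokenizers::config::TokenizerConfig;

fn main() -> Result<()> {
    // Parse the TOML config; the first entry in `universes` is treated as
    // the primary universe, any further entries as hierarchical universes.
    let config = TokenizerConfig::new(Path::new("gtars/tests/data/tokenizer.toml"))?;

    println!("primary universe: {}", config.universes[0]);
    println!("exclude ranges: {:?}", config.exclude_ranges);

    Ok(())
}

Centralizing parsing in TokenizerConfig::new also lets callers like TreeTokenizer::try_from attach context to a single fallible call instead of two.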
10 changes: 10 additions & 0 deletions gtars/src/tokenizers/meta_tokenizer.rs
@@ -0,0 +1,10 @@
+use std::collections::HashMap;
+
+use rust_lapper::{Lapper, Interval};
+
+use crate::common::models::Universe;
+
+pub struct MetaTokenizer {
+    pub universe: Universe,
+    tree: HashMap<String, Lapper<u32,u32>>
+}
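The new MetaTokenizer keeps one interval tree per chromosome, the same layout TreeTokenizer uses. A self-contained sketch of that shape with rust_lapper (the regions and token ids are made up for illustration):

use std::collections::HashMap;

use rust_lapper::{Interval, Lapper};

fn main() {
    // One Lapper per chromosome, keyed by name, with a region's token id
    // stored in `val` -- the same shape as MetaTokenizer's `tree` field.
    let mut tree: HashMap<String, Lapper<u32, u32>> = HashMap::new();

    let chr1_regions = vec![
        Interval { start: 100, stop: 200, val: 0 },
        Interval { start: 500, stop: 750, val: 1 },
    ];
    tree.insert("chr1".to_string(), Lapper::new(chr1_regions));

    // Which universe regions overlap chr1:150-160?
    if let Some(lapper) = tree.get("chr1") {
        for iv in lapper.find(150, 160) {
            println!("overlaps region with token id {}", iv.val);
        }
    }
}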
34 changes: 34 additions & 0 deletions gtars/src/tokenizers/mod.rs
@@ -8,6 +8,7 @@ pub mod soft_tokenizer;
 pub mod special_tokens;
 pub mod traits;
 pub mod tree_tokenizer;
+pub mod meta_tokenizer;
 
 /// constants for the tokenizer module.
 pub mod consts {
@@ -26,6 +27,7 @@ pub use tree_tokenizer::TreeTokenizer;
 mod tests {
 
     use crate::common::models::{Region, RegionSet};
+    use crate::tokenizers::traits::SpecialTokens;
     use std::path::Path;
 
     use super::*;
@@ -42,6 +44,11 @@ mod tests {
         "tests/data/tokenizer.toml"
     }
 
+    #[fixture]
+    fn path_to_bad_config_file() -> &'static str {
+        "tests/data/tokenizer_bad.toml"
+    }
+
     #[fixture]
     fn path_to_tokenize_bed_file() -> &'static str {
         "tests/data/to_tokenize.bed"
@@ -59,6 +66,33 @@ mod tests {
         assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
     }
 
+    #[rstest]
+    #[should_panic]
+    fn test_bad_config_file(path_to_bad_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
+        let _tokenizer = tokenizer.unwrap();
+    }
+
+    #[rstest]
+    fn test_get_special_token_ids(path_to_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        let unk_id = tokenizer.unknown_token_id();
+        let pad_id = tokenizer.padding_token_id();
+        let mask_id = tokenizer.mask_token_id();
+        let eos_id = tokenizer.eos_token_id();
+        let bos_id = tokenizer.bos_token_id();
+        let cls_id = tokenizer.cls_token_id();
+        let sep_id = tokenizer.sep_token_id();
+
+        assert_eq!(unk_id, 25);
+        assert_eq!(pad_id, 26);
+        assert_eq!(mask_id, 27);
+        assert_eq!(eos_id, 28);
+        assert_eq!(bos_id, 29);
+        assert_eq!(cls_id, 30);
+        assert_eq!(sep_id, 31);
+    }
+
     #[rstest]
     fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
         let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
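The magic numbers in test_get_special_token_ids follow from the fixture: the BED universe holds 25 regions (ids 0..=24), so the 7 special tokens take ids 25..=31, and the hierarchical test's vocab_size is 25 + 24 + 7 = 56. A standalone sketch of that numbering scheme, inferred from the asserts rather than taken from the crate (the token names are illustrative):

fn special_token_ids(universe_len: u32) -> Vec<(&'static str, u32)> {
    // Special tokens are appended after the universe regions, so their ids
    // start right where the region ids stop.
    ["unk", "pad", "mask", "eos", "bos", "cls", "sep"]
        .into_iter()
        .zip(universe_len..)
        .collect()
}

fn main() {
    for (token, id) in special_token_ids(25) {
        println!("{token} -> {id}"); // unk -> 25, pad -> 26, ... sep -> 31
    }
}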
29 changes: 19 additions & 10 deletions gtars/src/tokenizers/tree_tokenizer.rs
@@ -1,8 +1,7 @@
 use std::collections::HashMap;
-use std::fs::read_to_string;
 use std::path::Path;
 
-use anyhow::Result;
+use anyhow::{Context, Result};
 use rust_lapper::{Interval, Lapper};
 
 use crate::common::consts::special_tokens::*;
@@ -15,7 +14,6 @@ pub struct TreeTokenizer {
     pub universe: Universe,
     tree: HashMap<String, Lapper<u32, u32>>,
     secondary_trees: Option<Vec<HashMap<String, Lapper<u32, u32>>>>,
-    exclude_ranges: Option<HashMap<String, Lapper<u32, u32>>>,
 }
 
 impl TryFrom<&Path> for TreeTokenizer {
@@ -34,30 +32,42 @@ impl TryFrom<&Path> for TreeTokenizer {
         // and allows for the new way of creating tokenizers from toml files
         let file_extension = value.extension().unwrap().to_str().unwrap();
 
-        let (mut universe, tree, secondary_trees, exclude_ranges) = match file_extension {
+        let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension {
             // parse config file
             "toml" => {
-                let toml_str = read_to_string(value)?;
-                let config: TokenizerConfig = toml::from_str(&toml_str)?;
+                let config = TokenizerConfig::new(value)
+                    .with_context(|| {
+                        format!("Invalid tokenizer configuration found for file: {}", value.to_str().unwrap())
+                    })?;
 
+                if config.universes.is_empty() {
+                    anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
+                }
+
+                let primary_universe = &config.universes[0];
+                let other_universes = match config.universes.len() {
+                    1 => None,
+                    _ => Some(&config.universes[1..])
+                };
+
                 // universe path is relative to the config file
-                let universe_path = value.parent().unwrap().join(&config.universe);
+                let universe_path = value.parent().unwrap().join(primary_universe);
 
                 // create initial universe from the *required* universe field
                 let mut universe = Universe::try_from(Path::new(&universe_path))?;
 
                 let tree = create_interval_tree_from_universe(&universe);
 
                 // create secondary trees if they exist
-                let secondary_trees = match config.hierarchical_universes {
+                let secondary_trees = match other_universes {
                     Some(hierarchical_universes) => {
                         let mut secondary_trees = Vec::new();
                         for hierarchical_universe in hierarchical_universes {
                             let mut hierarchical_tree: HashMap<String, Lapper<u32, u32>> =
                                 HashMap::new();
 
                             let hierarchical_universe_path =
-                                value.parent().unwrap().join(&hierarchical_universe);
+                                value.parent().unwrap().join(hierarchical_universe);
 
                             let hierarchical_universe_regions =
                                 extract_regions_from_bed_file(&hierarchical_universe_path)?;
@@ -175,7 +185,6 @@ impl TryFrom<&Path> for TreeTokenizer {
             universe,
             tree,
             secondary_trees,
-            exclude_ranges,
         })
     }
 }
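The primary/secondary split above replaces the old universe + hierarchical_universes pair with one ordered list. A condensed, standalone restatement of that rule (not code from the repo):

use anyhow::Result;

// First entry is the primary universe; the remainder, if any, become the
// hierarchical/secondary universes.
fn split_universes(universes: &[String]) -> Result<(&String, Option<&[String]>)> {
    if universes.is_empty() {
        anyhow::bail!("You must have at least one universe in your universe list. Found zero.")
    }
    let others = if universes.len() > 1 {
        Some(&universes[1..])
    } else {
        None
    };
    Ok((&universes[0], others))
}

fn main() -> Result<()> {
    // Mirrors tests/data/tokenizer.toml: primary first, hierarchical after.
    let universes = vec!["peaks.bed.gz".to_string(), "chroms.bed".to_string()];
    let (primary, others) = split_universes(&universes)?;
    println!("primary: {primary}, hierarchical: {others:?}");
    Ok(())
}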
25 changes: 25 additions & 0 deletions gtars/tests/data/peaks.meta.bed
@@ -0,0 +1,25 @@
+chr17 7915738 7915777 0
+chr6 157381091 157381200 0
+chr2 168247745 168247800 0
+chr4 16270164 16270220 1
+chr6 7313181 7313245 1
+chr10 70576200 70576231 2
+chr1 151399431 151399527 2
+chr2 203871200 203871375 2
+chr2 203871387 203871588 2
+chr12 54220192 54220409 2
+chr9 3526071 3526165 3
+chr9 3526183 3526269 3
+chr7 1044556 1044591 3
+chr8 65841729 65841752 4
+chr8 65841823 65841921 4
+chr2 206713923 206713976 5
+chr19 48260083 48260280 5
+chr15 28095897 28095963 5
+chr17 78759156 78759193 5
+chr17 78759222 78759311 5
+chr12 121129062 121129088 6
+chr1 110202920 110203109 6
+chr13 74550022 74550411 6
+chr15 49155856 49155887 7
+chr15 49155935 49156182 8
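The fourth column of this new fixture looks like a meta-token id that collapses several regions into a single token; that reading is an assumption based on the file name and the new MetaTokenizer, not something the commit states. A sketch of grouping rows by that column:

use std::collections::HashMap;

fn main() {
    // A few rows from peaks.meta.bed, tab-separated as in a real BED file.
    let bed = "chr17\t7915738\t7915777\t0\n\
               chr6\t157381091\t157381200\t0\n\
               chr4\t16270164\t16270220\t1";

    // meta id -> regions that would collapse into that one token
    let mut clusters: HashMap<u32, Vec<(String, u32, u32)>> = HashMap::new();
    for line in bed.lines() {
        let f: Vec<&str> = line.split('\t').collect();
        clusters
            .entry(f[3].parse().unwrap())
            .or_default()
            .push((f[0].to_string(), f[1].parse().unwrap(), f[2].parse().unwrap()));
    }

    println!("{:?}", clusters.get(&0)); // two regions share meta id 0
}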
5 changes: 2 additions & 3 deletions gtars/tests/data/tokenizer.toml
@@ -1,3 +1,2 @@
-universe = "peaks.bed.gz"
-exclude_ranges = "excluderanges.bed.gz"
-hierarchical_universes = ["chroms.bed"]
+universes = ["peaks.bed.gz", "chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
3 changes: 3 additions & 0 deletions gtars/tests/data/tokenizer_bad.toml
@@ -0,0 +1,3 @@
+universes = "peaks.bed.gz"
+hieracrhical_universes = ["chroms.bed"]
+exclude_ranges = "excluderanges.bed.gz"
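(The type mismatch on universes, a string where TokenizerConfig expects an array, is what makes deserialization fail in the new should_panic test; the misspelled hieracrhical_universes key would, on its own, most likely just be ignored by serde unless unknown fields are denied.)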
