From 75d4fa175228316682fff94f98f399ca02e3fb20 Mon Sep 17 00:00:00 2001
From: felipe-cohere
Date: Fri, 22 Nov 2024 17:47:24 +0000
Subject: [PATCH] 255k vocab size

---
 tokenizers/examples/train_bpe.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tokenizers/examples/train_bpe.rs b/tokenizers/examples/train_bpe.rs
index 578839c3..e6946a2f 100644
--- a/tokenizers/examples/train_bpe.rs
+++ b/tokenizers/examples/train_bpe.rs
@@ -9,8 +9,10 @@
 use std::fs;
 use std::path::Path;
 
+// cargo run --package tokenizers --example train_bpe
+
 fn main() -> Result<()> {
-    let vocab_size: usize = 100;
+    let vocab_size: usize = 255000;
     let min_frequency = 5;
     let add_prefix_space = false;
 
@@ -40,7 +42,8 @@
-    let input_files_dir = "/home/felipe_cohere_com/roman_mixture_50gb_pretok_counts_400_workers_tsv_unigram_preprocessing/pretokenized_counts";
+    let mut input_files_dir = "/home/felipe_cohere_com/roman_mixture_50gb_pretok_counts_400_workers_tsv_unigram_preprocessing/pretokenized_counts";
+    // input_files_dir = "/home/felipe_cohere_com/test_csvs/";
     let paths = fs::read_dir(input_files_dir).unwrap();
     let file_paths: Vec<String> = paths
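For context, the two hunks above only touch the vocabulary size and the input directory; the rest of the example is not part of the patch. Below is a minimal sketch of what the full patched train_bpe.rs plausibly looks like, assuming it follows the structure of the upstream tokenizers train_bpe.rs example (BpeTrainerBuilder plus TokenizerBuilder with byte-level pre-tokenization). The special tokens, the exact wiring of add_prefix_space, and the tokenizer.json output path are assumptions for illustration, not taken from the patch.

    use std::fs;

    use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
    use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};
    use tokenizers::pre_tokenizers::byte_level::ByteLevel;
    use tokenizers::{AddedToken, Result, TokenizerBuilder};

    // cargo run --package tokenizers --example train_bpe

    fn main() -> Result<()> {
        let vocab_size: usize = 255000;
        let min_frequency = 5;

        // Trainer configuration mirroring the values set in the patch.
        let mut trainer = BpeTrainerBuilder::new()
            .show_progress(true)
            .vocab_size(vocab_size)
            .min_frequency(min_frequency)
            // Assumed special token; the patch does not show which ones the fork uses.
            .special_tokens(vec![AddedToken::from(String::from("<unk>"), true)])
            .build();

        // Pipeline as in the upstream example: strip + NFC normalization and
        // byte-level pre-tokenization, post-processing, and decoding. The patch's
        // `add_prefix_space = false` flag would presumably be wired into ByteLevel
        // here; this sketch keeps the default construction.
        let mut tokenizer = TokenizerBuilder::new()
            .with_model(BPE::default())
            .with_normalizer(Some(Sequence::new(vec![
                Strip::new(true, true).into(),
                NFC.into(),
            ])))
            .with_pre_tokenizer(Some(ByteLevel::default()))
            .with_post_processor(Some(ByteLevel::default()))
            .with_decoder(Some(ByteLevel::default()))
            .build()?;

        // Collect every file in the hard-coded input directory, as the patch does.
        let input_files_dir = "/home/felipe_cohere_com/roman_mixture_50gb_pretok_counts_400_workers_tsv_unigram_preprocessing/pretokenized_counts";
        let file_paths: Vec<String> = fs::read_dir(input_files_dir)
            .unwrap()
            .map(|p| p.unwrap().path().to_string_lossy().into_owned())
            .collect();

        // Train on the collected files and save the result; the output path is assumed.
        tokenizer
            .train_from_files(&mut trainer, file_paths)?
            .save("tokenizer.json", false)?;

        Ok(())
    }

The commented-out test_csvs path in the second hunk suggests the new mut binding exists so the training directory can be swapped for a small test set during development.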