Skip to content

Commit

Permalink
255k vocab size
Browse files Browse the repository at this point in the history
  • Loading branch information
felipe-cohere committed Nov 22, 2024
1 parent 5567c03 commit 75d4fa1
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions tokenizers/examples/train_bpe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ use std::fs;

use std::path::Path;

// cargo run --package tokenizers --example train_bpe

fn main() -> Result<()> {
let vocab_size: usize = 100;
let vocab_size: usize = 255000;

let min_frequency = 5;
let add_prefix_space = false;
Expand Down Expand Up @@ -40,7 +42,8 @@ fn main() -> Result<()> {



let input_files_dir = "/home/felipe_cohere_com/roman_mixture_50gb_pretok_counts_400_workers_tsv_unigram_preprocessing/pretokenized_counts";
let mut input_files_dir = "/home/felipe_cohere_com/roman_mixture_50gb_pretok_counts_400_workers_tsv_unigram_preprocessing/pretokenized_counts";
// input_files_dir = "/home/felipe_cohere_com/test_csvs/";
let paths = fs::read_dir(input_files_dir).unwrap();

let file_paths: Vec<String> = paths
Expand Down

0 comments on commit 75d4fa1

Please sign in to comment.