From 746aaaeafa4ae746f97284ff43e3abe9af835b1d Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Mon, 6 Nov 2023 03:05:21 -0800
Subject: [PATCH] remove tiktoken pin (#1759)

---
 requirements.txt        |  2 +-
 tests/test_tokenizer.py | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3c11ac32c..a03dae853 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,4 @@ numpy
 torch
 tqdm
 more-itertools
-tiktoken==0.3.3
+tiktoken
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 09d0351e1..be424e5fe 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,7 +1,17 @@
+import pytest
+
 from whisper.tokenizer import get_tokenizer
 
 
-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=False)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)
 
@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
 
-    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
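
Note on the final hunk: "\ufffd" is the Python escape for U+FFFD, the Unicode replacement character, so the expected output of split_tokens_on_unicode is unchanged; the escape only spells out a character that is easy to mangle when copying or re-encoding the file. A minimal standalone check of that equivalence (plain Python, no whisper dependency; the variable names are illustrative only):

    # "\ufffd" and the literal "�" are the same one-character string:
    # U+FFFD, the replacement character produced for undecodable bytes.
    escaped = "\ufffd"
    literal = "�"
    assert escaped == literal
    assert len(escaped) == 1 and ord(escaped) == 0xFFFD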