From fac1e41eca5dc6220b33090b9c2057494389f924 Mon Sep 17 00:00:00 2001
From: karinazad
Date: Tue, 28 Jan 2025 08:59:44 -0500
Subject: [PATCH] special tokens

---
 .../smiles_tokenizer/special_tokens_map.json  |  2 ++
 .../assets/smiles_tokenizer/tokenizer.json    | 22 +++++++++++++++++--
 .../smiles_tokenizer/tokenizer_config.json    | 20 ++++++++++++++++-
 src/lobster/assets/smiles_tokenizer/vocab.txt |  2 +-
 .../tokenization/test__smiles_tokenizer.py    | 16 ++++++++++++++
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/src/lobster/assets/smiles_tokenizer/special_tokens_map.json b/src/lobster/assets/smiles_tokenizer/special_tokens_map.json
index ba61142..c883d1b 100644
--- a/src/lobster/assets/smiles_tokenizer/special_tokens_map.json
+++ b/src/lobster/assets/smiles_tokenizer/special_tokens_map.json
@@ -1,5 +1,7 @@
 {
+  "cls_token": "<cls>",
   "eos_token": "<eos>",
+  "mask_token": "<mask>",
   "pad_token": "<pad>",
   "sep_token": "<sep>",
   "unk_token": "<unk>"
diff --git a/src/lobster/assets/smiles_tokenizer/tokenizer.json b/src/lobster/assets/smiles_tokenizer/tokenizer.json
index b9dd29b..0f33d84 100644
--- a/src/lobster/assets/smiles_tokenizer/tokenizer.json
+++ b/src/lobster/assets/smiles_tokenizer/tokenizer.json
@@ -21,6 +21,15 @@
       "normalized": false,
       "special": true
     },
+    {
+      "id": 2,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
     {
       "id": 3,
       "content": "<sep>",
@@ -31,7 +40,16 @@
       "special": true
     },
     {
-      "id": 582,
+      "id": 4,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 5,
       "content": "<eos>",
       "single_word": false,
       "lstrip": false,
@@ -176,7 +194,7 @@
       "<cls>": 2,
       "<sep>": 3,
       "<mask>": 4,
-      "": 5,
+      "<eos>": 5,
       "c": 6,
       "C": 7,
       "(": 8,
diff --git a/src/lobster/assets/smiles_tokenizer/tokenizer_config.json b/src/lobster/assets/smiles_tokenizer/tokenizer_config.json
index 16a5289..a88fd05 100644
--- a/src/lobster/assets/smiles_tokenizer/tokenizer_config.json
+++ b/src/lobster/assets/smiles_tokenizer/tokenizer_config.json
@@ -16,6 +16,14 @@
       "single_word": false,
       "special": true
     },
+    "2": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
     "3": {
       "content": "<sep>",
       "lstrip": false,
@@ -24,7 +32,15 @@
       "single_word": false,
       "special": true
     },
-    "582": {
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
       "content": "<eos>",
       "lstrip": false,
       "normalized": false,
@@ -35,8 +51,10 @@
   },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
+  "cls_token": "<cls>",
   "eos_token": "<eos>",
   "extra_special_tokens": {},
+  "mask_token": "<mask>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "padding_side": "right",
diff --git a/src/lobster/assets/smiles_tokenizer/vocab.txt b/src/lobster/assets/smiles_tokenizer/vocab.txt
index 539c2e1..25cc51c 100644
--- a/src/lobster/assets/smiles_tokenizer/vocab.txt
+++ b/src/lobster/assets/smiles_tokenizer/vocab.txt
@@ -3,7 +3,7 @@
 <cls>
 <sep>
 <mask>
-
+<eos>
 c
 C
 (
diff --git a/tests/lobster/tokenization/test__smiles_tokenizer.py b/tests/lobster/tokenization/test__smiles_tokenizer.py
index 27e3e27..45e63d5 100644
--- a/tests/lobster/tokenization/test__smiles_tokenizer.py
+++ b/tests/lobster/tokenization/test__smiles_tokenizer.py
@@ -31,6 +31,14 @@ def test__make_smiles_tokenizer(mock_load_vocab_file):
     ids = tokenizer.encode("CCO")
     assert ids == [2, 7, 7, 10, 5]
     assert tokenizer.decode(ids) == "<cls> C C O <eos>"
+    assert tokenizer.special_tokens_map == {
+        "eos_token": "<eos>",
+        "unk_token": "<unk>",
+        "sep_token": "<sep>",
+        "pad_token": "<pad>",
+        "cls_token": "<cls>",
+        "mask_token": "<mask>",
+    }


 class TestSmilesTokenizerFast:
@@ -47,3 +55,11 @@ def test_smiles_tokenizer_fast(self, tokenizer):
         ids = tokenizer.encode("CCO")
         assert ids == [2, 7, 7, 10, 5]
         assert tokenizer.decode(ids) == "<cls> C C O <eos>"
+        assert tokenizer.special_tokens_map == {
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "sep_token": "<sep>",
+            "pad_token": "<pad>",
+            "cls_token": "<cls>",
+            "mask_token": "<mask>",
+        }
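
Usage sketch (editor's note, not part of the patch): assuming the patched assets above are loaded with Hugging Face's PreTrainedTokenizerFast, the newly registered <cls> and <mask> entries surface as ordinary special tokens. The local path below is illustrative, and the expected outputs simply mirror the test assertions in this patch.

    from transformers import PreTrainedTokenizerFast

    # Load the tokenizer from the patched asset directory (example path).
    tokenizer = PreTrainedTokenizerFast.from_pretrained("src/lobster/assets/smiles_tokenizer")

    # The newly registered special tokens are exposed on the tokenizer object.
    print(tokenizer.cls_token, tokenizer.mask_token)   # <cls> <mask>

    # Encoding a SMILES string wraps it in <cls> ... <eos>, matching the tests above.
    ids = tokenizer.encode("CCO")
    print(ids)                    # [2, 7, 7, 10, 5]
    print(tokenizer.decode(ids))  # <cls> C C O <eos>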