special tokens
karinazad committed Jan 28, 2025
1 parent 7b34579 commit fac1e41
Showing 5 changed files with 58 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/lobster/assets/smiles_tokenizer/special_tokens_map.json
@@ -1,5 +1,7 @@
 {
+  "cls_token": "<cls>",
   "eos_token": "<eos>",
+  "mask_token": "<mask>",
   "pad_token": "<pad>",
   "sep_token": "<sep>",
   "unk_token": "<unk>"
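A quick way to sanity-check the updated map is to load the JSON directly and compare it against the six tokens the updated tests assert on. A minimal sketch; the asset path is taken from this diff, and the expected dict mirrors the new test assertions:

```python
import json

# Path as it appears in this diff; adjust if the assets live elsewhere.
path = "src/lobster/assets/smiles_tokenizer/special_tokens_map.json"

with open(path) as f:
    special_tokens = json.load(f)

# The six special tokens the updated tests expect.
assert special_tokens == {
    "cls_token": "<cls>",
    "eos_token": "<eos>",
    "mask_token": "<mask>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
    "unk_token": "<unk>",
}
```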
22 changes: 20 additions & 2 deletions src/lobster/assets/smiles_tokenizer/tokenizer.json
@@ -21,6 +21,15 @@
       "normalized": false,
       "special": true
     },
+    {
+      "id": 2,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
     {
       "id": 3,
       "content": "<sep>",
@@ -31,7 +40,16 @@
       "special": true
     },
     {
-      "id": 582,
+      "id": 4,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 5,
       "content": "<eos>",
       "single_word": false,
       "lstrip": false,
@@ -176,7 +194,7 @@
       "<cls>": 2,
       "<sep>": 3,
       "<mask>": 4,
-      "<os>": 5,
+      "<eos>": 5,
       "c": 6,
       "C": 7,
       "(": 8,
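With the stale `<os>`/`582` entries gone, the added-token table and the base vocabulary now agree on ids 2 through 5. A sketch using the `tokenizers` library to confirm the mapping; the file path is assumed from this diff:

```python
from tokenizers import Tokenizer

# Load the serialized tokenizer; path taken from this diff.
tok = Tokenizer.from_file("src/lobster/assets/smiles_tokenizer/tokenizer.json")

# Special-token ids now match the vocab order: <cls>=2 ... <eos>=5.
assert tok.token_to_id("<cls>") == 2
assert tok.token_to_id("<sep>") == 3
assert tok.token_to_id("<mask>") == 4
assert tok.token_to_id("<eos>") == 5
assert tok.token_to_id("<os>") is None  # the misspelled token is gone
```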
20 changes: 19 additions & 1 deletion src/lobster/assets/smiles_tokenizer/tokenizer_config.json
@@ -16,6 +16,14 @@
       "single_word": false,
       "special": true
     },
+    "2": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
     "3": {
       "content": "<sep>",
       "lstrip": false,
@@ -24,7 +32,15 @@
       "single_word": false,
       "special": true
     },
-    "582": {
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
       "content": "<eos>",
       "lstrip": false,
       "normalized": false,
@@ -35,8 +51,10 @@
   },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
+  "cls_token": "<cls>",
   "eos_token": "<eos>",
   "extra_special_tokens": {},
+  "mask_token": "<mask>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "padding_side": "right",
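Since the config now registers ids 2–5 and names the cls and mask tokens at the top level, loading the assets through transformers should expose the full special-token set. A sketch, assuming the asset directory path from this diff resolves for `AutoTokenizer`:

```python
from transformers import AutoTokenizer

# Directory path assumed from this diff.
tok = AutoTokenizer.from_pretrained("src/lobster/assets/smiles_tokenizer")

# The tokens added by this commit are now first-class special tokens.
assert tok.cls_token == "<cls>" and tok.mask_token == "<mask>"
assert tok.convert_tokens_to_ids("<eos>") == 5
```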
2 changes: 1 addition & 1 deletion src/lobster/assets/smiles_tokenizer/vocab.txt
@@ -3,7 +3,7 @@
 <cls>
 <sep>
 <mask>
-<os>
+<eos>
 c
 C
 (
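In a BERT/WordPiece-style `vocab.txt`, a token's id is its zero-based line index, which is why fixing line 6 from `<os>` to `<eos>` is what gives `<eos>` id 5 (and why the old file needed the `582` added-token workaround). A quick illustrative check, path assumed from the diff:

```python
# Token id == zero-based line index in vocab.txt.
with open("src/lobster/assets/smiles_tokenizer/vocab.txt") as f:
    vocab = [line.rstrip("\n") for line in f]

assert vocab[2:6] == ["<cls>", "<sep>", "<mask>", "<eos>"]
assert "<os>" not in vocab  # the misspelled token is gone
```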
16 changes: 16 additions & 0 deletions tests/lobster/tokenization/test__smiles_tokenizer.py
@@ -31,6 +31,14 @@ def test__make_smiles_tokenizer(mock_load_vocab_file):
     ids = tokenizer.encode("CCO")
     assert ids == [2, 7, 7, 10, 5]
     assert tokenizer.decode(ids) == "<cls> C C O <eos>"
+    assert tokenizer.special_tokens_map == {
+        "eos_token": "<eos>",
+        "unk_token": "<unk>",
+        "sep_token": "<sep>",
+        "pad_token": "<pad>",
+        "cls_token": "<cls>",
+        "mask_token": "<mask>",
+    }


 class TestSmilesTokenizerFast:
@@ -47,3 +55,11 @@ def test_smiles_tokenizer_fast(self, tokenizer):
         ids = tokenizer.encode("CCO")
         assert ids == [2, 7, 7, 10, 5]
         assert tokenizer.decode(ids) == "<cls> C C O <eos>"
+        assert tokenizer.special_tokens_map == {
+            "eos_token": "<eos>",
+            "unk_token": "<unk>",
+            "sep_token": "<sep>",
+            "pad_token": "<pad>",
+            "cls_token": "<cls>",
+            "mask_token": "<mask>",
+        }
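The expected ids line up with the vocab mapping earlier in this diff: `<cls>`=2 and `<eos>`=5 now wrap the encoded sequence, with C=7 and O=10 coming from the base vocabulary. To exercise the new assertions locally, something like `pytest tests/lobster/tokenization/test__smiles_tokenizer.py` should cover both the slow and fast tokenizer paths.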
