diff --git a/src/lobster/assets/smiles_tokenizer/special_tokens_map.json b/src/lobster/assets/smiles_tokenizer/special_tokens_map.json new file mode 100644 index 0000000..ba61142 --- /dev/null +++ b/src/lobster/assets/smiles_tokenizer/special_tokens_map.json @@ -0,0 +1,6 @@ +{ + "eos_token": "", + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/src/lobster/assets/smiles_tokenizer/tokenizer.json b/src/lobster/assets/smiles_tokenizer/tokenizer.json new file mode 100644 index 0000000..b9dd29b --- /dev/null +++ b/src/lobster/assets/smiles_tokenizer/tokenizer.json @@ -0,0 +1,759 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 582, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Split", + "pattern": { + "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])" + }, + "behavior": "Isolated", + "invert": false + }, + "post_processor": { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + 
"type_id": 1 + } + }, + { + "SpecialToken": { + "id": "", + "type_id": 1 + } + } + ], + "special_tokens": { + "": { + "id": "", + "ids": [ + 2 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 5 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 4 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 0 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 3 + ], + "tokens": [ + "" + ] + }, + "": { + "id": "", + "ids": [ + 1 + ], + "tokens": [ + "" + ] + } + } + }, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "c": 6, + "C": 7, + "(": 8, + ")": 9, + "O": 10, + "1": 11, + "2": 12, + "=": 13, + "N": 14, + ".": 15, + "n": 16, + "3": 17, + "F": 18, + "Cl": 19, + ">>": 20, + "~": 21, + "-": 22, + "4": 23, + "[C@H]": 24, + "S": 25, + "[C@@H]": 26, + "[O-]": 27, + "Br": 28, + "#": 29, + "/": 30, + "[nH]": 31, + "[N+]": 32, + "s": 33, + "5": 34, + "o": 35, + "P": 36, + "[Na+]": 37, + "[Si]": 38, + "I": 39, + "[Na]": 40, + "[Pd]": 41, + "[K+]": 42, + "[K]": 43, + "[P]": 44, + "B": 45, + "[C@]": 46, + "[C@@]": 47, + "[Cl-]": 48, + "6": 49, + "[OH-]": 50, + "\\": 51, + "[N-]": 52, + "[Li]": 53, + "[H]": 54, + "[2H]": 55, + "[NH4+]": 56, + "[c-]": 57, + "[P-]": 58, + "[Cs+]": 59, + "[Li+]": 60, + "[Cs]": 61, + "[NaH]": 62, + "[H-]": 63, + "[O+]": 64, + "[BH4-]": 65, + "[Cu]": 66, + "7": 67, + "[Mg]": 68, + "[Fe+2]": 69, + "[n+]": 70, + "[Sn]": 71, + "[BH-]": 72, + "[Pd+2]": 73, + "[CH]": 74, + "[I-]": 75, + "[Br-]": 76, + "[C-]": 77, + "[Zn]": 78, + "[B-]": 79, + "[F-]": 80, + "[Al]": 81, + "[P+]": 82, + "[BH3-]": 83, + "[Fe]": 84, + "[C]": 85, + "[AlH4]": 86, + "[Ni]": 87, + "[SiH]": 88, + "8": 89, + "[Cu+2]": 90, + "[Mn]": 91, + "[AlH]": 92, + "[nH+]": 93, + "[AlH4-]": 94, + 
"[O-2]": 95, + "[Cr]": 96, + "[Mg+2]": 97, + "[NH3+]": 98, + "[S@]": 99, + "[Pt]": 100, + "[Al+3]": 101, + "[S@@]": 102, + "[S-]": 103, + "[Ti]": 104, + "[Zn+2]": 105, + "[PH]": 106, + "[NH2+]": 107, + "[Ru]": 108, + "[Ag+]": 109, + "[S+]": 110, + "[I+3]": 111, + "[NH+]": 112, + "[Ca+2]": 113, + "[Ag]": 114, + "9": 115, + "[Os]": 116, + "[Se]": 117, + "[SiH2]": 118, + "[Ca]": 119, + "[Ti+4]": 120, + "[Ac]": 121, + "[Cu+]": 122, + "[S]": 123, + "[Rh]": 124, + "[Cl+3]": 125, + "[cH-]": 126, + "[Zn+]": 127, + "[O]": 128, + "[Cl+]": 129, + "[SH]": 130, + "[H+]": 131, + "[Pd+]": 132, + "[se]": 133, + "[PH+]": 134, + "[I]": 135, + "[Pt+2]": 136, + "[C+]": 137, + "[Mg+]": 138, + "[Hg]": 139, + "[W]": 140, + "[SnH]": 141, + "[SiH3]": 142, + "[Fe+3]": 143, + "[NH]": 144, + "[Mo]": 145, + "[CH2+]": 146, + "%10": 147, + "[CH2-]": 148, + "[CH2]": 149, + "[n-]": 150, + "[Ce+4]": 151, + "[NH-]": 152, + "[Co]": 153, + "[I+]": 154, + "[PH2]": 155, + "[Pt+4]": 156, + "[Ce]": 157, + "[B]": 158, + "[Sn+2]": 159, + "[Ba+2]": 160, + "%11": 161, + "[Fe-3]": 162, + "[18F]": 163, + "[SH-]": 164, + "[Pb+2]": 165, + "[Os-2]": 166, + "[Zr+4]": 167, + "[N]": 168, + "[Ir]": 169, + "[Bi]": 170, + "[Ni+2]": 171, + "[P@]": 172, + "[Co+2]": 173, + "[s+]": 174, + "[As]": 175, + "[P+3]": 176, + "[Hg+2]": 177, + "[Yb+3]": 178, + "[CH-]": 179, + "[Zr+2]": 180, + "[Mn+2]": 181, + "[CH+]": 182, + "[In]": 183, + "[KH]": 184, + "[Ce+3]": 185, + "[Zr]": 186, + "[AlH2-]": 187, + "[OH2+]": 188, + "[Ti+3]": 189, + "[Rh+2]": 190, + "[Sb]": 191, + "[S-2]": 192, + "%12": 193, + "[P@@]": 194, + "[Si@H]": 195, + "[Mn+4]": 196, + "p": 197, + "[Ba]": 198, + "[NH2-]": 199, + "[Ge]": 200, + "[Pb+4]": 201, + "[Cr+3]": 202, + "[Au]": 203, + "[LiH]": 204, + "[Sc+3]": 205, + "[o+]": 206, + "[Rh-3]": 207, + "%13": 208, + "[Br]": 209, + "[Sb-]": 210, + "[S@+]": 211, + "[I+2]": 212, + "[Ar]": 213, + "[V]": 214, + "[Cu-]": 215, + "[Al-]": 216, + "[Te]": 217, + "[13c]": 218, + "[13C]": 219, + "[Cl]": 220, + "[PH4+]": 221, + 
"[SiH4]": 222, + "[te]": 223, + "[CH3-]": 224, + "[S@@+]": 225, + "[Rh+3]": 226, + "[SH+]": 227, + "[Bi+3]": 228, + "[Br+2]": 229, + "[La]": 230, + "[La+3]": 231, + "[Pt-2]": 232, + "[N@@]": 233, + "[PH3+]": 234, + "[N@]": 235, + "[Si+4]": 236, + "[Sr+2]": 237, + "[Al+]": 238, + "[Pb]": 239, + "[SeH]": 240, + "[Si-]": 241, + "[V+5]": 242, + "[Y+3]": 243, + "[Re]": 244, + "[Ru+]": 245, + "[Sm]": 246, + "*": 247, + "[3H]": 248, + "[NH2]": 249, + "[Ag-]": 250, + "[13CH3]": 251, + "[OH+]": 252, + "[Ru+3]": 253, + "[OH]": 254, + "[Gd+3]": 255, + "[13CH2]": 256, + "[In+3]": 257, + "[Si@@]": 258, + "[Si@]": 259, + "[Ti+2]": 260, + "[Sn+]": 261, + "[Cl+2]": 262, + "[AlH-]": 263, + "[Pd-2]": 264, + "[SnH3]": 265, + "[B+3]": 266, + "[Cu-2]": 267, + "[Nd+3]": 268, + "[Pb+3]": 269, + "[13cH]": 270, + "[Fe-4]": 271, + "[Ga]": 272, + "[Sn+4]": 273, + "[Hg+]": 274, + "[11CH3]": 275, + "[Hf]": 276, + "[Pr]": 277, + "[Y]": 278, + "[S+2]": 279, + "[Cd]": 280, + "[Cr+6]": 281, + "[Zr+3]": 282, + "[Rh+]": 283, + "[CH3]": 284, + "[N-3]": 285, + "[Hf+2]": 286, + "[Th]": 287, + "[Sb+3]": 288, + "%14": 289, + "[Cr+2]": 290, + "[Ru+2]": 291, + "[Hf+4]": 292, + "[14C]": 293, + "[Ta]": 294, + "[Tl+]": 295, + "[B+]": 296, + "[Os+4]": 297, + "[PdH2]": 298, + "[Pd-]": 299, + "[Cd+2]": 300, + "[Co+3]": 301, + "[S+4]": 302, + "[Nb+5]": 303, + "[123I]": 304, + "[c+]": 305, + "[Rb+]": 306, + "[V+2]": 307, + "[CH3+]": 308, + "[Ag+2]": 309, + "[cH+]": 310, + "[Mn+3]": 311, + "[Se-]": 312, + "[As-]": 313, + "[Eu+3]": 314, + "[SH2]": 315, + "[Sm+3]": 316, + "[IH+]": 317, + "%15": 318, + "[OH3+]": 319, + "[PH3]": 320, + "[IH2+]": 321, + "[SH2+]": 322, + "[Ir+3]": 323, + "[AlH3]": 324, + "[Sc]": 325, + "[Yb]": 326, + "[15NH2]": 327, + "[Lu]": 328, + "[sH+]": 329, + "[Gd]": 330, + "[18F-]": 331, + "[SH3+]": 332, + "[SnH4]": 333, + "[TeH]": 334, + "[Si@@H]": 335, + "[Ga+3]": 336, + "[CaH2]": 337, + "[Tl]": 338, + "[Ta+5]": 339, + "[GeH]": 340, + "[Br+]": 341, + "[Sr]": 342, + "[Tl+3]": 343, + "[Sm+2]": 
344, + "[PH5]": 345, + "%16": 346, + "[N@@+]": 347, + "[Au+3]": 348, + "[C-4]": 349, + "[Nd]": 350, + "[Ti+]": 351, + "[IH]": 352, + "[N@+]": 353, + "[125I]": 354, + "[Eu]": 355, + "[Sn+3]": 356, + "[Nb]": 357, + "[Er+3]": 358, + "[123I-]": 359, + "[14c]": 360, + "%17": 361, + "[SnH2]": 362, + "[YH]": 363, + "[Sb+5]": 364, + "[Pr+3]": 365, + "[Ir+]": 366, + "[N+3]": 367, + "[AlH2]": 368, + "[19F]": 369, + "%18": 370, + "[Tb]": 371, + "[14CH]": 372, + "[Mo+4]": 373, + "[Si+]": 374, + "[BH]": 375, + "[Be]": 376, + "[Rb]": 377, + "[pH]": 378, + "%19": 379, + "%20": 380, + "[Xe]": 381, + "[Ir-]": 382, + "[Be+2]": 383, + "[C+4]": 384, + "[RuH2]": 385, + "[15NH]": 386, + "[U+2]": 387, + "[Au-]": 388, + "%21": 389, + "%22": 390, + "[Au+]": 391, + "[15n]": 392, + "[Al+2]": 393, + "[Tb+3]": 394, + "[15N]": 395, + "[V+3]": 396, + "[W+6]": 397, + "[14CH3]": 398, + "[Cr+4]": 399, + "[ClH+]": 400, + "b": 401, + "[Ti+6]": 402, + "[Nd+]": 403, + "[Zr+]": 404, + "[PH2+]": 405, + "[Fm]": 406, + "[N@H+]": 407, + "[RuH]": 408, + "[Dy+3]": 409, + "%23": 410, + "[Hf+3]": 411, + "[W+4]": 412, + "[11C]": 413, + "[13CH]": 414, + "[Er]": 415, + "[124I]": 416, + "[LaH]": 417, + "[F]": 418, + "[siH]": 419, + "[Ga+]": 420, + "[Cm]": 421, + "[GeH3]": 422, + "[IH-]": 423, + "[U+6]": 424, + "[SeH+]": 425, + "[32P]": 426, + "[SeH-]": 427, + "[Pt-]": 428, + "[Ir+2]": 429, + "[se+]": 430, + "[U]": 431, + "[F+]": 432, + "[BH2]": 433, + "[As+]": 434, + "[Cf]": 435, + "[ClH2+]": 436, + "[Ni+]": 437, + "[TeH3]": 438, + "[SbH2]": 439, + "[Ag+3]": 440, + "%24": 441, + "[18O]": 442, + "[PH4]": 443, + "[Os+2]": 444, + "[Na-]": 445, + "[Sb+2]": 446, + "[V+4]": 447, + "[Ho+3]": 448, + "[68Ga]": 449, + "[PH-]": 450, + "[Bi+2]": 451, + "[Ce+2]": 452, + "[Pd+3]": 453, + "[99Tc]": 454, + "[13C@@H]": 455, + "[Fe+6]": 456, + "[c]": 457, + "[GeH2]": 458, + "[10B]": 459, + "[Cu+3]": 460, + "[Mo+2]": 461, + "[Cr+]": 462, + "[Pd+4]": 463, + "[Dy]": 464, + "[AsH]": 465, + "[Ba+]": 466, + "[SeH2]": 467, + "[In+]": 468, 
+ "[TeH2]": 469, + "[BrH+]": 470, + "[14cH]": 471, + "[W+]": 472, + "[13C@H]": 473, + "[AsH2]": 474, + "[In+2]": 475, + "[N+2]": 476, + "[N@@H+]": 477, + "[SbH]": 478, + "[60Co]": 479, + "[AsH4+]": 480, + "[AsH3]": 481, + "[18OH]": 482, + "[Ru-2]": 483, + "[Na-2]": 484, + "[CuH2]": 485, + "[31P]": 486, + "[Ti+5]": 487, + "[35S]": 488, + "[P@@H]": 489, + "[ArH]": 490, + "[Co+]": 491, + "[Zr-2]": 492, + "[BH2-]": 493, + "[131I]": 494, + "[SH5]": 495, + "[VH]": 496, + "[B+2]": 497, + "[Yb+2]": 498, + "[14C@H]": 499, + "[211At]": 500, + "[NH3+2]": 501, + "[IrH]": 502, + "[IrH2]": 503, + "[Rh-]": 504, + "[Cr-]": 505, + "[Sb+]": 506, + "[Ni+3]": 507, + "[TaH3]": 508, + "[Tl+2]": 509, + "[64Cu]": 510, + "[Tc]": 511, + "[Cd+]": 512, + "[1H]": 513, + "[15nH]": 514, + "[AlH2+]": 515, + "[FH+2]": 516, + "[BiH3]": 517, + "[Ru-]": 518, + "[Mo+6]": 519, + "[AsH+]": 520, + "[BaH2]": 521, + "[BaH]": 522, + "[Fe+4]": 523, + "[229Th]": 524, + "[Th+4]": 525, + "[As+3]": 526, + "[NH+3]": 527, + "[P@H]": 528, + "[Li-]": 529, + "[7NaH]": 530, + "[Bi+]": 531, + "[PtH+2]": 532, + "[p-]": 533, + "[Re+5]": 534, + "[NiH]": 535, + "[Ni-]": 536, + "[Xe+]": 537, + "[Ca+]": 538, + "[11c]": 539, + "[Rh+4]": 540, + "[AcH]": 541, + "[HeH]": 542, + "[Sc+2]": 543, + "[Mn+]": 544, + "[UH]": 545, + "[14CH2]": 546, + "[SiH4+]": 547, + "[18OH2]": 548, + "[Ac-]": 549, + "[Re+4]": 550, + "[118Sn]": 551, + "[153Sm]": 552, + "[P+2]": 553, + "[9CH]": 554, + "[9CH3]": 555, + "[Y-]": 556, + "[NiH2]": 557, + "[Si+2]": 558, + "[Mn+6]": 559, + "[ZrH2]": 560, + "[C-2]": 561, + "[Bi+5]": 562, + "[24NaH]": 563, + "[Fr]": 564, + "[15CH]": 565, + "[Se+]": 566, + "[At]": 567, + "[P-3]": 568, + "[124I-]": 569, + "[CuH2-]": 570, + "[Nb+4]": 571, + "[Nb+3]": 572, + "[MgH]": 573, + "[Ir+4]": 574, + "[67Ga+3]": 575, + "[67Ga]": 576, + "[13N]": 577, + "[15OH2]": 578, + "[2NH]": 579, + "[Ho]": 580, + "[Cn]": 581 + }, + "merges": [] + } +} \ No newline at end of file diff --git 
a/src/lobster/assets/smiles_tokenizer/tokenizer_config.json b/src/lobster/assets/smiles_tokenizer/tokenizer_config.json new file mode 100644 index 0000000..16a5289 --- /dev/null +++ b/src/lobster/assets/smiles_tokenizer/tokenizer_config.json @@ -0,0 +1,47 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "582": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sep_token": "", + "tokenizer_class": "PreTrainedTokenizerFast", + "truncation_side": "left", + "unk_token": "" +} diff --git a/src/lobster/assets/smiles_tokenizer/vocab.txt b/src/lobster/assets/smiles_tokenizer/vocab.txt new file mode 100644 index 0000000..539c2e1 --- /dev/null +++ b/src/lobster/assets/smiles_tokenizer/vocab.txt @@ -0,0 +1,582 @@ + + + + + + +c +C +( +) +O +1 +2 += +N +. 
+n +3 +F +Cl +>> +~ +- +4 +[C@H] +S +[C@@H] +[O-] +Br +# +/ +[nH] +[N+] +s +5 +o +P +[Na+] +[Si] +I +[Na] +[Pd] +[K+] +[K] +[P] +B +[C@] +[C@@] +[Cl-] +6 +[OH-] +\ +[N-] +[Li] +[H] +[2H] +[NH4+] +[c-] +[P-] +[Cs+] +[Li+] +[Cs] +[NaH] +[H-] +[O+] +[BH4-] +[Cu] +7 +[Mg] +[Fe+2] +[n+] +[Sn] +[BH-] +[Pd+2] +[CH] +[I-] +[Br-] +[C-] +[Zn] +[B-] +[F-] +[Al] +[P+] +[BH3-] +[Fe] +[C] +[AlH4] +[Ni] +[SiH] +8 +[Cu+2] +[Mn] +[AlH] +[nH+] +[AlH4-] +[O-2] +[Cr] +[Mg+2] +[NH3+] +[S@] +[Pt] +[Al+3] +[S@@] +[S-] +[Ti] +[Zn+2] +[PH] +[NH2+] +[Ru] +[Ag+] +[S+] +[I+3] +[NH+] +[Ca+2] +[Ag] +9 +[Os] +[Se] +[SiH2] +[Ca] +[Ti+4] +[Ac] +[Cu+] +[S] +[Rh] +[Cl+3] +[cH-] +[Zn+] +[O] +[Cl+] +[SH] +[H+] +[Pd+] +[se] +[PH+] +[I] +[Pt+2] +[C+] +[Mg+] +[Hg] +[W] +[SnH] +[SiH3] +[Fe+3] +[NH] +[Mo] +[CH2+] +%10 +[CH2-] +[CH2] +[n-] +[Ce+4] +[NH-] +[Co] +[I+] +[PH2] +[Pt+4] +[Ce] +[B] +[Sn+2] +[Ba+2] +%11 +[Fe-3] +[18F] +[SH-] +[Pb+2] +[Os-2] +[Zr+4] +[N] +[Ir] +[Bi] +[Ni+2] +[P@] +[Co+2] +[s+] +[As] +[P+3] +[Hg+2] +[Yb+3] +[CH-] +[Zr+2] +[Mn+2] +[CH+] +[In] +[KH] +[Ce+3] +[Zr] +[AlH2-] +[OH2+] +[Ti+3] +[Rh+2] +[Sb] +[S-2] +%12 +[P@@] +[Si@H] +[Mn+4] +p +[Ba] +[NH2-] +[Ge] +[Pb+4] +[Cr+3] +[Au] +[LiH] +[Sc+3] +[o+] +[Rh-3] +%13 +[Br] +[Sb-] +[S@+] +[I+2] +[Ar] +[V] +[Cu-] +[Al-] +[Te] +[13c] +[13C] +[Cl] +[PH4+] +[SiH4] +[te] +[CH3-] +[S@@+] +[Rh+3] +[SH+] +[Bi+3] +[Br+2] +[La] +[La+3] +[Pt-2] +[N@@] +[PH3+] +[N@] +[Si+4] +[Sr+2] +[Al+] +[Pb] +[SeH] +[Si-] +[V+5] +[Y+3] +[Re] +[Ru+] +[Sm] +* +[3H] +[NH2] +[Ag-] +[13CH3] +[OH+] +[Ru+3] +[OH] +[Gd+3] +[13CH2] +[In+3] +[Si@@] +[Si@] +[Ti+2] +[Sn+] +[Cl+2] +[AlH-] +[Pd-2] +[SnH3] +[B+3] +[Cu-2] +[Nd+3] +[Pb+3] +[13cH] +[Fe-4] +[Ga] +[Sn+4] +[Hg+] +[11CH3] +[Hf] +[Pr] +[Y] +[S+2] +[Cd] +[Cr+6] +[Zr+3] +[Rh+] +[CH3] +[N-3] +[Hf+2] +[Th] +[Sb+3] +%14 +[Cr+2] +[Ru+2] +[Hf+4] +[14C] +[Ta] +[Tl+] +[B+] +[Os+4] +[PdH2] +[Pd-] +[Cd+2] +[Co+3] +[S+4] +[Nb+5] +[123I] +[c+] +[Rb+] +[V+2] +[CH3+] +[Ag+2] +[cH+] +[Mn+3] +[Se-] +[As-] +[Eu+3] +[SH2] +[Sm+3] +[IH+] 
+%15 +[OH3+] +[PH3] +[IH2+] +[SH2+] +[Ir+3] +[AlH3] +[Sc] +[Yb] +[15NH2] +[Lu] +[sH+] +[Gd] +[18F-] +[SH3+] +[SnH4] +[TeH] +[Si@@H] +[Ga+3] +[CaH2] +[Tl] +[Ta+5] +[GeH] +[Br+] +[Sr] +[Tl+3] +[Sm+2] +[PH5] +%16 +[N@@+] +[Au+3] +[C-4] +[Nd] +[Ti+] +[IH] +[N@+] +[125I] +[Eu] +[Sn+3] +[Nb] +[Er+3] +[123I-] +[14c] +%17 +[SnH2] +[YH] +[Sb+5] +[Pr+3] +[Ir+] +[N+3] +[AlH2] +[19F] +%18 +[Tb] +[14CH] +[Mo+4] +[Si+] +[BH] +[Be] +[Rb] +[pH] +%19 +%20 +[Xe] +[Ir-] +[Be+2] +[C+4] +[RuH2] +[15NH] +[U+2] +[Au-] +%21 +%22 +[Au+] +[15n] +[Al+2] +[Tb+3] +[15N] +[V+3] +[W+6] +[14CH3] +[Cr+4] +[ClH+] +b +[Ti+6] +[Nd+] +[Zr+] +[PH2+] +[Fm] +[N@H+] +[RuH] +[Dy+3] +%23 +[Hf+3] +[W+4] +[11C] +[13CH] +[Er] +[124I] +[LaH] +[F] +[siH] +[Ga+] +[Cm] +[GeH3] +[IH-] +[U+6] +[SeH+] +[32P] +[SeH-] +[Pt-] +[Ir+2] +[se+] +[U] +[F+] +[BH2] +[As+] +[Cf] +[ClH2+] +[Ni+] +[TeH3] +[SbH2] +[Ag+3] +%24 +[18O] +[PH4] +[Os+2] +[Na-] +[Sb+2] +[V+4] +[Ho+3] +[68Ga] +[PH-] +[Bi+2] +[Ce+2] +[Pd+3] +[99Tc] +[13C@@H] +[Fe+6] +[c] +[GeH2] +[10B] +[Cu+3] +[Mo+2] +[Cr+] +[Pd+4] +[Dy] +[AsH] +[Ba+] +[SeH2] +[In+] +[TeH2] +[BrH+] +[14cH] +[W+] +[13C@H] +[AsH2] +[In+2] +[N+2] +[N@@H+] +[SbH] +[60Co] +[AsH4+] +[AsH3] +[18OH] +[Ru-2] +[Na-2] +[CuH2] +[31P] +[Ti+5] +[35S] +[P@@H] +[ArH] +[Co+] +[Zr-2] +[BH2-] +[131I] +[SH5] +[VH] +[B+2] +[Yb+2] +[14C@H] +[211At] +[NH3+2] +[IrH] +[IrH2] +[Rh-] +[Cr-] +[Sb+] +[Ni+3] +[TaH3] +[Tl+2] +[64Cu] +[Tc] +[Cd+] +[1H] +[15nH] +[AlH2+] +[FH+2] +[BiH3] +[Ru-] +[Mo+6] +[AsH+] +[BaH2] +[BaH] +[Fe+4] +[229Th] +[Th+4] +[As+3] +[NH+3] +[P@H] +[Li-] +[7NaH] +[Bi+] +[PtH+2] +[p-] +[Re+5] +[NiH] +[Ni-] +[Xe+] +[Ca+] +[11c] +[Rh+4] +[AcH] +[HeH] +[Sc+2] +[Mn+] +[UH] +[14CH2] +[SiH4+] +[18OH2] +[Ac-] +[Re+4] +[118Sn] +[153Sm] +[P+2] +[9CH] +[9CH3] +[Y-] +[NiH2] +[Si+2] +[Mn+6] +[ZrH2] +[C-2] +[Bi+5] +[24NaH] +[Fr] +[15CH] +[Se+] +[At] +[P-3] +[124I-] +[CuH2-] +[Nb+4] +[Nb+3] +[MgH] +[Ir+4] +[67Ga+3] +[67Ga] +[13N] +[15OH2] +[2NH] +[Ho] +[Cn] \ No newline at end of file diff --git 
a/src/lobster/tokenization/__init__.py b/src/lobster/tokenization/__init__.py index 9fc5246..07636de 100644 --- a/src/lobster/tokenization/__init__.py +++ b/src/lobster/tokenization/__init__.py @@ -1,3 +1,4 @@ +from ._amino_acid import AminoAcidTokenizerFast from ._hyena_tokenizer import HyenaTokenizer from ._hyena_tokenizer_transform import HyenaTokenizerTransform from ._mgm_tokenizer import MgmTokenizer @@ -14,5 +15,4 @@ PT5TeacherForcingTransform, PT5TokenizerTransform, ) - -from ._amino_acid import AminoAcidTokenizerFast +from ._smiles_tokenizer import SmilesTokenizerFast diff --git a/src/lobster/tokenization/_hyena_tokenizer.py b/src/lobster/tokenization/_hyena_tokenizer.py index 6d6f191..7cb4855 100644 --- a/src/lobster/tokenization/_hyena_tokenizer.py +++ b/src/lobster/tokenization/_hyena_tokenizer.py @@ -8,6 +8,8 @@ from transformers.tokenization_utils_base import AddedToken from transformers.utils import logging +from ._load_vocab_file import load_vocab_file + logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} @@ -15,12 +17,6 @@ VOCAB_PATH = importlib.resources.files("lobster") / "assets" / "hyena_tokenizer" / "vocab.txt" -def load_vocab_file(vocab_file): - with open(vocab_file, "r") as f: - lines = f.read().splitlines() - return [ll.strip() for ll in lines] - - class HyenaTokenizer(PreTrainedTokenizer): """ Constructs a Hyena tokenizer. 
def load_vocab_file(vocab_file: str) -> list[str]:
    """Read a one-token-per-line vocabulary file.

    Args:
        vocab_file: Path to the vocabulary file; anything accepted by the
            built-in ``open`` works.

    Returns:
        The tokens in file order, each with surrounding whitespace removed.
        Blank lines are kept (as empty strings) because callers derive token
        ids from a token's position in the returned list.
    """
    # Pin the encoding: the platform default is not UTF-8 everywhere, and a
    # vocabulary must decode identically on every machine.
    with open(vocab_file, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    return [line.strip() for line in lines]
import importlib.resources
from typing import Optional

from tokenizers import Regex, Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Split
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

from ._load_vocab_file import load_vocab_file

# One alternative per SMILES token: bracket atoms, organic-subset atoms,
# aromatic atoms, bonds, branches, ring closures and the reaction arrow ">>".
SMILES_REGEX_PATTERN = (
    r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
)

PRETRAINED_TOKENIZER_PATH = importlib.resources.files("lobster") / "assets" / "smiles_tokenizer"
VOCAB_PATH = PRETRAINED_TOKENIZER_PATH / "vocab.txt"


# NOTE(review): the special-token spellings below must match the first six
# entries of vocab.txt (ids 0-5 in the post-processor) — confirm against the
# packaged asset.
def _make_smiles_tokenizer(
    cls_token: str = "<cls>",
    eos_token: str = "<eos>",
    unk_token: str = "<unk>",
    pad_token: str = "<pad>",
    sep_token: str = "<sep>",
    mask_token: str = "<mask>",
    vocab_file: Optional[str] = None,
) -> PreTrainedTokenizerFast:
    """Build a SMILES tokenizer from a vocabulary file.

    The input is split into SMILES tokens with ``SMILES_REGEX_PATTERN`` and
    each piece is looked up in the vocabulary by a merge-free BPE model.
    Single sequences are wrapped as ``cls A eos`` and pairs as
    ``cls A eos B eos``, with the second segment given type id 1.

    Args:
        cls_token: Token placed at the start of every encoded sequence.
        eos_token: Token placed at the end of every segment.
        unk_token: Token the BPE model emits for out-of-vocabulary pieces.
        pad_token: Padding token.
        sep_token: Separator token.
        mask_token: Mask token.
        vocab_file: Vocabulary file to load; the packaged ``vocab.txt`` is
            used when this is ``None``.

    Returns:
        A ``PreTrainedTokenizerFast`` wrapping the assembled tokenizer.
    """
    tokens = load_vocab_file(VOCAB_PATH if vocab_file is None else vocab_file)
    # A token's id is its line position in the vocabulary file.
    vocab = {token: index for index, token in enumerate(tokens)}

    # Use the caller's unk_token rather than a hard-coded literal so a custom
    # spelling reaches the model as well as the wrapper.
    tok = Tokenizer(BPE(vocab, merges=[], unk_token=unk_token, ignore_merges=True))

    tok.pre_tokenizer = Split(pattern=Regex(SMILES_REGEX_PATTERN), behavior="isolated")

    tok.post_processor = TemplateProcessing(
        single=f"{cls_token} $A {eos_token}",
        pair=f"{cls_token} $A {eos_token} $B:1 {eos_token}:1",
        # The ids are positional: they assume the vocabulary starts with
        # pad, unk, cls, sep, mask, eos in exactly this order.
        special_tokens=[
            (pad_token, 0),
            (unk_token, 1),
            (cls_token, 2),
            (sep_token, 3),
            (mask_token, 4),
            (eos_token, 5),
        ],
    )

    return PreTrainedTokenizerFast(
        tokenizer_object=tok,
        padding_side="right",
        truncation_side="left",
        bos_token=None,
        sep_token=sep_token,
        eos_token=eos_token,
        pad_token=pad_token,
        unk_token=unk_token,
        # Register cls/mask too, matching what SmilesTokenizerFast declares.
        cls_token=cls_token,
        mask_token=mask_token,
    )


class SmilesTokenizerFast(PreTrainedTokenizerFast):
    """Fast SMILES tokenizer loaded from the packaged ``tokenizer.json``."""

    padding_side = "right"
    # NOTE(review): _make_smiles_tokenizer and the shipped tokenizer_config.json
    # use truncation_side="left"; confirm which side is intended.
    truncation_side = "right"
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self):
        """Load the packaged tokenizer and register its special tokens."""
        super().__init__(
            tokenizer_file=str(PRETRAINED_TOKENIZER_PATH / "tokenizer.json"),
            bos_token=None,
            eos_token="<eos>",
            unk_token="<unk>",
            sep_token="<sep>",
            pad_token="<pad>",
            cls_token="<cls>",
            mask_token="<mask>",
        )