SAFEDesign
Molecular generation using SAFE pretrained model
Source code in `safe/sample.py`
__init__(model, tokenizer, generation_config=None, safe_encoder=None, verbose=True)
SAFEDesign constructor

Info

Design methods in SAFE are not deterministic at the token sampling step. If a method accepts a `random_seed`, that seed applies to the SAFE-related algorithms, not to the sampling from the autoregressive model. To get deterministic sampling, please set the seed at the `transformers` package level.
```python
import safe as sf
import transformers

my_seed = 100
designer = sf.SAFEDesign(...)

transformers.set_seed(my_seed)  # use this before calling a design function
designer.linker_generation(...)
```
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model` | `Union[SAFEDoubleHeadsModel, str]` | input SAFEDoubleHeadsModel to use for generation | required |
| `tokenizer` | `Union[str, SAFETokenizer]` | input SAFETokenizer to use for generation | required |
| `generation_config` | `Optional[Union[str, GenerationConfig]]` | input GenerationConfig to use for generation | `None` |
| `safe_encoder` | `Optional[SAFEConverter]` | custom safe encoder to use | `None` |
| `verbose` | `bool` | whether to print out logging information during generation | `True` |
Source code in `safe/sample.py`
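For illustration, a minimal constructor sketch (not taken from the library's own examples): it assumes that passing strings for `model` and `tokenizer` is enough to load them, and reuses the `datamol-io/safe-gpt` identifier listed in the datasets and models table further down this page.

```python
import safe as sf

# Illustrative only: both `model` and `tokenizer` accept either loaded objects
# or strings (see the Union types above). "datamol-io/safe-gpt" is the default
# model referenced elsewhere in these docs; swap in your own checkpoint or path.
designer = sf.SAFEDesign(
    model="datamol-io/safe-gpt",      # Union[SAFEDoubleHeadsModel, str]
    tokenizer="datamol-io/safe-gpt",  # Union[str, SAFETokenizer]
    generation_config=None,           # optional GenerationConfig
    safe_encoder=None,                # optional custom SAFEConverter
    verbose=True,
)
```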
__mix_sequences(prefix_sequences, suffix_sequences, prefix, suffix, n_samples, mol_linker_slicer)
Use the generated prefix and suffix sequences to form new molecules that merge both. This is the two-step scheme behind scaffold morphing and linker generation.

Args:

- `prefix_sequences`: list of prefix sequences
- `suffix_sequences`: list of suffix sequences
- `prefix`: decoded SMILES of the prefix
- `suffix`: decoded SMILES of the suffix
- `n_samples`: number of samples to generate
Source code in `safe/sample.py`
de_novo_generation(n_samples_per_trial=10, sanitize=False, n_trials=None, **kwargs)
Perform de novo generation using the pretrained SAFE model.

De novo generation is equivalent to not having any prefix.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `n_samples_per_trial` | `int` | number of new molecules to generate | `10` |
| `sanitize` | `bool` | whether to perform sanitization, i.e., check that what is returned matches what was asked | `False` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
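A short usage sketch, assuming the default pretrained model loaded through `load_default` (documented below) and assuming the call returns a list of generated SMILES strings.

```python
import safe as sf

# Load the default pretrained designer (see `load_default` below).
designer = sf.SAFEDesign.load_default(verbose=False)

# Three independent trials of 10 molecules each; sanitize the outputs.
generated = designer.de_novo_generation(
    n_samples_per_trial=10,
    n_trials=3,
    sanitize=True,
)
print(len(generated), generated[:3])
```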
linker_generation(*groups, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, model_only=False, **kwargs)
Perform linker generation using the pretrained SAFE model.

Linker generation is essentially scaffold morphing under the hood.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `groups` | `Union[str, Mol]` | list of fragments to link together; they are joined in the order provided | `()` |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `model_only` | `Optional[bool]` | whether to use only the model's generation ability and nothing more | `False` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
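A hedged sketch of linking two fragments. The fragment SMILES are arbitrary placeholders (depending on your setup they may need explicit attachment points), and the seed only affects the SAFE-side randomization, as noted in the constructor's Info box.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Placeholder fragments, joined in the order they are passed.
frag1 = "c1ccccc1C(=O)N"  # benzamide-like fragment (illustrative)
frag2 = "C1CCNCC1"        # piperidine fragment (illustrative)

linked = designer.linker_generation(
    frag1,
    frag2,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
    random_seed=42,  # seeds the SAFE-side algorithms, not the token sampling
)
```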
load_default(verbose=False, model_dir=None, device=None)
classmethod

Load the default SAFEGenerator model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `verbose` | `bool` | whether to print out logging information during generation | `False` |
| `model_dir` | `Optional[str]` | optional path to a model folder to use instead of the default one; if provided, the tokenizer should also be present in `model_dir` | `None` |
| `device` | `str` | optional device where to move the model | `None` |
Source code in `safe/sample.py`
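A short sketch of the classmethod; the local checkpoint path is hypothetical and only shows the shape of the call, and the device string follows the usual torch convention.

```python
import safe as sf

# Default pretrained model, moved to the GPU.
designer = sf.SAFEDesign.load_default(verbose=False, device="cuda")

# Or load from a local folder (hypothetical path) that also contains the tokenizer.
local_designer = sf.SAFEDesign.load_default(model_dir="./my_safe_checkpoint", device="cpu")
```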
motif_extension(motif, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
Perform motif extension using the pretrained SAFE model.

Motif extension is essentially scaffold decoration under the hood.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `motif` | `Union[str, Mol]` | scaffold (with attachment points) to decorate | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
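A sketch of motif extension; the motif SMILES with its `[*]` attachment point is an illustrative placeholder, not an official example.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative motif: a naphthalene fragment with one open attachment point.
motif = "[*]c1ccc2ccccc2c1"

extended = designer.motif_extension(
    motif,
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
    random_seed=0,
)
```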
scaffold_decoration(scaffold, n_samples_per_trial=10, n_trials=1, do_not_fragment_further=True, sanitize=False, random_seed=None, add_dot=True, **kwargs)
Perform scaffold decoration using the pretrained SAFE model.

For scaffold decoration, we basically start from a prefix carrying the attachment points and first convert that prefix into a valid SAFE string.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `scaffold` | `Union[str, Mol]` | scaffold (with attachment points) to decorate | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules and check that the scaffold is still present | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
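A sketch of scaffold decoration; the scaffold SMILES with two `[*]` attachment points is made up for illustration.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative scaffold with two open attachment points.
scaffold = "[*]c1ccc(CC(=O)N[*])cc1"

decorated = designer.scaffold_decoration(
    scaffold,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,   # also checks that the scaffold is still present
    random_seed=42,
)
```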
scaffold_morphing(side_chains=None, mol=None, core=None, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
Perform scaffold morphing decoration using the pretrained SAFE model.

For scaffold morphing, we try to replace the core with a new one. If side chains are provided, we use them directly; if a combination of molecule and core is provided instead, we use them to extract the side chains before performing the scaffold morphing.

Finding the side chains

The algorithm that finds the side chains from the core assumes the input core has attachment points. Those attachment points are never considered part of the query; they only define where the side chains attach. See `~sf.utils.compute_side_chains` for more information.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `side_chains` | `Optional[Union[Mol, str, List[Union[str, Mol]]]]` | side chains to use for scaffold morphing (the set of fragments is joined as well as possible) | `None` |
| `mol` | `Optional[Union[Mol, str]]` | input molecule when side_chains are not provided | `None` |
| `core` | `Optional[Union[Mol, str]]` | core to morph into another scaffold | `None` |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
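A sketch of scaffold morphing from a molecule/core pair; both SMILES are placeholders, and the core carries the attachment points required by the side-chain extraction described above.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Placeholder inputs: a full molecule and the core (with attachment points)
# to be replaced; the side chains are extracted internally from this pair.
mol = "CC(=O)Nc1ccc(OCCN2CCCC2)cc1"
core = "[*]c1ccc([*])cc1"

morphed = designer.scaffold_morphing(
    mol=mol,
    core=core,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
)
```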
super_structure(core, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, attachment_point_depth=None, **kwargs)
Perform super structure generation using the pretrained SAFE model.

To generate a super structure, we basically just create various attachment points on the input core, then perform scaffold decoration.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `core` | `Union[str, Mol]` | input substructure to use; we aim to generate super structures of this molecule | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of different attachment points to consider | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `attachment_point_depth` | `Optional[int]` | depth of opening the attachment points; increasing it increases the number of substitution points to consider | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
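A sketch of super structure generation around a small core; the quinoline core and the `attachment_point_depth` value are illustrative choices.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative core: we want generated molecules to contain this substructure.
core = "c1ccc2ncccc2c1"  # quinoline (placeholder)

supers = designer.super_structure(
    core,
    n_samples_per_trial=10,
    n_trials=2,                # number of attachment points to consider
    sanitize=True,
    attachment_point_depth=3,  # illustrative depth for opening attachment points
)
```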
Paper | Docs | 🤗 Model | 🤗 Training Dataset
"},{"location":"index.html#overview-of-safe","title":"Overview of SAFE","text":"
SAFE is the deep learning molecular representation. It's an encoding leveraging a peculiarity in the decoding schemes of SMILES, to allow representation of molecules as a contiguous sequence of connected fragments. SAFE strings are valid SMILES strings, and thus are able to preserve the same amount of information. The intuitive representation of molecules as an ordered sequence of connected fragments greatly simplifies the following tasks often encountered in molecular design:
The construction of a SAFE strings requires defining a molecular fragmentation algorithm. By default, we use [BRICS], but any other fragmentation algorithm can be used. The image below illustrates the process of building a SAFE string. The resulting string is a valid SMILES that can be read by datamol or RDKit.
"},{"location":"index.html#news","title":"News \ud83d\ude80","text":""},{"location":"index.html#20240115","title":"\ud83d\udca5 2024/01/15 \ud83d\udca5","text":"You can install safe
using pip:
pip install safe-mol\n
You can use conda/mamba:
mamba install -c conda-forge safe-mol\n
"},{"location":"index.html#datasets-and-models","title":"Datasets and Models","text":"Type Name Infos Size Comment Model datamol-io/safe-gpt 87M params 350M Default model Training Dataset datamol-io/safe-gpt 1.1B rows 250GB Training dataset Drug Benchmark Dataset datamol-io/safe-drugs 26 rows 20 kB Benchmarking dataset"},{"location":"index.html#usage","title":"Usage","text":"The tutorials in the documentation can help you get started with safe
and SAFE-GPT
.
We summarize some key functions provided by the safe
package below.
| Function | Description |
|----------|-------------|
| `safe.encode` | Translates a SMILES string into its corresponding SAFE string. |
| `safe.decode` | Translates a SAFE string into its corresponding SMILES string. The SAFE decoder simply augments RDKit's `Chem.MolFromSmiles` with an optional correction argument to take care of missing hydrogen bonds. |
| `safe.split` | Tokenizes a SAFE string to build a generative model. |

Examples

Translation between SAFE and SMILES representations

```python
import safe

ibuprofen = "CC(Cc1ccc(cc1)C(C(=O)O)C)C"

# SMILES -> SAFE -> SMILES translation
try:
    ibuprofen_sf = safe.encode(ibuprofen)  # c12ccc3cc1.C3(C)C(=O)O.CC(C)C2
    ibuprofen_smi = safe.decode(ibuprofen_sf, canonical=True)  # CC(C)Cc1ccc(C(C)C(=O)O)cc1
except safe.EncoderError:
    pass
except safe.DecoderError:
    pass

ibuprofen_tokens = list(safe.split(ibuprofen_sf))
```
"},{"location":"index.html#trainingfinetuning-a-new-model","title":"Training/Finetuning a (new) model","text":"A command line interface is available to train a new model, please run safe-train --help
. You can also provide an existing checkpoint to continue training or finetune on you own dataset.
For example:
safe-train --config <path to config> \\\n --model-path <path to model> \\\n --tokenizer <path to tokenizer> \\\n --dataset <path to dataset> \\\n --num_labels 9 \\\n --torch_compile True \\\n --optim \"adamw_torch\" \\\n --learning_rate 1e-5 \\\n --prop_loss_coeff 1e-3 \\\n --gradient_accumulation_steps 1 \\\n --output_dir \"<path to outputdir>\" \\\n --max_steps 5\n
"},{"location":"index.html#references","title":"References","text":"If you use this repository, please cite the following related paper:
@misc{noutahi2023gotta,\n title={Gotta be SAFE: A New Framework for Molecular Design},\n author={Emmanuel Noutahi and Cristian Gabellini and Michael Craig and Jonathan S. C Lim and Prudencio Tossou},\n year={2023},\n eprint={2310.10773},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n
"},{"location":"index.html#license","title":"License","text":"Note that all data and model weights of SAFE are exclusively licensed for research purposes. The accompanying dataset is licensed under CC BY 4.0, which permits solely non-commercial usage. See DATA_LICENSE for details.
This code base is licensed under the Apache-2.0 license. See LICENSE for details.
"},{"location":"index.html#development-lifecycle","title":"Development lifecycle","text":""},{"location":"index.html#setup-dev-environment","title":"Setup dev environment","text":"mamba create -n safe -f env.yml\nmamba activate safe\n\npip install --no-deps -e .\n
"},{"location":"index.html#tests","title":"Tests","text":"You can run tests locally with:
pytest\n
"},{"location":"cli.html","title":"CLI for model Training","text":"You can train a new SAFE
generative models using the provided CLI, which uses \ud83e\udd17 Transformers !
Usage:
safe-train [-h] [--model_path MODEL_PATH] [--config CONFIG] [--tokenizer TOKENIZER] [--num_labels NUM_LABELS]\n [--include_descriptors [INCLUDE_DESCRIPTORS]] [--no_include_descriptors] [--prop_loss_coeff PROP_LOSS_COEFF]\n [--wandb_project WANDB_PROJECT] [--wandb_watch {gradients,all}] [--cache_dir CACHE_DIR]\n [--torch_dtype {auto,bfloat16,float16,float32}] [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] [--model_max_length MODEL_MAX_LENGTH]\n [--dataset DATASET] [--is_tokenized [IS_TOKENIZED]] [--streaming [STREAMING]] [--text_column TEXT_COLUMN] --output_dir\n OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]\n [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}] [--prediction_loss_only [PREDICTION_LOSS_ONLY]]\n [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]\n [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]\n [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]\n [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]\n [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS]\n [--max_steps MAX_STEPS]\n [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}]\n [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}]\n [--log_level_replica {debug,info,warning,error,critical,passive}] [--log_on_each_node [LOG_ON_EACH_NODE]]\n [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]\n [--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]\n [--no_logging_nan_inf_filter] [--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT]\n [--save_safetensors [SAVE_SAFETENSORS]] [--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]]\n [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]\n [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL]\n [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}] [--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]]\n [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}] [--tpu_num_cores TPU_NUM_CORES]\n [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG [DEBUG ...]] [--dataloader_drop_last [DATALOADER_DROP_LAST]]\n [--eval_steps EVAL_STEPS] [--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME]\n [--disable_tqdm DISABLE_TQDM] [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns]\n [--label_names LABEL_NAMES [LABEL_NAMES ...]] [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]\n [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER] [--ignore_data_skip [IGNORE_DATA_SKIP]]\n [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]\n [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED]\n [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]\n [--optim 
{adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]\n [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]]\n [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]\n [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]\n [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] [--no_dataloader_pin_memory]\n [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics]\n [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] [--push_to_hub [PUSH_TO_HUB]]\n [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID]\n [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] [--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]]\n [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]\n [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]\n [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]\n [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO]\n [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT] [--torch_compile [TORCH_COMPILE]]\n [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]\n
Options:
-h, --help show this help message and exit\n--model_path MODEL_PATH\n Optional model path or model name to use as a starting point for the safe model (default: None)\n--config CONFIG Path to the default config file to use for the safe model (default: None)\n--tokenizer TOKENIZER\n--num_labels NUM_LABELS\n Optional number of labels for the descriptors (default: None)\n--include_descriptors [INCLUDE_DESCRIPTORS]\n Whether to train with descriptors if they are available or Not (default: True)\n--no_include_descriptors\n Whether to train with descriptors if they are available or Not (default: False)\n--prop_loss_coeff PROP_LOSS_COEFF\n coefficient for the propery loss (default: 0.01)\n--wandb_project WANDB_PROJECT\n Name of the wandb project to use to log the SAFE model parameter (default: safe-gpt2)\n--wandb_watch {gradients,all}\n Whether to watch the wandb models or not (default: None)\n--cache_dir CACHE_DIR\n Where do you want to store the pretrained models downloaded from s3 (default: None)\n--torch_dtype {auto,bfloat16,float16,float32}\n Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the dtype will be\n automatically derived from the model's weights. (default: None)\n--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]\n It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights\n are loaded.set True will benefit LLM loading time and RAM consumption. Only valid when loading a pretrained model\n (default: False)\n--model_max_length MODEL_MAX_LENGTH\n Maximum sequence length. Sequences will be right padded (and possibly truncated) up to that value. (default: 1024)\n--dataset DATASET Path to the preprocessed dataset to use for the safe model building (default: None)\n--is_tokenized [IS_TOKENIZED]\n whether the dataset submitted as input is already tokenized or not (default: False)\n--streaming [STREAMING]\n Whether to use a streaming dataset or not (default: False)\n--text_column TEXT_COLUMN\n Column containing text data to process. (default: inputs)\n--output_dir OUTPUT_DIR\n The output directory where the model predictions and checkpoints will be written. (default: None)\n--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]\n Overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint\n directory. (default: False)\n--do_train [DO_TRAIN]\n Whether to run training. (default: False)\n--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)\n--do_predict [DO_PREDICT]\n Whether to run predictions on the test set. (default: False)\n--evaluation_strategy {no,steps,epoch}\n The evaluation strategy to use. (default: no)\n--prediction_loss_only [PREDICTION_LOSS_ONLY]\n When performing evaluation and predictions, only returns the loss. (default: False)\n--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for training. (default: 8)\n--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for evaluation. (default: 8)\n--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE\n Deprecated, the use of `--per_device_train_batch_size` is preferred. Batch size per GPU/TPU core/CPU for training.\n (default: None)\n--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE\n Deprecated, the use of `--per_device_eval_batch_size` is preferred. 
Batch size per GPU/TPU core/CPU for evaluation.\n (default: None)\n--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS\n Number of updates steps to accumulate before performing a backward/update pass. (default: 1)\n--eval_accumulation_steps EVAL_ACCUMULATION_STEPS\n Number of predictions steps to accumulate before moving the tensors to the CPU. (default: None)\n--eval_delay EVAL_DELAY\n Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy.\n (default: 0)\n--learning_rate LEARNING_RATE\n The initial learning rate for AdamW. (default: 5e-05)\n--weight_decay WEIGHT_DECAY\n Weight decay for AdamW if we apply some. (default: 0.0)\n--adam_beta1 ADAM_BETA1\n Beta1 for AdamW optimizer (default: 0.9)\n--adam_beta2 ADAM_BETA2\n Beta2 for AdamW optimizer (default: 0.999)\n--adam_epsilon ADAM_EPSILON\n Epsilon for AdamW optimizer. (default: 1e-08)\n--max_grad_norm MAX_GRAD_NORM\n Max gradient norm. (default: 1.0)\n--num_train_epochs NUM_TRAIN_EPOCHS\n Total number of training epochs to perform. (default: 3.0)\n--max_steps MAX_STEPS\n If > 0: set total number of training steps to perform. Override num_train_epochs. (default: -1)\n--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}\n The scheduler type to use. (default: linear)\n--warmup_ratio WARMUP_RATIO\n Linear warmup over warmup_ratio fraction of total steps. (default: 0.0)\n--warmup_steps WARMUP_STEPS\n Linear warmup over warmup_steps. (default: 0)\n--log_level {debug,info,warning,error,critical,passive}\n Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning',\n 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults\n to 'passive'. (default: passive)\n--log_level_replica {debug,info,warning,error,critical,passive}\n Logger log level to use on replica nodes. Same choices and defaults as ``log_level`` (default: warning)\n--log_on_each_node [LOG_ON_EACH_NODE]\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: True)\n--no_log_on_each_node\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: False)\n--logging_dir LOGGING_DIR\n Tensorboard log dir. (default: None)\n--logging_strategy {no,steps,epoch}\n The logging strategy to use. (default: steps)\n--logging_first_step [LOGGING_FIRST_STEP]\n Log the first global_step (default: False)\n--logging_steps LOGGING_STEPS\n Log every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted as\n ratio of total training steps. (default: 500)\n--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]\n Filter nan and inf losses for logging. (default: True)\n--no_logging_nan_inf_filter\n Filter nan and inf losses for logging. (default: False)\n--save_strategy {no,steps,epoch}\n The checkpoint save strategy to use. (default: steps)\n--save_steps SAVE_STEPS\n Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be\n interpreted as ratio of total training steps. (default: 500)\n--save_total_limit SAVE_TOTAL_LIMIT\n If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. 
When\n `load_best_model_at_end` is enabled, the 'best' checkpoint according to `metric_for_best_model` will always be retained in\n addition to the most recent ones. For example, for `save_total_limit=5` and `load_best_model_at_end=True`, the four last\n checkpoints will always be retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,\n it is possible that two checkpoints are saved: the last one and the best one (if they are different). Default is unlimited\n checkpoints (default: None)\n--save_safetensors [SAVE_SAFETENSORS]\n Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: False)\n--save_on_each_node [SAVE_ON_EACH_NODE]\n When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one\n (default: False)\n--no_cuda [NO_CUDA] Do not use CUDA even when it is available (default: False)\n--use_mps_device [USE_MPS_DEVICE]\n This argument is deprecated. `mps` device will be used if available similar to `cuda` device. It will be removed in\n version 5.0 of \ud83e\udd17 Transformers (default: False)\n--seed SEED Random seed that will be set at the beginning of training. (default: 42)\n--data_seed DATA_SEED\n Random seed to be used with data samplers. (default: None)\n--jit_mode_eval [JIT_MODE_EVAL]\n Whether or not to use PyTorch jit trace for inference (default: False)\n--use_ipex [USE_IPEX]\n Use Intel extension for PyTorch when it is available, installation: 'https://github.com/intel/intel-extension-for-pytorch'\n (default: False)\n--bf16 [BF16] Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or using CPU\n (no_cuda). This is an experimental API and it may change. (default: False)\n--fp16 [FP16] Whether to use fp16 (mixed) precision instead of 32-bit (default: False)\n--fp16_opt_level FP16_OPT_LEVEL\n For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at\n https://nvidia.github.io/apex/amp.html (default: O1)\n--half_precision_backend {auto,cuda_amp,apex,cpu_amp}\n The backend to be used for half precision. (default: auto)\n--bf16_full_eval [BF16_FULL_EVAL]\n Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may change. (default: False)\n--fp16_full_eval [FP16_FULL_EVAL]\n Whether to use full float16 evaluation instead of 32-bit (default: False)\n--tf32 TF32 Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API and it may\n change. (default: None)\n--local_rank LOCAL_RANK\n For distributed training: local_rank (default: -1)\n--ddp_backend {nccl,gloo,mpi,ccl}\n The backend to be used for distributed training (default: None)\n--tpu_num_cores TPU_NUM_CORES\n TPU: Number of TPU cores (automatically passed by launcher script) (default: None)\n--tpu_metrics_debug [TPU_METRICS_DEBUG]\n Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics (default: False)\n--debug DEBUG [DEBUG ...]\n Whether or not to enable debug mode. Current options: `underflow_overflow` (Detect underflow and overflow in activations\n and weights), `tpu_metrics_debug` (print debug metrics on TPU). (default: None)\n--dataloader_drop_last [DATALOADER_DROP_LAST]\n Drop the last incomplete batch if it is not divisible by the batch size. (default: False)\n--eval_steps EVAL_STEPS\n Run an evaluation every X steps. 
Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted\n as ratio of total training steps. (default: None)\n--dataloader_num_workers DATALOADER_NUM_WORKERS\n Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.\n (default: 0)\n--past_index PAST_INDEX\n If >=0, uses the corresponding part of the output as the past state for next step. (default: -1)\n--run_name RUN_NAME An optional descriptor for the run. Notably used for wandb logging. (default: None)\n--disable_tqdm DISABLE_TQDM\n Whether or not to disable the tqdm progress bars. (default: None)\n--remove_unused_columns [REMOVE_UNUSED_COLUMNS]\n Remove columns not required by the model when using an nlp.Dataset. (default: True)\n--no_remove_unused_columns\n Remove columns not required by the model when using an nlp.Dataset. (default: False)\n--label_names LABEL_NAMES [LABEL_NAMES ...]\n The list of keys in your dictionary of inputs that correspond to the labels. (default: None)\n--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]\n Whether or not to load the best model found during training at the end of training. When this option is enabled, the best\n checkpoint will always be saved. See `save_total_limit` for more. (default: False)\n--metric_for_best_model METRIC_FOR_BEST_MODEL\n The metric to use to compare two different models. (default: None)\n--greater_is_better GREATER_IS_BETTER\n Whether the `metric_for_best_model` should be maximized or not. (default: None)\n--ignore_data_skip [IGNORE_DATA_SKIP]\n When resuming training, whether or not to skip the first epochs and batches to get to the same training data. (default:\n False)\n--sharded_ddp SHARDED_DDP\n Whether or not to use sharded DDP training (in distributed training only). The base option should be `simple`, `zero_dp_2`\n or `zero_dp_3` and you can add CPU-offload to `zero_dp_2` or `zero_dp_3` like this: zero_dp_2 offload` or `zero_dp_3\n offload`. You can add auto-wrap to `zero_dp_2` or `zero_dp_3` with the same syntax: zero_dp_2 auto_wrap` or `zero_dp_3\n auto_wrap`. (default: )\n--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training only). The base option\n should be `full_shard`, `shard_grad_op` or `no_shard` and you can add CPU-offload to `full_shard` or `shard_grad_op` like\n this: full_shard offload` or `shard_grad_op offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the\n same syntax: full_shard auto_wrap` or `shard_grad_op auto_wrap`. (default: )\n--fsdp_min_num_params FSDP_MIN_NUM_PARAMS\n This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp`\n field is passed). (default: 0)\n--fsdp_config FSDP_CONFIG\n Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either afsdp json config file (e.g.,\n `fsdp_config.json`) or an already loaded json file as `dict`. (default: None)\n--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP\n This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`,\n `T5Block` .... (useful only when `fsdp` flag is passed). (default: None)\n--deepspeed DEEPSPEED\n Enable deepspeed and pass the path to deepspeed json config file (e.g. 
ds_config.json) or an already loaded json file as a\n dict (default: None)\n--label_smoothing_factor LABEL_SMOOTHING_FACTOR\n The label smoothing epsilon to apply (zero means no label smoothing). (default: 0.0)\n--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}\n The optimizer to use. (default: adamw_hf)\n--optim_args OPTIM_ARGS\n Optional arguments to supply to optimizer. (default: None)\n--adafactor [ADAFACTOR]\n Whether or not to replace AdamW by Adafactor. (default: False)\n--group_by_length [GROUP_BY_LENGTH]\n Whether or not to group samples of roughly the same length together when batching. (default: False)\n--length_column_name LENGTH_COLUMN_NAME\n Column name with precomputed lengths to use when grouping by length. (default: length)\n--report_to REPORT_TO [REPORT_TO ...]\n The list of integrations to report the results and logs to. (default: None)\n--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS\n When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`.\n (default: None)\n--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB\n When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. (default:\n None)\n--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS\n When using distributed training, the value of the flag `broadcast_buffers` passed to `DistributedDataParallel`. (default:\n None)\n--dataloader_pin_memory [DATALOADER_PIN_MEMORY]\n Whether or not to pin memory for DataLoader. (default: True)\n--no_dataloader_pin_memory\n Whether or not to pin memory for DataLoader. (default: False)\n--skip_memory_metrics [SKIP_MEMORY_METRICS]\n Whether or not to skip adding of memory profiler reports to metrics. (default: True)\n--no_skip_memory_metrics\n Whether or not to skip adding of memory profiler reports to metrics. (default: False)\n--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]\n Whether or not to use the legacy prediction_loop in the Trainer. (default: False)\n--push_to_hub [PUSH_TO_HUB]\n Whether or not to upload the trained model to the model hub after training. (default: False)\n--resume_from_checkpoint RESUME_FROM_CHECKPOINT\n The path to a folder with a valid checkpoint for your model. (default: None)\n--hub_model_id HUB_MODEL_ID\n The name of the repository to keep in sync with the local `output_dir`. (default: None)\n--hub_strategy {end,every_save,checkpoint,all_checkpoints}\n The hub strategy to use when `--push_to_hub` is activated. (default: every_save)\n--hub_token HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--hub_private_repo [HUB_PRIVATE_REPO]\n Whether the model repository is private or not. (default: False)\n--gradient_checkpointing [GRADIENT_CHECKPOINTING]\n If True, use gradient checkpointing to save memory at the expense of slower backward pass. (default: False)\n--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]\n Whether or not the inputs will be passed to the `compute_metrics` function. (default: False)\n--fp16_backend {auto,cuda_amp,apex,cpu_amp}\n Deprecated. Use half_precision_backend instead (default: auto)\n--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID\n The name of the repository to which push the `Trainer`. 
(default: None)\n--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION\n The name of the organization in with to which push the `Trainer`. (default: None)\n--push_to_hub_token PUSH_TO_HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--mp_parameters MP_PARAMETERS\n Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer (default: )\n--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]\n Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory\n was reached (default: False)\n--full_determinism [FULL_DETERMINISM]\n Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training. Important: this\n will negatively impact the performance, so only use it for debugging. (default: False)\n--torchdynamo TORCHDYNAMO\n This argument is deprecated, use `--torch_compile_backend` instead. (default: None)\n--ray_scope RAY_SCOPE\n The scope to use when doing hyperparameter search with Ray. By default, `\"last\"` will be used. Ray will then use the last\n checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray\n documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for\n more options. (default: last)\n--ddp_timeout DDP_TIMEOUT\n Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)\n--torch_compile [TORCH_COMPILE]\n If set to `True`, the model will be wrapped in `torch.compile`. (default: False)\n--torch_compile_backend TORCH_COMPILE_BACKEND\n Which backend to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--torch_compile_mode TORCH_COMPILE_MODE\n Which mode to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--xpu_backend {mpi,ccl,gloo}\n The backend to be used for distributed training on Intel XPU. (default: None)\n
"},{"location":"data_license.html","title":"Data License","text":"# Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nThis work is licensed under the Creative Commons Attribution 4.0 International License.\n\nTo view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.\n
"},{"location":"license.html","title":"License","text":"Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n 1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n 2. Grant of Copyright License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n 3. Grant of Patent License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n 4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n 5. Submission of Contributions. 
Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n 6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n 7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n 8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n 9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\n END OF TERMS AND CONDITIONS\n\n APPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. 
We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\n Copyright 2023 Emmanuel Noutahi\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n
"},{"location":"api/safe.html","title":"SAFE","text":""},{"location":"api/safe.html#safe-encoder-decoder","title":"SAFE Encoder-Decoder","text":""},{"location":"api/safe.html#safe.converter.SAFEConverter","title":"SAFEConverter
","text":"Molecule line notation conversion from SMILES to SAFE
A SAFE representation is a string-based representation of a molecule decomposition into fragment components, separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by itself, unless it is explicitly corrected by adding the missing hydrogens.
Slicing algorithms
By default, SAFE strings are generated using BRICS; however, the following alternatives are supported:
Hussain-Rea (hr)
RECAP (recap)
RDKit's MMPA (mmpa)
Any possible attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return the pairs of atoms corresponding to the bonds to break.
Source code in safe/converter.py
class SAFEConverter:\n \"\"\"Molecule line notation conversion from SMILES to SAFE\n\n A SAFE representation is a string based representation of a molecule decomposition into fragment components,\n separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by themselves,\n unless explicitely correct to add missing hydrogens.\n\n !!! note \"Slicing algorithms\"\n\n By default SAFE strings are generated using `BRICS`, however, the following alternative are supported:\n\n * [Hussain-Rea (`hr`)](https://pubs.acs.org/doi/10.1021/ci900450m)\n * [RECAP (`recap`)](https://pubmed.ncbi.nlm.nih.gov/9611787/)\n * [RDKit's MMPA (`mmpa`)](https://www.rdkit.org/docs/source/rdkit.Chem.rdMMPA.html)\n * Any possible attachment points (`attach`)\n\n Furthermore, you can also provide your own slicing algorithm, which should return a pair of atoms\n corresponding to the bonds to break.\n\n \"\"\"\n\n SUPPORTED_SLICERS = [\"hr\", \"rotatable\", \"recap\", \"mmpa\", \"attach\", \"brics\"]\n __SLICE_SMARTS = {\n \"hr\": [\"[*]!@-[*]\"], # any non ring single bond\n \"recap\": [\n \"[$([C;!$(C([#7])[#7])](=!@[O]))]!@[$([#7;+0;!D1])]\",\n \"[$(C=!@O)]!@[$([O;+0])]\",\n \"[$([N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*]))]-!@[$([*])]\",\n \"[$(C(=!@O)([#7;+0;D2,D3])!@[#7;+0;D2,D3])]!@[$([#7;+0;D2,D3])]\",\n \"[$([O;+0](-!@[#6!$(C=O)])-!@[#6!$(C=O)])]-!@[$([#6!$(C=O)])]\",\n \"C=!@C\",\n \"[N;+1;D4]!@[#6]\",\n \"[$([n;+0])]-!@C\",\n \"[$([O]=[C]-@[N;+0])]-!@[$([C])]\",\n \"c-!@c\",\n \"[$([#7;+0;D2,D3])]-!@[$([S](=[O])=[O])]\",\n ],\n \"mmpa\": [\"[#6+0;!$(*=,#[!#6])]!@!=!#[*]\"], # classical mmpa slicing smarts\n \"attach\": [\"[*]!@[*]\"], # any potential attachment point, including hydrogens when explicit\n \"rotatable\": [\"[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]\"],\n }\n\n def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n ):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n\n @staticmethod\n def randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = 
np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n\n @classmethod\n def _find_branch_number(cls, inp: str):\n \"\"\"Find the branch number and ring closure in the SMILES representation using regexp\n\n Args:\n inp: input smiles\n \"\"\"\n inp = re.sub(\"[\\[].*?[\\]]\", \"\", inp) # noqa\n matching_groups = re.findall(r\"((?<=%)\\d{2})|((?<!%)\\d+)(?![^\\[]*\\])\", inp)\n # first match is for multiple connection as multiple digits\n # second match is for single connections requiring 2 digits\n # SMILES does not support triple digits\n branch_numbers = []\n for m in matching_groups:\n if m[0] == \"\":\n branch_numbers.extend(int(mm) for mm in m[1])\n elif m[1] == \"\":\n branch_numbers.append(int(m[0].replace(\"%\", \"\")))\n return branch_numbers\n\n def _ensure_valid(self, inp: str):\n \"\"\"Ensure that the input SAFE string is valid by fixing the missing attachment points\n\n Args:\n inp: input SAFE string\n\n \"\"\"\n missing_tokens = [inp]\n branch_numbers = self._find_branch_number(inp)\n # only use the set that have exactly 1 element\n # any branch number that is not pairwise should receive a dummy atom to complete the attachment point\n branch_numbers = Counter(branch_numbers)\n for i, (bnum, bcount) in enumerate(branch_numbers.items()):\n if bcount % 2 != 0:\n bnum_str = str(bnum) if bnum < 10 else f\"%{bnum}\"\n _tk = f\"[*:{i+1}]{bnum_str}\"\n if self.use_original_opener_for_attach:\n bnum_digit = bnum_str.strip(\"%\") # strip out the % sign\n _tk = f\"[*:{bnum_digit}]{bnum_str}\"\n missing_tokens.append(_tk)\n return \".\".join(missing_tokens)\n\n def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n ):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. 
Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n\n def _fragment(self, mol: dm.Mol, allow_empty: bool = False):\n \"\"\"\n Perform bond cutting in place for the input molecule, given the slicing algorithm\n\n Args:\n mol: input molecule to split\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n Raises:\n SAFEFragmentationError: if the slicing algorithm return empty bonds\n \"\"\"\n\n if self.slicer is None:\n matching_bonds = []\n\n elif callable(self.slicer):\n matching_bonds = self.slicer(mol)\n matching_bonds = list(matching_bonds)\n\n elif self.slicer == \"brics\":\n matching_bonds = BRICS.FindBRICSBonds(mol)\n matching_bonds = [brics_match[0] for brics_match in matching_bonds]\n\n else:\n matches = set()\n for smarts in self.slicer:\n matches |= {\n tuple(sorted(match)) for match in mol.GetSubstructMatches(smarts, uniquify=True)\n }\n matching_bonds = list(matches)\n\n if matching_bonds is None or len(matching_bonds) == 0 and not allow_empty:\n raise SAFEFragmentationError(\n \"Slicing algorithms did not return any bonds that can be cut !\"\n )\n return matching_bonds or []\n\n def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n ):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. 
Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = 
re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
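A minimal usage sketch of SAFEConverter, assuming the package is installed; the SMILES string is only an illustrative example:
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter(slicer="brics")

# Encode an example SMILES (aspirin) into its SAFE representation
safe_str = converter.encoder("CC(=O)Oc1ccccc1C(=O)O")

# Decode the SAFE string back into a canonical SMILES
smiles = converter.decoder(safe_str, canonical=True)
print(safe_str)
print(smiles)
```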
"},{"location":"api/safe.html#safe.converter.SAFEConverter.__init__","title":"__init__(slicer='brics', require_hs=None, use_original_opener_for_attach=True, ignore_stereo=False)
","text":"Constructor for the SAFE converter
Parameters:
Name Type Description Default
slicer
Optional[Union[str, List[str], Callable]]
slicer algorithm to use for encoding. Can either be one of the supported slicing algorithms (SUPPORTED_SLICERS) or a custom callable that returns the pairs of atom indices of the bonds that can be sliced.
'brics'
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added. The attach
slicer requires adding hydrogens.
None
use_original_opener_for_attach
bool
whether to use the original branch opener digit when adding back mapping number to attachment points, or use simple enumeration.
True
ignore_stereo
bool
whether to ignore stereochemistry; RDKit does not support some particular SAFE subsets when stereochemistry is defined.
False
Source code in safe/converter.py
def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n
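As a sketch of the custom slicer option, the callable is expected to return pairs of atom indices whose connecting bond should be cut; the slicer below (cutting every acyclic single bond) is purely an illustration and not part of the library:
``` py
from safe.converter import SAFEConverter

def cut_acyclic_single_bonds(mol):
    # Return (atom_i, atom_j) pairs for every single bond that is not in a ring
    return [
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        for bond in mol.GetBonds()
        if bond.GetBondTypeAsDouble() == 1.0 and not bond.IsInRing()
    ]

converter = SAFEConverter(slicer=cut_acyclic_single_bonds)
```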
"},{"location":"api/safe.html#safe.converter.SAFEConverter.decoder","title":"decoder(inp, as_mol=False, canonical=False, fix=True, remove_dummies=True, remove_added_hs=True)
","text":"Convert input SAFE representation to smiles
Parameters:
Name Type Description Default
inp
str
input SAFE representation to decode as a valid molecule or smiles
required
as_mol
bool
whether to return a molecule object or a smiles string
False
canonical
bool
whether to return a canonical SMILES
False
fix
bool
whether to fix the SAFE representation to take into account non-connected attachment points
True
remove_dummies
bool
whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with
True
remove_added_hs
bool
whether to remove all the added hydrogen atoms after applying dummy removal for recovery
True
Source code in safe/converter.py
def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n
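A short sketch of decoder in practice; the SAFE string is produced on the fly from an illustrative SMILES so the example stays self-contained:
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter()
safe_str = converter.encoder("CC(=O)Nc1ccc(O)cc1")  # paracetamol, as an example

# Return an RDKit molecule object instead of a SMILES string
mol = converter.decoder(safe_str, as_mol=True)

# Or return a canonical SMILES string
smiles = converter.decoder(safe_str, canonical=True)
```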
"},{"location":"api/safe.html#safe.converter.SAFEConverter.encoder","title":"encoder(inp, canonical=True, randomize=False, seed=None, constraints=None, allow_empty=False, rdkit_safe=True)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Default
inp
Union[str, Mol]
input smiles
required
canonical
bool
whether to return canonical smiles string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the safe string encoding. Will be ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding. Randomization happens in two steps: 1. at the original smiles representation by randomizing the atoms. 2. at the SAFE conversion by randomizing the fragment order
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction. Any bond slicing would happen outside of a substructure matching one of the patterns.
None
allow_empty
bool
whether to allow the slicing algorithm to return empty bonds
False
rdkit_safe
bool
whether to apply rdkit-safe digit standardization to the output SAFE string.
True
Source code in safe/converter.py
def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # 
needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
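The sketch below illustrates the randomization options of encoder: with canonical=False and randomize=True, both the atom order and the fragment order are shuffled, so different seeds will generally yield different SAFE strings for the same molecule (the SMILES is an arbitrary example):
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter(slicer="brics")
smiles = "CC(=O)Oc1ccccc1C(=O)O"

s1 = converter.encoder(smiles, canonical=False, randomize=True, seed=1)
s2 = converter.encoder(smiles, canonical=False, randomize=True, seed=2)
# s1 and s2 are usually different strings, but both decode back to the same molecule
```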
"},{"location":"api/safe.html#safe.converter.SAFEConverter.randomize","title":"randomize(mol, rng=None)
staticmethod
","text":"Randomize the position of the atoms in a mol.
Parameters:
Name Type Description Default
mol
Mol
molecule to randomize
required
rng
Optional[int]
optional seed to use
None
Source code in safe/converter.py
@staticmethod\ndef randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n
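randomize can also be used on its own to shuffle atom numbering before encoding; a minimal sketch with an arbitrarily chosen molecule:
``` py
import datamol as dm
from safe.converter import SAFEConverter

mol = dm.to_mol("COc1ccccc1")  # anisole, illustrative
shuffled = SAFEConverter.randomize(mol, rng=42)  # an int seed builds a numpy Generator internally
print(dm.to_smiles(shuffled, canonical=False))
```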
"},{"location":"api/safe.html#safe.converter.encode","title":"encode(inp, canonical=True, randomize=False, seed=None, slicer=None, require_hs=None, constraints=None, ignore_stereo=False)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Default
inp
Union[str, Mol]
input smiles
required
canonical
bool
whether to return canonical SAFE string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the safe string encoding. Will be ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding.
None
slicer
Optional[Union[List[str], str, Callable]]
slicer algorithm to use for encoding. Defaults to \"brics\".
None
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added.
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction.
None
ignore_stereo
Optional[bool]
whether to ignore stereochemistry; RDKit does not support some particular SAFE subsets when stereochemistry is defined.
False
Source code in safe/converter.py
def encode(\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n slicer: Optional[Union[List[str], str, Callable]] = None,\n require_hs: Optional[bool] = None,\n constraints: Optional[List[dm.Mol]] = None,\n ignore_stereo: Optional[bool] = False,\n):\n \"\"\"\n Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical SAFE string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n slicer: slicer algorithm to use for encoding. Defaults to \"brics\".\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n constraints: List of molecules or pattern to preserve during the SAFE construction.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n \"\"\"\n if slicer is None:\n slicer = \"brics\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs, ignore_stereo=ignore_stereo)\n try:\n encoded = safe_obj.encoder(\n inp,\n canonical=canonical,\n randomize=randomize,\n constraints=constraints,\n seed=seed,\n )\n except SAFEFragmentationError as e:\n raise e\n except Exception as e:\n raise SAFEEncodeError(f\"Failed to encode {inp} with {slicer}\") from e\n return encoded\n
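A quick sketch of the module-level helper; the molecule and the alternative slicer are example choices:
``` py
from safe.converter import encode

smiles = "CC(=O)Oc1ccccc1C(=O)O"             # aspirin, illustrative
safe_brics = encode(smiles)                  # defaults to the "brics" slicer
safe_recap = encode(smiles, slicer="recap")  # alternative slicing algorithm
```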
"},{"location":"api/safe.html#safe.converter.decode","title":"decode(safe_str, as_mol=False, canonical=False, fix=True, remove_added_hs=True, remove_dummies=True, ignore_errors=False)
","text":"Convert input SAFE representation to smiles Args: safe_str: input SAFE representation to decode as a valid molecule or smiles as_mol: whether to return a molecule object or a smiles string canonical: whether to return a canonical smiles or a randomized smiles fix: whether to fix the SAFE representation to take into account non-connected attachment points remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string. remove_dummies: whether to remove dummy atoms from the SAFE representation ignore_errors: whether to ignore error and return None on decoding failure or raise an error
Source code in safe/converter.py
def decode(\n safe_str: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_added_hs: bool = True,\n remove_dummies: bool = True,\n ignore_errors: bool = False,\n):\n \"\"\"Convert input SAFE representation to smiles\n Args:\n safe_str: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical smiles or a randomized smiles\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string.\n remove_dummies: whether to remove dummy atoms from the SAFE representation\n ignore_errors: whether to ignore error and return None on decoding failure or raise an error\n\n \"\"\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter()\n try:\n decoded = safe_obj.decoder(\n safe_str,\n as_mol=as_mol,\n canonical=canonical,\n fix=fix,\n remove_dummies=remove_dummies,\n remove_added_hs=remove_added_hs,\n )\n\n except Exception as e:\n if ignore_errors:\n return None\n raise SAFEDecodeError(f\"Failed to decode {safe_str}\") from e\n return decoded\n
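And the matching decode helper; note how ignore_errors turns decoding failures into None instead of raising a SAFEDecodeError (sketch with an intentionally invalid input):
``` py
from safe.converter import encode, decode

safe_str = encode("CC(=O)Oc1ccccc1C(=O)O")   # illustrative molecule
smiles = decode(safe_str, canonical=True)

broken = decode("this-is-not-a-safe-string", ignore_errors=True)
# `broken` should be None rather than an exception being raised
```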
"},{"location":"api/safe.html#safe-tokenizer","title":"SAFE Tokenizer","text":""},{"location":"api/safe.html#safe.tokenizer.SAFESplitter","title":"SAFESplitter
","text":"Standard Splitter for SAFE string
Source code in safe/tokenizer.py
class SAFESplitter:\n \"\"\"Standard Splitter for SAFE string\"\"\"\n\n REGEX_PATTERN = r\"\"\"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])\"\"\"\n\n name = \"safe\"\n\n def __init__(self, pattern: Optional[str] = None):\n # do not use this as raw strings (not r before)\n if pattern is None:\n pattern = self.REGEX_PATTERN\n self.regex = re.compile(pattern)\n\n def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n\n def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n\n def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n\n def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
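A small sketch of the splitter on its own; the input string is only an example:
``` py
from safe.tokenizer import SAFESplitter

splitter = SAFESplitter()
tokens = splitter.tokenize("c1ccccc1.CC(=O)O")
print(tokens)                       # e.g. ['c', '1', 'c', ...]
print(splitter.detokenize(tokens))  # reassembles the original string
```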
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.detokenize","title":"detokenize(chars)
","text":"Detokenize SAFE notation
Source code in safe/tokenizer.py
def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.pre_tokenize","title":"pre_tokenize(pretok)
","text":"Pretokenize using an input pretokenizer object from the tokenizer library
Source code in safe/tokenizer.py
def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.split","title":"split(n, normalized)
","text":"Perform splitting for pretokenization
Source code in safe/tokenizer.py
def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.tokenize","title":"tokenize(line)
","text":"Tokenize a safe string into characters.
Source code in safe/tokenizer.py
def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer","title":"SAFETokenizer
","text":" Bases: PushToHubMixin
Class to initialize and train a tokenizer for SAFE strings. Once trained, you can convert the tokenizer into a HuggingFace PreTrainedTokenizerFast.
Source code in safe/tokenizer.py
class SAFETokenizer(PushToHubMixin):\n \"\"\"\n Class to initialize and train a tokenizer for SAFE string\n Once trained, you can use the converted version of the tokenizer to an HuggingFace PreTrainedTokenizerFast\n \"\"\"\n\n vocab_files_names: str = \"tokenizer.json\"\n\n def __init__(\n self,\n tokenizer_type: str = \"bpe\",\n splitter: Optional[str] = \"safe\",\n trainer_args=None,\n decoder_args=None,\n token_model_args=None,\n ):\n super().__init__()\n self.tokenizer_type = tokenizer_type\n self.trainer_args = trainer_args or {}\n self.decoder_args = decoder_args or {}\n self.token_model_args = token_model_args or {}\n if tokenizer_type is not None and tokenizer_type.startswith(\"bpe\"):\n self.model = BPE(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = BpeTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n else:\n self.model = WordLevel(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = WordLevelTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n self.tokenizer = Tokenizer(self.model)\n self.splitter = None\n if splitter == \"safe\":\n self.splitter = SAFESplitter()\n self.tokenizer.pre_tokenizer = PreTokenizer.custom(self.splitter)\n self.tokenizer.post_processor = TemplateProcessing(\n single=TEMPLATE_SINGLE,\n pair=TEMPLATE_PAIR,\n special_tokens=TEMPLATE_SPECIAL_TOKENS,\n )\n self.tokenizer.decoder = decoders.BPEDecoder(**self.decoder_args)\n self.tokenizer = self.set_special_tokens(self.tokenizer)\n\n @property\n def bos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.bos_token)\n\n @property\n def pad_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.pad_token)\n\n @property\n def eos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.eos_token)\n\n @classmethod\n def set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n ):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n\n def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n\n def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n\n def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if 
use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n\n def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n\n def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n\n def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n\n def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n\n def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n\n def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n\n @classmethod\n def from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n\n @classmethod\n def load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has 
a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n\n def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n ) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n\n def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n\n def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n ) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. 
Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n\n @classmethod\n def from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n ):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived 
class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
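Putting the pieces together, a minimal training sketch; the two molecules and the file name are placeholders:
``` py
from safe.converter import encode
from safe.tokenizer import SAFETokenizer

# Tiny illustrative corpus of SAFE strings
corpus = [encode(s) for s in ["CC(=O)Oc1ccccc1C(=O)O", "CC(=O)Nc1ccc(O)cc1"]]

tokenizer = SAFETokenizer(tokenizer_type="bpe", splitter="safe")
tokenizer.train_from_iterator(corpus)
tokenizer.save("tokenizer.json")

# Reload later and expose it as a HuggingFace fast tokenizer
tokenizer = SAFETokenizer.load("tokenizer.json")
hf_tokenizer = tokenizer.get_pretrained()
```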
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.bos_token_id","title":"bos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.eos_token_id","title":"eos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.pad_token_id","title":"pad_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__getstate__","title":"__getstate__()
","text":"Getting state to allow pickling
Source code in safe/tokenizer.py
def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__len__","title":"__len__()
","text":"Gets the count of tokens in vocab along with special tokens.
Source code in safe/tokenizer.py
def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__setstate__","title":"__setstate__(d)
","text":"Setting state during reloading pickling
Source code in safe/tokenizer.py
def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.decode","title":"decode(ids, skip_special_tokens=True, ignore_stops=False, stop_token_ids=None)
","text":"Decodes a list of ids to molecular representation in the format in which this tokenizer was created.
Parameters:
Name Type Description Default
ids
list
list of IDs
required
skip_special_tokens
bool
whether to skip all special tokens when encountering them
True
ignore_stops
bool
whether to ignore the stop tokens, thus decoding till the end
False
stop_token_ids
Optional[List[int]]
optional list of stop token ids to use
None
Returns:
Name Type Description
sequence
str
str representation of molecule
Source code in safe/tokenizer.py
def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.encode","title":"encode(sample_str, ids_only=True, **kwargs)
","text":"Encodes a given molecule string once training is done
Parameters:
Name Type Description Default
sample_str
str
Sample string to encode molecule
required
ids_only
bool
whether to return only the ids or the encoding object
True
Returns:
Name Type Description
object
list
Returns encoded list of IDs
Source code in safe/tokenizer.py
def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n
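A short usage sketch (the inputs are illustrative; any string accepted by the tokenizer would work):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nids = tokenizer.encode(\"c1ccccc1\")  # list of token ids\nenc = tokenizer.encode(\"c1ccccc1\", ids_only=False)  # full Encoding object instead of ids\nbatch = tokenizer.encode([\"c1ccccc1\", \"C1CCCCC1\"])  # a list input returns a list of id lists\n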
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_dict","title":"from_dict(data)
classmethod
","text":"Load tokenizer from dict
Parameters:
Name Type Description Default
data
dict
dictionary containing the tokenizer info
required
Source code in safe/tokenizer.py
@classmethod\ndef from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_pretrained","title":"from_pretrained(pretrained_model_name_or_path, cache_dir=None, force_download=False, local_files_only=False, token=None, return_fast_tokenizer=False, proxies=None, **kwargs)
classmethod
","text":"Instantiate a [~tokenization_utils_base.PreTrainedTokenizerBase
] (or a derived class) from a predefined tokenizer.
Parameters:
Name Type Description Default
pretrained_model_name_or_path
Union[str, PathLike]
Can be either:
- A string, the model id of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like bert-base-uncased, or namespaced under a user or organization name, like dbmdz/bert-base-german-cased.
- A path to a directory containing vocabulary files required by the tokenizer, for instance saved using the [~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained] method, e.g., ./my_model_directory/.
- (Deprecated, not applicable to all derived classes) A path or url to a single saved vocabulary file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g., ./my_model_directory/vocab.txt.
required
cache_dir
Optional[Union[str, PathLike]]
Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
None
force_download
bool
Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they exist.
False
proxies
Optional[Dict[str, str]]
A dictionary of proxy servers to use by protocol or endpoint, e.g., {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}
. The proxies are used on each request.
None
token
Optional[Union[str, bool]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
).
None
local_files_only
bool
Whether or not to only rely on local files and not to attempt to download any files.
False
return_fast_tokenizer
Optional[bool]
Whether to return fast tokenizer or not.
False
Examples:
# We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n
Source code in safe/tokenizer.py
@classmethod\ndef from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.get_pretrained","title":"get_pretrained(**kwargs)
","text":"Get a pretrained tokenizer from this tokenizer
Returns:
Type Description
PreTrainedTokenizerFast
Returns a pre-trained fast tokenizer for Hugging Face models.
Source code in safe/tokenizer.py
def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n
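For example, wrapping the SAFE tokenizer for use with transformers models could look like this sketch (checkpoint name as in the example above):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nfast_tk = tokenizer.get_pretrained()  # PreTrainedTokenizerFast with the special tokens set\nbatch = fast_tk([\"c1ccccc1\"], padding=True, return_tensors=\"pt\")  # ready for a transformers model\n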
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.load","title":"load(file_name)
classmethod
","text":"Load the current tokenizer from file
Source code in safe/tokenizer.py
@classmethod\ndef load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.push_to_hub","title":"push_to_hub(repo_id, use_temp_dir=None, commit_message=None, private=None, token=None, max_shard_size='10GB', create_pr=False, safe_serialization=False, **deprecated_kwargs)
","text":"Upload the tokenizer to the \ud83e\udd17 Model Hub.
Parameters:
Name Type Description Default
repo_id
str
The name of the repository you want to push your {object} to. It should contain your organization name when pushing to a given organization.
required
use_temp_dir
Optional[bool]
Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub. Will default to True
if there is no directory named like repo_id
, False
otherwise.
None
commit_message
Optional[str]
Message to commit while pushing. Will default to \"Upload {object}\"
.
None
private
Optional[bool]
Whether or not the repository created should be private.
None
token
Optional[Union[bool, str]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
). Will default to True
if repo_url
is not specified.
None
max_shard_size
Optional[Union[int, str]]
Only applicable for models. The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be smaller than this size. If expressed as a string, it needs to be digits followed by a unit (like \"5MB\"
).
'10GB'
create_pr
bool
Whether or not to create a PR with the uploaded files or directly commit.
False
safe_serialization
bool
Whether or not to convert the model weights to the safetensors format for safer serialization.
False
Source code in safe/tokenizer.py
def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. 
Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save","title":"save(file_name=None)
","text":"Saves the :class:~tokenizers.Tokenizer
to the file at the given path.
Parameters:
Name Type Description Default
file_name
str
File where to save the tokenizer
None
Source code in safe/tokenizer.py
def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n
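A save/load round trip could look like the following sketch (the file path is illustrative):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\ntokenizer.save(\"safe_tokenizer.json\")\nreloaded = SAFETokenizer.load(\"safe_tokenizer.json\")\nassert len(reloaded) == len(tokenizer)  # the vocabulary size is preserved\n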
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save_pretrained","title":"save_pretrained(*args, **kwargs)
","text":"Save pretrained tokenizer
Source code in safe/tokenizer.py
def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.set_special_tokens","title":"set_special_tokens(tokenizer, bos_token=CLS_TOKEN, eos_token=SEP_TOKEN)
classmethod
","text":"Set special tokens for a tokenizer
Parameters:
Name Type Description Default
tokenizer
Tokenizer
tokenizer for which special tokens will be set
required
bos_token
str
Optional bos token to use
CLS_TOKEN
eos_token
str
Optional eos token to use
SEP_TOKEN
Source code in safe/tokenizer.py
@classmethod\ndef set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.to_dict","title":"to_dict(**kwargs)
","text":"Convert tokenizer to dict
Source code insafe/tokenizer.py
def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train","title":"train(files, **kwargs)
","text":"This is to train a new tokenizer from either a list of file or some input data
Args files (str): file in which your molecules are separated by new line kwargs (dict): optional args for the tokenizer train
safe/tokenizer.py
def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train_from_iterator","title":"train_from_iterator(data, **kwargs)
","text":"Train the Tokenizer using the provided iterator.
You can provide anything that is a Python Iterator:
* A list of sequences :obj:List[str]
* A generator that yields :obj:str or :obj:List[str]
* A Numpy array of strings
Parameters:
Name Type Description Default
data
Iterator
data iterator
required
**kwargs
Any
additional keyword arguments for the tokenizer train_from_iterator
{}
Source code in safe/tokenizer.py
def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n
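As a sketch, training from an in-memory list of strings might look as follows (the constructor argument shown here is an assumption, not part of the documented signature):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer(tokenizer_type=\"bpe\")  # assumed constructor argument\ntokenizer.train_from_iterator([\"c1ccccc1\", \"CC(=O)O\"])  # any iterator of strings works\nprint(len(tokenizer))  # vocabulary size after training\n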
"},{"location":"api/safe.html#utils","title":"Utils","text":""},{"location":"api/safe.html#safe.utils.MolSlicer","title":"MolSlicer
","text":"Slice a molecule into head-linker-tail
Source code in safe/utils.py
class MolSlicer:\n \"\"\"Slice a molecule into head-linker-tail\"\"\"\n\n BOND_SPLITTERS = [\n # two atoms connected by a non ring single bond, one of each is not in a ring and at least two heavy neighbor\n \"[R:1]-&!@[!R;!D1:2]\",\n # two atoms in different rings linked by a non-ring single bond\n \"[R:1]-&!@[R:2]\",\n ]\n _BOND_BUFFER = 1 # buffer around substructure match size.\n MAX_CUTS = 2 # maximum number of cuts. Here we need two cuts for head-linker-tail.\n\n _MERGING_RXN = dm.reactions.rxn_from_smarts(\n \"[#0][*:1].[#0][*:4].([#0][*:2].[#0][*:3])>>([*:1][*:2].[*:3][*:4])\"\n )\n\n def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n ):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n\n def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n\n def _bond_selection_from_max_cuts(self, bond_list: List[int], dist_mat: np.ndarray):\n \"\"\"Select bonds based on maximum number of cuts allowed\"\"\"\n # for now we are just implementing to 2 max cuts algorithms\n if self.MAX_CUTS != 2:\n raise ValueError(f\"Only MAX_CUTS=2 is supported, got {self.MAX_CUTS}\")\n\n bond_pdist = np.full((len(bond_list), len(bond_list)), -1)\n for i in range(len(bond_list)):\n for j in range(i, len(bond_list)):\n # we get the minimum topological distance between bond to cut\n bond_pdist[i, j] = bond_pdist[j, i] = min(\n [dist_mat[a1, a2] for a1, a2 in itertools.product(bond_list[i], bond_list[j])]\n )\n\n masked_bond_pdist = np.ma.masked_less_equal(bond_pdist, self.min_linker_size)\n\n if self.shortest_linker:\n return np.unravel_index(np.ma.argmin(masked_bond_pdist), bond_pdist.shape)\n return np.unravel_index(np.ma.argmax(masked_bond_pdist), bond_pdist.shape)\n\n def _get_bonds_to_cut(self, mol: dm.Mol):\n \"\"\"Get possible bond to cuts\n\n Args:\n mol: input molecule\n \"\"\"\n # use this if you want to enumerate yourself the possible cuts\n\n ring_systems = self.get_ring_system(mol)\n candidate_bonds = []\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n\n for query in self.bond_splitters:\n bonds = mol.GetSubstructMatches(query, uniquify=True)\n cur_unique_bonds = [set(cbond) for cbond in candidate_bonds]\n # do not accept bonds part of the same ring system or already known\n for b in bonds:\n bond_id = mol.GetBondBetweenAtoms(*b).GetIdx()\n bond_cut = Chem.GetMolFrags(\n Chem.FragmentOnBonds(mol, [bond_id], addDummies=False), 
asMols=True\n )\n can_add = not self.require_ring_system or all(\n len(frag.GetAtomsMatchingQuery(ring_query)) > 0 for frag in bond_cut\n )\n if can_add and not (\n set(b) in cur_unique_bonds or any(x.issuperset(set(b)) for x in ring_systems)\n ):\n candidate_bonds.append(b)\n return candidate_bonds\n\n def _fragment_mol(self, mol: dm.Mol, bonds: List[dm.Bond]):\n \"\"\"Fragment molecules on bonds and return head, linker, tail combination\n\n Args:\n mol: input molecule\n bonds: list of bonds to cut\n \"\"\"\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in bonds])\n _frags = list(Chem.GetMolFrags(tmp, asMols=True))\n # linker is the one with 2 dummy atoms\n linker_pos = 0\n for pos, _frag in enumerate(_frags):\n if sum([at.GetSymbol() == \"*\" for at in _frag.GetAtoms()]) == 2:\n linker_pos = pos\n break\n linker = _frags.pop(linker_pos)\n head, tail = _frags\n return (head, linker, tail)\n\n def _compute_linker_score(self, linker: dm.Mol):\n \"\"\"Compute the score of a linker to help select between linkers\"\"\"\n\n # we need to take into account\n # case where we require the linker to have a ring system\n # case where we want the linker to be longuest or shortest\n\n # find shortest path\n attach1, attach2, *_ = [at.GetIdx() for at in linker.GetAtoms() if at.GetSymbol() == \"*\"]\n score = len(Chem.rdmolops.GetShortestPath(linker, attach1, attach2))\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n linker_ring_count = len(linker.GetAtomsMatchingQuery(ring_query))\n if self.require_ring_system:\n score *= int(linker_ring_count > 0)\n if score == 0:\n return float(\"inf\")\n if not self.shortest_linker:\n score = 1 / score\n return score\n\n def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. 
It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n\n @classmethod\n def link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n ):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
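A minimal usage sketch (the SMILES is illustrative; linker and tail can be None when fewer than two cuts are found):
from safe.utils import MolSlicer\n\nslicer = MolSlicer(require_ring_system=True)\nhead, linker, tail = slicer(\"CC(=O)Nc1ccc(OCCN2CCOCC2)cc1\")  # returns the (head, linker, tail) fragments\n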
"},{"location":"api/safe.html#safe.utils.MolSlicer.__call__","title":"__call__(mol, expected_head=None)
","text":"Perform slicing of the input molecule
Parameters:
Name Type Description Default
mol
Union[Mol, str]
input molecule
required
expected_head
Union[Mol, str]
substructure that should be part of the head. The small fragment containing this substructure would be kept as head
None
Source code in safe/utils.py
def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__init__","title":"__init__(shortest_linker=False, min_linker_size=0, require_ring_system=True, verbose=False)
","text":"Constructor of bond slicer.
Parameters:
Name Type Description Default
shortest_linker
bool
whether to consider the longest or shortest linker. Does not have any effect when an expected_head group is provided during splitting
False
min_linker_size
int
minimum linker size
0
require_ring_system
bool
whether all fragments need to have a ring system
True
verbose
bool
whether to allow verbosity in logging
False
Source code in safe/utils.py
def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.get_ring_system","title":"get_ring_system(mol)
","text":"Get the list of ring system from a molecule
Parameters:
Name Type Description Default
mol
Mol
input molecule for which we are computing the ring system
required
Source code in safe/utils.py
def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.link_fragments","title":"link_fragments(linker, head, tail)
classmethod
","text":"Link fragments together using the provided linker
Parameters:
Name Type Description Default
linker
Union[Mol, str]
linker to use
required
head
Union[Mol, str]
head fragment
required
tail
Union[Mol, str]
tail fragment
required
Source code in safe/utils.py
@classmethod\ndef link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.attr_as","title":"attr_as(obj, field, value)
","text":"Temporary replace the value of an object
Parameters:
Name Type Description Default
obj
Any
object to temporarily patch
required
field
str
name of the key to change
required
value
Any
value the key is temporarily changed to
required
Source code in safe/utils.py
@contextmanager\ndef attr_as(obj: Any, field: str, value: Any):\n \"\"\"Temporary replace the value of an object\n\n Args:\n obj: object to temporary patch\n field: name of the key to change\n value: value of key to be temporary changed\n \"\"\"\n old_value = getattr(obj, field, None)\n setattr(obj, field, value)\n yield\n with suppress(TypeError):\n setattr(obj, field, old_value)\n
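A minimal sketch of how this context manager behaves (the patched object is just a stand-in):
from types import SimpleNamespace\nfrom safe.utils import attr_as\n\nobj = SimpleNamespace(value=1)\nwith attr_as(obj, \"value\", 2):\n    assert obj.value == 2  # patched inside the context\nassert obj.value == 1  # restored on exit\n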
"},{"location":"api/safe.html#safe.utils.compute_side_chains","title":"compute_side_chains(mol, core, label_by_index=False)
","text":"Compute the side chain of a molecule given a core
Finding the side chains
The algorithm to find the side chains from core assumes that the core we get as input has attachment points. Those attachment points are never considered as part of the query, rather they are used to define the attachment points on the side chains. Removing the attachment points from the core is exactly the same as keeping them.
mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\ncore0 = \"CC1(C)CN2C(CC2=O)S1\"\ncore1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\ncore2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\nside_chain = compute_side_chain(core=core0, mol=mol)\ndm.to_image([side_chain, core0, mol])\n
Therefore, in the example above, core0 and core1 are equivalent for the molecule mol, but core2 is not.
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
core
Mol
core to use for deriving the side chains
required
Source code in safe/utils.py
def compute_side_chains(mol: dm.Mol, core: dm.Mol, label_by_index: bool = False):\n \"\"\"Compute the side chain of a molecule given a core\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points\n on the side chains. Removing the attachment points from the core is exactly the same as keeping them.\n\n ```python\n mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\n core0 = \"CC1(C)CN2C(CC2=O)S1\"\n core1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\n core2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\n side_chain = compute_side_chain(core=core0, mol=mol)\n dm.to_image([side_chain, core0, mol])\n ```\n Therefore on the above, core0 and core1 are equivalent for the molecule `mol`, but core2 is not.\n\n Args:\n mol: molecule to split\n core: core to use for deriving the side chains\n \"\"\"\n\n if isinstance(mol, str):\n mol = dm.to_mol(mol)\n if isinstance(core, str):\n core = dm.to_mol(core)\n core_query_param = AdjustQueryParameters()\n core_query_param.makeDummiesQueries = True\n core_query_param.adjustDegree = False\n core_query_param.aromatizeIfPossible = True\n core_query_param.makeBondsGeneric = False\n core_query = AdjustQueryProperties(core, core_query_param)\n return ReplaceCore(\n mol, core_query, labelByIndex=label_by_index, replaceDummies=False, requireDummyMatch=False\n )\n
"},{"location":"api/safe.html#safe.utils.convert_to_safe","title":"convert_to_safe(mol, canonical=False, randomize=False, seed=1, slicer='brics', split_fragment=True, fraction_hs=None, resolution=0.5)
","text":"Convert a molecule to a safe representation
Parameters:
Name Type Description Default
mol
Mol
molecule to convert
required
canonical
bool
whether to use canonical encoding
False
randomize
bool
whether to randomize the encoding
False
seed
Optional[int]
random seed
1
slicer
str
the slicer to use for fragmentation
'brics'
split_fragment
bool
whether to split fragments
True
fraction_hs
bool
proportion of random atoms to which we will add explicit hydrogens
None
resolution
Optional[float]
resolution for the partitioning algorithm
0.5
Source code in safe/utils.py
def convert_to_safe(\n mol: dm.Mol,\n canonical: bool = False,\n randomize: bool = False,\n seed: Optional[int] = 1,\n slicer: str = \"brics\",\n split_fragment: bool = True,\n fraction_hs: bool = None,\n resolution: Optional[float] = 0.5,\n):\n \"\"\"Convert a molecule to a safe representation\n\n Args:\n mol: molecule to convert\n canonical: whether to use canonical encoding\n randomize: whether to randomize the encoding\n seed: random seed\n slicer: the slicer to use for fragmentation\n split_fragment: whether to split fragments\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n resolution: resolution for the partitioning algorithm\n seed: random seed\n \"\"\"\n x = None\n try:\n x = sf.encode(mol, canonical=canonical, randomize=randomize, slicer=slicer, seed=seed)\n except sf.SAFEFragmentationError:\n if split_fragment:\n if \".\" in mol:\n return None\n try:\n x = sf.encode(\n mol,\n canonical=False,\n randomize=randomize,\n seed=seed,\n slicer=partial(\n fragment_aware_spliting,\n fraction_hs=fraction_hs,\n resolution=resolution,\n seed=seed,\n ),\n )\n except (sf.SAFEEncodeError, sf.SAFEFragmentationError):\n # logger.exception(e)\n return x\n # we need to resplit using attachment point but here we are only adding\n except sf.SAFEEncodeError:\n return x\n return x\n
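A hedged usage sketch, assuming a SMILES string input (which the underlying encoder accepts); the molecule is illustrative and the function returns None when both the requested slicer and the fallback fragment-aware splitting fail:
from safe.utils import convert_to_safe\n\nsafe_str = convert_to_safe(\"CC(=O)Nc1ccc(O)cc1\", slicer=\"brics\")\nprint(safe_str)  # SAFE string, or None if the encoding failed\n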
"},{"location":"api/safe.html#safe.utils.filter_by_substructure_constraints","title":"filter_by_substructure_constraints(sequences, substruct, n_jobs=-1)
","text":"Check whether the input substructures are present in each of the molecule in the sequences
Parameters:
Name Type Description Default
sequences
List[Union[str, Mol]]
list of molecules to validate
required
substruct
Union[str, Mol]
substructure to use as query
required
n_jobs
int
number of jobs to use for parallelization
-1
Source code in safe/utils.py
def filter_by_substructure_constraints(\n sequences: List[Union[str, dm.Mol]], substruct: Union[str, dm.Mol], n_jobs: int = -1\n):\n \"\"\"Check whether the input substructures are present in each of the molecule in the sequences\n\n Args:\n sequences: list of molecules to validate\n substruct: substructure to use as query\n n_jobs: number of jobs to use for parallelization\n\n \"\"\"\n\n if isinstance(substruct, str):\n substruct = standardize_attach(substruct)\n substruct = dm.from_smarts(substruct)\n\n def _check_match(mol):\n with suppress(Exception):\n mol = dm.to_mol(mol)\n return mol.HasSubstructMatch(substruct)\n return False\n\n matches = dm.parallelized(_check_match, sequences, n_jobs=n_jobs)\n return list(compress(sequences, matches))\n
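A small sketch of typical use (the candidate molecules and the query are illustrative):
from safe.utils import filter_by_substructure_constraints\n\ncandidates = [\"CCO\", \"c1ccccc1O\", \"CCN\"]\nkept = filter_by_substructure_constraints(candidates, \"c1ccccc1\", n_jobs=1)  # keeps only molecules matching the query\n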
"},{"location":"api/safe.html#safe.utils.find_partition_edges","title":"find_partition_edges(G, partition)
","text":"Find the edges connecting the subgraphs in a given partition of a graph.
Parameters:
Name Type Description Default
G
Graph
The original graph.
required
partition
list of list of nodes
The partition of the graph where each element is a list of nodes representing a subgraph.
required
Returns:
Name Type Description
list
List[Tuple]
A list of edges connecting the subgraphs in the partition.
Source code in safe/utils.py
def find_partition_edges(G: nx.Graph, partition: List[List]) -> List[Tuple]:\n \"\"\"\n Find the edges connecting the subgraphs in a given partition of a graph.\n\n Args:\n G (networkx.Graph): The original graph.\n partition (list of list of nodes): The partition of the graph where each element is a list of nodes representing a subgraph.\n\n Returns:\n list: A list of edges connecting the subgraphs in the partition.\n \"\"\"\n partition_edges = []\n for subgraph1, subgraph2 in combinations(partition, 2):\n edges = nx.edge_boundary(G, subgraph1, subgraph2)\n partition_edges.extend(edges)\n return partition_edges\n
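For intuition, a tiny sketch on a path graph (illustrative only):
import networkx as nx\nfrom safe.utils import find_partition_edges\n\nG = nx.path_graph(4)  # edges: (0, 1), (1, 2), (2, 3)\nprint(find_partition_edges(G, [[0, 1], [2, 3]]))  # [(1, 2)] -- the only edge crossing the partition\n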
"},{"location":"api/safe.html#safe.utils.fragment_aware_spliting","title":"fragment_aware_spliting(mol, fraction_hs=None, **kwargs)
","text":"Custom splitting algorithm for dataset building.
This slicing strategy will cut any bond, including bonds with hydrogens. However, only one cut per atom is allowed.
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
fraction_hs
Optional[bool]
proportion of random atoms to which we will add explicit hydrogens
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
def fragment_aware_spliting(mol: dm.Mol, fraction_hs: Optional[bool] = None, **kwargs: Any):\n \"\"\"Custom splitting algorithm for dataset building.\n\n This slicing strategy will cut any bond including bonding with hydrogens\n However, only one cut per atom is allowed\n\n Args:\n mol: molecule to split\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n kwargs: additional arguments to pass to the partitioning algorithm\n \"\"\"\n random.seed(kwargs.get(\"seed\", 1))\n mol = dm.to_mol(mol, remove_hs=False)\n mol = _selective_add_hs(mol, fraction_hs=fraction_hs)\n graph = dm.graph.to_graph(mol)\n d = mol_partition(mol, **kwargs)\n q = deque(d)\n partition = q.pop()\n return find_partition_edges(graph, partition)\n
"},{"location":"api/safe.html#safe.utils.list_individual_attach_points","title":"list_individual_attach_points(mol, depth=None)
","text":"List all individual attachement points.
We do not allow multiple attachment points per substitution position.
Parameters:
Name Type Description Default
mol
Mol
molecule for which we need to open the attachment points
required
Source code in safe/utils.py
def list_individual_attach_points(mol: dm.Mol, depth: Optional[int] = None):\n \"\"\"List all individual attachement points.\n\n We do not allow multiple attachment points per substitution position.\n\n Args:\n mol: molecule for which we need to open the attachment points\n\n \"\"\"\n ATTACHING_RXN = ReactionFromSmarts(\"[*;h;!$([*][#0]):1]>>[*:1][*]\")\n mols = [mol]\n curated_prods = set()\n num_attachs = len(mol.GetSubstructMatches(dm.from_smarts(\"[*;h:1]\"), uniquify=True))\n depth = depth or 1\n depth = min(max(depth, 1), num_attachs)\n while depth > 0:\n prods = set()\n for mol in mols:\n mol = dm.to_mol(mol)\n for p in ATTACHING_RXN.RunReactants((mol,)):\n try:\n m = dm.sanitize_mol(p[0])\n sm = dm.to_smiles(m, canonical=True)\n sm = dm.reactions.add_brackets_to_attachment_points(sm)\n prods.add(dm.reactions.convert_attach_to_isotope(sm, as_smiles=True))\n except Exception as e:\n logger.error(e)\n curated_prods.update(prods)\n mols = prods\n depth -= 1\n return list(curated_prods)\n
"},{"location":"api/safe.html#safe.utils.mol_partition","title":"mol_partition(mol, query=None, seed=None, **kwargs)
","text":"Partition a molecule into fragments using a bond query
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
query
Optional[Mol]
bond query to use for splitting
None
seed
Optional[int]
random seed
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
@py_random_state(\"seed\")\ndef mol_partition(\n mol: dm.Mol, query: Optional[dm.Mol] = None, seed: Optional[int] = None, **kwargs: Any\n):\n \"\"\"Partition a molecule into fragments using a bond query\n\n Args:\n mol: molecule to split\n query: bond query to use for splitting\n seed: random seed\n kwargs: additional arguments to pass to the partitioning algorithm\n\n \"\"\"\n resolution = kwargs.get(\"resolution\", 1.0)\n threshold = kwargs.get(\"threshold\", 1e-7)\n weight = kwargs.get(\"weight\", \"weight\")\n\n if query is None:\n query = __mmpa_query\n\n G = dm.graph.to_graph(mol)\n bond_partition = [\n tuple(sorted(match)) for match in mol.GetSubstructMatches(query, uniquify=True)\n ]\n\n def get_relevant_edges(e1, e2):\n return tuple(sorted([e1, e2])) not in bond_partition\n\n subgraphs = nx.subgraph_view(G, filter_edge=get_relevant_edges)\n\n partition = [{u} for u in G.nodes()]\n inner_partition = sorted(nx.connected_components(subgraphs), key=lambda x: min(x))\n mod = nx.algorithms.community.modularity(\n G, inner_partition, resolution=resolution, weight=weight\n )\n is_directed = G.is_directed()\n graph = G.__class__()\n graph.add_nodes_from(G)\n graph.add_weighted_edges_from(G.edges(data=weight, default=1))\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n m = graph.size(weight=\"weight\")\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, inner_partition, resolution, is_directed, seed\n )\n improvement = True\n while improvement:\n # gh-5901 protect the sets in the yielded list from further manipulation here\n yield [s.copy() for s in partition]\n new_mod = nx.algorithms.community.modularity(\n graph, inner_partition, resolution=resolution, weight=\"weight\"\n )\n if new_mod - mod <= threshold:\n return\n mod = new_mod\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, partition, resolution, is_directed, seed\n )\n
"},{"location":"api/safe.html#safe.utils.standardize_attach","title":"standardize_attach(inputs, standard_attach='[*]')
","text":"Standardize the attachment points of a molecule
Parameters:
Name Type Description Default
inputs
str
input molecule
required
standard_attach
str
standard attachment point to use
'[*]'
Source code in safe/utils.py
def standardize_attach(inputs: str, standard_attach: str = \"[*]\"):\n \"\"\"Standardize the attachment points of a molecule\n\n Args:\n inputs: input molecule\n standard_attach: standard attachment point to use\n \"\"\"\n\n for attach_regex in _SMILES_ATTACHMENT_POINTS:\n inputs = re.sub(attach_regex, standard_attach, inputs)\n return inputs\n
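A hedged usage sketch; the exact notations recognized depend on the _SMILES_ATTACHMENT_POINTS patterns, so the input below is an assumption:
from safe.utils import standardize_attach\n\nprint(standardize_attach(\"[1*]CCO\"))  # expected to print \"[*]CCO\" if isotope-labeled dummies are matched\n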
"},{"location":"api/safe.models.html","title":"Model training","text":""},{"location":"api/safe.models.html#config-file","title":"Config File","text":"The input config file for training a SAFE
model is very similar to the GPT2 config file, with the addition of an optional num_labels
attribute for training with descriptors regularization.
{\n \"activation_function\": \"gelu_new\",\n \"attn_pdrop\": 0.1,\n \"bos_token_id\": 10000,\n \"embd_pdrop\": 0.1,\n \"eos_token_id\": 1,\n \"initializer_range\": 0.02,\n \"layer_norm_epsilon\": 1e-05,\n \"model_type\": \"gpt2\",\n \"n_embd\": 768,\n \"n_head\": 12,\n \"n_inner\": null,\n \"n_layer\": 12,\n \"n_positions\": 1024,\n \"reorder_and_upcast_attn\": false,\n \"resid_pdrop\": 0.1,\n \"scale_attn_by_inverse_layer_idx\": false,\n \"scale_attn_weights\": true,\n \"summary_activation\": \"tanh\",\n \"summary_first_dropout\": 0.1,\n \"summary_proj_to_labels\": true,\n \"summary_type\": \"cls_index\",\n \"summary_hidden_size\": 128,\n \"summary_use_proj\": true,\n \"transformers_version\": \"4.31.0\",\n \"use_cache\": true,\n \"vocab_size\": 10000,\n \"num_labels\": 9\n}\n
"},{"location":"api/safe.models.html#safe-model","title":"SAFE Model","text":""},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead","title":"PropertyHead
","text":" Bases: Module
Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Default
config
[`PretrainedConfig`]
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual config class of your model for the default values it uses):
summary_type (str) -- The method to use to make this summary. Accepted values are:
- \"last\" -- Take the last token hidden state (like XLNet)
- \"first\" -- Take the first token hidden state (like Bert)
- \"mean\" -- Take the mean of all tokens hidden states
- \"cls_index\" -- Supply a Tensor of classification token position (GPT/GPT-2)
summary_activation (Optional[str]) -- Set to \"tanh\" to add a tanh activation to the output, another string, or None to add no activation.
Source code in safe/trainer/model.py
class PropertyHead(torch.nn.Module):\n r\"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n config ([`PretrainedConfig`]):\n The config used by the model. Relevant arguments in the config class of the model are (refer to the actual\n config class of your model for the default values it uses):\n\n - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:\n\n - `\"last\"` -- Take the last token hidden state (like XLNet)\n - `\"first\"` -- Take the first token hidden state (like Bert)\n - `\"mean\"` -- Take the mean of all tokens hidden states\n - `\"cls_index\"` -- Supply a Tensor of classification token position (GPT/GPT-2)\n\n - **summary_activation** (`Optional[str]`) -- Set to `\"tanh\"` to add a tanh activation to the output,\n another string, or `None` to add no activation.\n \"\"\"\n\n def __init__(self, config: PretrainedConfig):\n super().__init__()\n\n self.summary_type = getattr(config, \"summary_type\", \"cls_index\")\n self.summary = torch.nn.Identity()\n last_hidden_size = config.hidden_size\n\n if getattr(config, \"summary_hidden_size\", None) and config.summary_hidden_size > 0:\n self.summary = nn.Linear(config.hidden_size, config.summary_hidden_size)\n last_hidden_size = config.summary_hidden_size\n\n activation_string = getattr(config, \"summary_activation\", None)\n self.activation: Callable = (\n get_activation(activation_string) if activation_string else nn.Identity()\n )\n\n self.out = torch.nn.Identity()\n if getattr(config, \"num_labels\", None) and config.num_labels > 0:\n num_labels = config.num_labels\n self.out = nn.Linear(last_hidden_size, num_labels)\n\n def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n ) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead.forward","title":"forward(hidden_states, cls_index=None)
","text":"Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Default
hidden_states
FloatTensor
torch.FloatTensor
of shape [batch_size, seq_len, hidden_size]
) The hidden states of the last layer.
cls_index
Optional[LongTensor]
torch.LongTensor
of shape [batch_size]
or [batch_size, ...]
where ... are optional leading dimensions of hidden_states
, optional Used if summary_type == \"cls_index\"
and takes the last token of the sequence as classification token.
None
Returns:
Type DescriptionFloatTensor
torch.FloatTensor
: The summary of the sequence hidden states.
safe/trainer/model.py
def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel","title":"SAFEDoubleHeadsModel
","text":" Bases: GPT2DoubleHeadsModel
The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head
Source code in safe/trainer/model.py
class SAFEDoubleHeadsModel(GPT2DoubleHeadsModel):\n \"\"\"The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head\"\"\"\n\n def __init__(self, config):\n self.num_labels = getattr(config, \"num_labels\", None)\n super().__init__(config)\n self.config.num_labels = self.num_labels\n del self.multiple_choice_head\n self.multiple_choice_head = PropertyHead(config)\n\n @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\n def forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
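As a hedged usage sketch (not an official snippet), the model can be loaded from the public datamol-io/safe-gpt checkpoint used later in the tutorials and run on a tokenized SAFE string; passing labels=input_ids yields the causal language-modeling loss.

```python
# Sketch under the assumption that the "datamol-io/safe-gpt" checkpoint
# (used later in this documentation) is available locally or on the hub.
import torch
from safe.trainer.model import SAFEDoubleHeadsModel
from safe.tokenizer import SAFETokenizer

model = SAFEDoubleHeadsModel.from_pretrained("datamol-io/safe-gpt")
tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt").get_pretrained()

safe_str = "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"  # SAFE string from the tutorial below
batch = tokenizer([safe_str], return_tensors="pt")

with torch.no_grad():
    out = model(**batch, labels=batch["input_ids"])

print(out.loss)          # language-modeling loss
print(out.logits.shape)  # (batch_size, seq_len, vocab_size)
```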
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel.forward","title":"forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, labels=None, mc_labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, inputs=None, encoder_hidden_states=None, **kwargs)
","text":"Parameters:
Name | Type | Description | Default
mc_token_ids | `torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input | Index of the classification token in each input sequence. Selected in the range [0, input_ids.size(-1) - 1]. | None
labels | `torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional* | Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size - 1]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size - 1]. | None
mc_labels | `torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional* | Labels for computing the supervised loss for regularization. | None
inputs | Optional[Any] | List of inputs, kept here because the trainer removes information not in the signature. | None
Returns: output (GPT2DoubleHeadsModelOutput): output of the model
Source code in safe/trainer/model.py
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\ndef forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 
1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
"},{"location":"api/safe.models.html#trainer","title":"Trainer","text":""},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer","title":"SAFETrainer
","text":" Bases: Trainer
Custom trainer for training the SAFE model.
This custom trainer changes the loss function to support the property head.
Source code in safe/trainer/trainer_utils.py
class SAFETrainer(Trainer):\n \"\"\"\n Custom trainer for training SAFE model.\n\n This custom trainer changes the loss function to support the property head\n\n \"\"\"\n\n def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):\n super().__init__(*args, **kwargs)\n self.prop_loss_coeff = prop_loss_coeff\n\n def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
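The only behavioural change is the weighted sum loss = lm_loss + prop_loss_coeff * mc_loss. The sketch below shows how the trainer might be wired up; model, train_dataset and collator are placeholders standing in for a SAFEDoubleHeadsModel, a tokenized dataset and a SAFECollator, and the TrainingArguments values are arbitrary.

```python
# Hedged sketch: `model`, `train_dataset` and `collator` are placeholders,
# not objects defined in this documentation.
from transformers import TrainingArguments
from safe.trainer.trainer_utils import SAFETrainer

args = TrainingArguments(output_dir="safe-run", per_device_train_batch_size=32)

trainer = SAFETrainer(
    model=model,                  # placeholder: a SAFEDoubleHeadsModel
    args=args,
    train_dataset=train_dataset,  # placeholder: a tokenized dataset (see get_dataset below)
    data_collator=collator,       # placeholder: e.g. a SAFECollator
    prop_loss_coeff=1e-3,         # total loss = lm_loss + 1e-3 * mc_loss
)
# trainer.train()
```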
"},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer.compute_loss","title":"compute_loss(model, inputs, return_outputs=False)
","text":"How the loss is computed by Trainer. By default, all models return the loss in the first element.
Source code in safe/trainer/trainer_utils.py
def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
"},{"location":"api/safe.models.html#data-collator","title":"Data Collator","text":""},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator","title":"SAFECollator
","text":"Collate function for language modelling tasks
Note
The collate function is based on the default DataCollatorForLanguageModeling in HuggingFace; see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py
Source code in safe/trainer/collator.py
class SAFECollator:\n \"\"\"Collate function for language modelling tasks\n\n\n !!! note\n The collate function is based on the default DataCollatorForLanguageModeling in huggingface\n see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py\n \"\"\"\n\n def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n ):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n\n @functools.lru_cache()\n def get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n\n def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
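A hedged sketch of the collator in isolation: it tokenizes and pads the raw SAFE strings and builds labels with padding positions masked to -100. The two SAFE strings are borrowed from elsewhere in this documentation, and the checkpoint name is the public one used in the tutorials.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.collator import SAFECollator

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
collator = SAFECollator(tokenizer, input_key="inputs")

samples = [
    {"inputs": "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"},
    {"inputs": "c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1"},
]
batch = collator(samples)
print(batch["input_ids"].shape, batch["labels"].shape)  # both padded to the same length
```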
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__call__","title":"__call__(samples)
","text":"Call collate function
Parameters:
Name | Type | Description | Default
samples | List[Union[List[int], Any, Dict[str, Any]]] | list of examples | required
Source code in safe/trainer/collator.py
def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__init__","title":"__init__(tokenizer, pad_to_multiple_of=None, input_key='inputs', label_key='labels', property_key='descriptors', include_descriptors=False, max_length=None)
","text":"Default collator for huggingface transformers in izanagi.
Parameters:
Name | Type | Description | Default
tokenizer | Tokenizer | Huggingface tokenizer | required
input_key | str | key to use for input ids | 'inputs'
label_key | str | key to use for labels | 'labels'
property_key | str | key to use for properties | 'descriptors'
include_descriptors | bool | whether to include training on descriptors or not | False
pad_to_multiple_of | Optional[int] | pad to multiple of this value | None
Source code in safe/trainer/collator.py
def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.get_tokenizer","title":"get_tokenizer()
cached
","text":"Get underlying tokenizer
Source code in safe/trainer/collator.py
@functools.lru_cache()\ndef get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n
"},{"location":"api/safe.models.html#data-utils","title":"Data Utils","text":""},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset","title":"get_dataset(data_path, name=None, tokenizer=None, cache_dir=None, streaming=True, use_auth_token=False, tokenize_column='inputs', property_column='descriptors', max_length=None, num_shards=1024)
","text":"Get the datasets from the config file
Source code in safe/trainer/data_utils.py
def get_dataset(\n data_path,\n name: Optional[str] = None,\n tokenizer: Optional[Callable] = None,\n cache_dir: Optional[str] = None,\n streaming: bool = True,\n use_auth_token: bool = False,\n tokenize_column: Optional[str] = \"inputs\",\n property_column: Optional[str] = \"descriptors\",\n max_length: Optional[int] = None,\n num_shards=1024,\n):\n \"\"\"Get the datasets from the config file\"\"\"\n raw_datasets = {}\n if data_path is not None:\n data_path = upath.UPath(str(data_path))\n\n if data_path.exists():\n # then we need to load from disk\n data_path = str(data_path)\n # for some reason, the datasets package is not able to load the dataset\n # because the split where not originally proposed\n raw_datasets = datasets.load_from_disk(data_path)\n\n if streaming:\n if isinstance(raw_datasets, datasets.DatasetDict):\n previous_num_examples = {k: len(dt) for k, dt in raw_datasets.items()}\n raw_datasets = datasets.IterableDatasetDict(\n {\n k: dt.to_iterable_dataset(num_shards=num_shards)\n for k, dt in raw_datasets.items()\n }\n )\n for k, dt in raw_datasets.items():\n if previous_num_examples[k] is not None:\n setattr(dt, \"num_examples\", previous_num_examples[k])\n else:\n num_examples = len(raw_datasets)\n raw_datasets = raw_datasets.to_iterable_dataset(num_shards=num_shards)\n setattr(raw_datasets, \"num_examples\", num_examples)\n\n else:\n data_path = str(data_path)\n raw_datasets = datasets.load_dataset(\n data_path,\n name=name,\n cache_dir=cache_dir,\n use_auth_token=True if use_auth_token else None,\n streaming=streaming,\n )\n # that means we need to return a tokenized version of the dataset\n\n if property_column not in [\"mc_labels\", None]:\n raw_datasets = raw_datasets.rename_column(property_column, \"mc_labels\")\n\n columns_to_remove = None\n if tokenize_column is not None:\n columns_to_remove = [\n x\n for x in (get_dataset_column_names(raw_datasets) or [])\n if x not in [tokenize_column, \"mc_labels\"] and \"label\" not in x\n ] or None\n\n if tokenizer is None:\n if columns_to_remove is not None:\n raw_datasets = raw_datasets.remove_columns(columns_to_remove)\n return raw_datasets\n\n return raw_datasets.map(\n partial(\n tokenize_fn,\n tokenizer=tokenizer,\n tokenize_column=tokenize_column,\n max_length=max_length,\n ),\n batched=True,\n remove_columns=columns_to_remove,\n )\n
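A hedged call sketch: "path/to/dataset" is a placeholder for either a dataset saved with datasets.save_to_disk or a hub dataset name, and the column names simply restate the defaults.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import get_dataset

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")

dataset = get_dataset(
    "path/to/dataset",              # placeholder path or hub dataset name
    tokenizer=tokenizer,
    streaming=True,
    tokenize_column="inputs",       # column holding the SAFE strings
    property_column="descriptors",  # renamed to "mc_labels" internally
    max_length=512,
)
```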
"},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset_column_names","title":"get_dataset_column_names(dataset)
","text":"Get the column names in a dataset
Parameters:
Name | Type | Description | Default
dataset | Union[Dataset, IterableDataset, Mapping] | dataset to get the column names from | required
Source code in safe/trainer/data_utils.py
def get_dataset_column_names(dataset: Union[datasets.Dataset, datasets.IterableDataset, Mapping]):\n \"\"\"Get the column names in a dataset\n\n Args:\n dataset: dataset to get the column names from\n\n \"\"\"\n if isinstance(dataset, (datasets.IterableDatasetDict, Mapping)):\n column_names = {split: dataset[split].column_names for split in dataset}\n else:\n column_names = dataset.column_names\n if isinstance(column_names, dict):\n column_names = list(column_names.values())[0]\n return column_names\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.take","title":"take(n, iterable)
","text":"Return first n items of the iterable as a list
Source code insafe/trainer/data_utils.py
def take(n, iterable):\n \"Return first n items of the iterable as a list\"\n return list(itertools.islice(iterable, n))\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.tokenize_fn","title":"tokenize_fn(row, tokenizer, tokenize_column='inputs', max_length=None, padding=False)
","text":"Perform the tokenization of a row Args: row: row to tokenize tokenizer: tokenizer to use tokenize_column: column to tokenize max_length: maximum size of the tokenized sequence padding: whether to pad the sequence
Source code in safe/trainer/data_utils.py
def tokenize_fn(\n row: Dict[str, Any],\n tokenizer: Callable,\n tokenize_column: str = \"inputs\",\n max_length: Optional[int] = None,\n padding: bool = False,\n):\n \"\"\"Perform the tokenization of a row\n Args:\n row: row to tokenize\n tokenizer: tokenizer to use\n tokenize_column: column to tokenize\n max_length: maximum size of the tokenized sequence\n padding: whether to pad the sequence\n \"\"\"\n # there's probably a way to do this with the tokenizer settings\n # but again, gotta move fast\n\n fast_tokenizer = (\n tokenizer.get_pretrained() if isinstance(tokenizer, SAFETokenizer) else tokenizer\n )\n\n return fast_tokenizer(\n row[tokenize_column],\n truncation=(max_length is not None),\n max_length=max_length,\n padding=padding,\n return_tensors=None,\n )\n
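For completeness, a small sketch of calling tokenize_fn directly on a toy row; the SAFE string is reused from the tutorial below and the max_length value is arbitrary.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import tokenize_fn

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
row = {"inputs": ["c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"]}

encoded = tokenize_fn(row, tokenizer=tokenizer, tokenize_column="inputs", max_length=128)
print(list(encoded.keys()))  # e.g. ["input_ids", "attention_mask", ...]
```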
"},{"location":"api/safe.viz.html","title":"Visualization","text":""},{"location":"api/safe.viz.html#safe.viz.to_image","title":"to_image(safe_str, fragments=None, legend=None, mol_size=(300, 300), use_svg=True, highlight_mode='lasso', highlight_bond_width_multiplier=12, **kwargs)
","text":"Display a safe string by highlighting the fragments that make it.
Parameters:
Name | Type | Description | Default
safe_str | str | the safe string to display | required
fragments | Optional[Union[str, Mol]] | list of fragments to highlight on the molecule. If None, the SAFE decomposition of the molecule will be used. | None
legend | Union[str, None] | A string to use as the legend under the molecule. | None
mol_size | Union[Tuple[int, int], int] | The size of the image to be returned | (300, 300)
use_svg | Optional[bool] | Whether to return an svg or png image | True
highlight_mode | Optional[str] | the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown | 'lasso'
highlight_bond_width_multiplier | int | the multiplier to use for the bond width when using the 'fill' mode | 12
**kwargs | Any | Additional arguments to pass to the drawing function. See the RDKit documentation related to MolDrawOptions for more details at https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html. | {}
Source code in safe/viz.py
def to_image(\n safe_str: str,\n fragments: Optional[Union[str, dm.Mol]] = None,\n legend: Union[str, None] = None,\n mol_size: Union[Tuple[int, int], int] = (300, 300),\n use_svg: Optional[bool] = True,\n highlight_mode: Optional[str] = \"lasso\",\n highlight_bond_width_multiplier: int = 12,\n **kwargs: Any,\n):\n \"\"\"Display a safe string by highlighting the fragments that make it.\n\n Args:\n safe_str: the safe string to display\n fragments: list of fragment to highlight on the molecules. If None, will use safe decomposition of the molecule.\n legend: A string to use as the legend under the molecule.\n mol_size: The size of the image to be returned\n use_svg: Whether to return an svg or png image\n highlight_mode: the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown\n highlight_bond_width_multiplier: the multiplier to use for the bond width when using the 'fill' mode\n **kwargs: Additional arguments to pass to the drawing function. See RDKit\n documentation related to `MolDrawOptions` for more details at\n https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.\n\n \"\"\"\n\n kwargs[\"legends\"] = legend\n kwargs[\"mol_size\"] = mol_size\n kwargs[\"use_svg\"] = use_svg\n if highlight_bond_width_multiplier is not None:\n kwargs[\"highlightBondWidthMultiplier\"] = highlight_bond_width_multiplier\n\n if highlight_mode == \"color\":\n kwargs[\"continuousHighlight\"] = False\n kwargs[\"circleAtoms\"] = kwargs.get(\"circleAtoms\", False) or False\n\n if isinstance(fragments, (str, dm.Mol)):\n fragments = [fragments]\n\n if fragments is None and highlight_mode is not None:\n fragments = [\n sf.decode(x, as_mol=False, remove_dummies=False, ignore_errors=False)\n for x in safe_str.split(\".\")\n ]\n elif fragments and len(fragments) > 0:\n parsed_fragments = []\n for fg in fragments:\n if isinstance(fg, str) and dm.to_mol(fg) is None:\n fg = sf.decode(fg, as_mol=False, remove_dummies=False, ignore_errors=False)\n parsed_fragments.append(fg)\n fragments = parsed_fragments\n else:\n fragments = []\n mol = dm.to_mol(safe_str, remove_hs=False)\n cm = plt.get_cmap(\"gist_rainbow\")\n current_colors = [cm(1.0 * i / len(fragments)) for i in range(len(fragments))]\n\n if highlight_mode == \"lasso\":\n return dm.viz.lasso_highlight_image(mol, fragments, **kwargs)\n\n atom_indices = []\n bond_indices = []\n atom_colors = {}\n bond_colors = {}\n\n for i, frag in enumerate(fragments):\n frag = dm.from_smarts(frag)\n atom_matches, bond_matches = dm.substructure_matching_bonds(mol, frag)\n atom_matches = list(itertools.chain(*atom_matches))\n bond_matches = list(itertools.chain(*bond_matches))\n atom_indices.extend(atom_matches)\n bond_indices.extend(bond_matches)\n atom_colors.update({x: current_colors[i] for x in atom_matches})\n bond_colors.update({x: current_colors[i] for x in bond_matches})\n\n return dm.viz.to_image(\n mol,\n highlight_atom=[atom_indices],\n highlight_bond=[bond_indices],\n highlightAtomColors=[atom_colors],\n highlightBondColors=[bond_colors],\n **kwargs,\n )\n
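A short usage sketch mirroring the tutorial cells further down: encode a molecule to SAFE and render it with one of the highlight modes.

```python
import datamol as dm
import safe as sf

celecoxib = dm.to_mol("Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1")
safe_str = sf.encode(celecoxib)

# returns an SVG image by default; highlight_mode can be "lasso", "fill" or "color"
img = sf.to_image(safe_str, highlight_mode="fill", legend="fill mode", mol_size=(300, 300))
```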
"},{"location":"tutorials/design-with-safe.html","title":"Molecular design","text":"In\u00a0[2]: Copied! import os\n\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n\n\nimport safe as sf\nimport datamol as dm\nimport os os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\" import safe as sf import datamol as dm
Load the default pretrained SAFE model.
We will use this single model for all the downstream molecular design tasks.
In\u00a0[3]: Copied!designer = sf.SAFEDesign.load_default(verbose=True)\n\ndesigner.model\ndesigner = sf.SAFEDesign.load_default(verbose=True) designer.model Out[3]:
SAFEDoubleHeadsModel(\n (transformer): GPT2Model(\n (wte): Embedding(1880, 768)\n (wpe): Embedding(1024, 768)\n (drop): Dropout(p=0.1, inplace=False)\n (h): ModuleList(\n (0-11): 12 x GPT2Block(\n (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (attn): GPT2Attention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n )\n (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (mlp): GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n )\n )\n )\n (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n )\n (lm_head): Linear(in_features=768, out_features=1880, bias=False)\n (multiple_choice_head): PropertyHead(\n (summary): Linear(in_features=768, out_features=64, bias=True)\n (activation): ReLU()\n (out): Linear(in_features=64, out_features=1, bias=True)\n )\n)
Let's start with the molecule below.
In\u00a0[4]: Copied!candidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\"\ncandidate_mol = dm.to_mol(candidate_smiles)\n\ndm.to_image(candidate_mol)\ncandidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\" candidate_mol = dm.to_mol(candidate_smiles) dm.to_image(candidate_mol) Out[4]: In\u00a0[6]: Copied!
generated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12)\n\ngenerated_smiles[:5]\ngenerated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12) generated_smiles[:5]
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:25.393 | INFO | safe.sample:de_novo_generation:581 - After sanitization, 82 / 100 (82.00 %) generated molecules are valid !\nOut[6]:
['CCCCOc1c(Br)cc(C)cc1-c1nc(C2(CC)CCN(C(C)C)CC2)cn2nc(C)nc12',\n 'CC(C)(C)OC(=O)Nc1ccc(C[NH+]2CC[C@@H]3OCCC[C@H]3C2)cn1',\n 'Cc1ccc(Br)c(NCCC(C)C(C)C)c1',\n 'CCOC(=O)C1=C(C)N=c2s/c(=C/c3c(C)[nH]c4ccccc34)c(=O)n2[C@@H]1c1ccc(OC)cc1',\n 'CCc1ccccc1-n1cc(O)c(C(=O)Nc2ccc(Cl)c(F)c2)n1']In\u00a0[7]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[7]: In\u00a0[8]: Copied!
scaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\"\n\ndm.to_image(scaffold)\nscaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\" dm.to_image(scaffold) Out[8]: In\u00a0[9]: Copied!
generated_smiles = designer.scaffold_decoration(\n scaffold=scaffold,\n n_samples_per_trial=12,\n n_trials=2,\n sanitize=True,\n do_not_fragment_further=True,\n)\n\ngenerated_mols = [dm.to_mol(x) for x in generated_smiles]\ngenerated_smiles = designer.scaffold_decoration( scaffold=scaffold, n_samples_per_trial=12, n_trials=2, sanitize=True, do_not_fragment_further=True, ) generated_mols = [dm.to_mol(x) for x in generated_smiles]
0%| | 0/2 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:48.620 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 21 / 24 (87.50 %) generated molecules are valid !\nIn\u00a0[10]: Copied!
dm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1)\ndm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1) Out[10]: In\u00a0[11]: Copied!
superstructure = \"c1ccc2ncncc2c1\"\n\ndm.to_image(superstructure)\nsuperstructure = \"c1ccc2ncncc2c1\" dm.to_image(superstructure) Out[11]: In\u00a0[12]: Copied!
generated_smiles = designer.super_structure(\n core=superstructure,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n attachment_point_depth=3,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.super_structure( core=superstructure, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, attachment_point_depth=3, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:38:24.884 | INFO | safe.sample:super_structure:496 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[12]:
['c1ncc2c(N3CCOCC3)ccc(N3CCNCC3)c2n1',\n 'N[C@H](CNc1ccc(C(F)(F)F)c2ncncc12)C(F)(F)F',\n 'C=CCCCNC(=S)Nc1ccc(C(F)(F)F)c2cncnc12',\n 'O=C(N[C@@H](CO)CCF)c1ccc(C(=O)[O-])c2ncncc12',\n 'O=C(CC=Nc1ccc(OC(F)(F)F)c2ncncc12)C(F)(F)F',\n 'NC(=Nc1ccc([N+](=O)[O-])c2cncnc12)C(F)(F)F',\n 'O=C(CCC(F)=C(F)F)Nc1ccc(C(F)(F)F)c2ncncc12',\n 'O=S(=O)(CCC(F)(F)F)Nc1cccc2cncnc12',\n 'O=S(=O)(Cl)c1ccc(C(F)(F)F)c2ncncc12',\n 'c1ncc2c(N3CCCCCC3)ccc(-c3cn[nH]c3)c2n1',\n 'NC(=O)CSCC(=O)Nc1ccc(C(=O)[O-])c2ncncc12',\n 'c1ncc2c(-n3cncn3)ccc(C3CCCCN3)c2n1']In\u00a0[14]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[14]: In\u00a0[15]: Copied!
motif = \"[*]-N1CCCCC1\"\n\ndm.to_image(motif)\nmotif = \"[*]-N1CCCCC1\" dm.to_image(motif) Out[15]: In\u00a0[26]: Copied!
# let's make some long sequence\ngenerated_smiles = designer.motif_extension(\n motif=motif,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n min_length=25,\n max_length=80,\n)\n\ngenerated_smiles\n# let's make some long sequence generated_smiles = designer.motif_extension( motif=motif, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, min_length=25, max_length=80, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:41:52.959 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 10 / 12 (83.33 %) generated molecules are valid !\nOut[26]:
['C1CCN([C@@H]2CCCC[C@@H]2[NH+]2CCOCC2)CC1',\n 'FC(F)(F)C(F)(F)CN1CCCCC1',\n 'O=NN(/C(=C/N1CCCCC1)N1CCCCC1)c1ccccc1',\n 'C1CCC(CC2(CC3CCCC3)CCCCC2C2CCCCCC2N2CCCCC2)CC1',\n '[Na+].[Na+].[O-]S(=S)(=S)N1CCCCC1',\n 'NC(CS)C(O)=NC(O)C(=O)N1CCCCC1',\n 'O=P(O)(O)CCOCCOP(=O)(O)SCCN1CCCCC1',\n 'C1CCN(N=c2nn[nH][nH]2)CC1.O.O',\n 'N.N#CC1C=CCN1N1CCCCC1',\n 'O=C1CCCCC1.O=C1COCCCN1N1CCCCC1']In\u00a0[27]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[27]: In\u00a0[28]: Copied!
side_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\"\n\ndm.to_image(side_chains)\nside_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\" dm.to_image(side_chains) Out[28]: In\u00a0[29]: Copied!
generated_smiles = designer.scaffold_morphing(\n side_chains=side_chains,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ndm.to_image(generated_smiles[:12], mol_size=(350, 200))\ngenerated_smiles = designer.scaffold_morphing( side_chains=side_chains, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) dm.to_image(generated_smiles[:12], mol_size=(350, 200))
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:05.888 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[29]: In\u00a0[30]: Copied!
linker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"]\n\ndm.to_image(linker_generation)\nlinker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"] dm.to_image(linker_generation) Out[30]: In\u00a0[31]: Copied!
generated_smiles = designer.linker_generation(\n *linker_generation,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.linker_generation( *linker_generation, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:14.034 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[31]:
['O=C(Oc1cccc(-c2nc(N3CCCCC3)nc3c2CCN3)c1)c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'O=C(Oc1cccc(-c2nc(-c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)nc3c2CCN3)c1)N1CCCCC1',\n 'N=C(N)NCCCN1C(=O)N(CN2CCCCC2)C(=O)C2CC(c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)CC21',\n 'N=C(N)NCCCN1C(=O)N(Cc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)C(=O)C2CC(N3CCCCC3)CC21',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc5c4oc4c6ccccc6c(Nc6cccc(N7CCCCC7)c6)cc54)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc(Nc5cc6c7cccc(N8CCCCC8)c7oc6c6ccccc56)c4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cc(-c5nc6n(n5)CC=C[C@H]6N5CCCCC5)ncn4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc([C@@H]4C=CCn5nc(-c6cc(N7CCCCC7)ncn6)nc54)cc23)c1',\n 'O=C1C[C@@H]2C[C@H]3[C@H](N4CCCCC4)CC4COCCC42O[C@@H]3CC(CCc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)O1',\n 'O=C1C[C@@H]2C[C@@H]3[C@@H](CC(CCN4CCCCC4)O1)OC21CCOCC1C[C@H]3c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'Brc1cccc(Nc2ncnc3ccc(NNc4ccc(SCCCCCCc5ccc(N6CCCCC6)cc5)cc4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4ccc(CCCCCCSc5ccc(NNN6CCCCC6)cc5)cc4)cc23)c1']In\u00a0[32]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[32]:
The End!
"},{"location":"tutorials/design-with-safe.html#de-novo-generation","title":"De novo generation\u00b6","text":"Generation of novel molecules without any constraints.
"},{"location":"tutorials/design-with-safe.html#scaffold-decoration","title":"Scaffold Decoration\u00b6","text":"For scaffold decoration, we wish to generate new molecules that would contain a given scaffold as core. Usually, the attachment point on the scaffold should dictate where the new vectors will be added.
"},{"location":"tutorials/design-with-safe.html#super-structure-generation","title":"Super structure generation\u00b6","text":"In super structure generation, we just want to generate superstructure of a molecular subgraph
"},{"location":"tutorials/design-with-safe.html#motif-extension","title":"Motif Extension\u00b6","text":"In motif extension, we are interested in generating a molecule containing a given motif as starting point.
"},{"location":"tutorials/design-with-safe.html#scaffold-morphing","title":"Scaffold Morphing\u00b6","text":"In scaffold morphing, we wish to replace a scaffold by another one in a molecule. The process requires as input that the user provides either the side chains or the input molecules and the core
"},{"location":"tutorials/design-with-safe.html#linker-generation","title":"Linker generation\u00b6","text":"Linker generation is mostly the same thing as scaffold morphing ...
"},{"location":"tutorials/extracting-representation-molfeat.html","title":"so really we just need our custom converter","text":"In\u00a0[1]: Copied!%load_ext autoreload\n%autoreload 2\n%load_ext autoreload %autoreload 2 In\u00a0[2]: Copied!
import safe\nimport torch\nimport datamol as dm\nimport types\nfrom molfeat.trans.pretrained import PretrainedMolTransformer\nfrom molfeat.trans.pretrained import PretrainedHFTransformer\n\nfrom molfeat.trans.pretrained.hf_transformers import HFModel\nfrom safe.trainer.model import SAFEDoubleHeadsModel\nfrom safe.tokenizer import SAFETokenizer\nimport safe import torch import datamol as dm import types from molfeat.trans.pretrained import PretrainedMolTransformer from molfeat.trans.pretrained import PretrainedHFTransformer from molfeat.trans.pretrained.hf_transformers import HFModel from safe.trainer.model import SAFEDoubleHeadsModel from safe.tokenizer import SAFETokenizer In\u00a0[3]: Copied!
safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")
We now need to build the molfeat
's HFModel
instance by wrapping our model.
safe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\nsafe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())
You can put the above process in the __init__
of the SAFEMolTransformer
if you wish as we will be doing below.
class SAFEMolTransformer(PretrainedHFTransformer):\n \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is \n how we convert the input molecules into the safe format\"\"\"\n def __init__(self, kind=None, notation=\"safe\", **kwargs):\n if kind is None:\n # we load the default SAFE model if the exact SAFE GPT model \n # to use is not provided\n safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\n safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\n super().__init__(kind, notation=None, **kwargs)\n # now we change the internal converter\n # overriding the internal converter of SmilesConverter leverages the exception handling\n # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS\n # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds.\n self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe)\n # you could also do any of the following:\n # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode)\n # self.converter = safe # the safe module\nclass SAFEMolTransformer(PretrainedHFTransformer): \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is how we convert the input molecules into the safe format\"\"\" def __init__(self, kind=None, notation=\"safe\", **kwargs): if kind is None: # we load the default SAFE model if the exact SAFE GPT model # to use is not provided safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\") kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained()) super().__init__(kind, notation=None, **kwargs) # now we change the internal converter # overriding the internal converter of SmilesConverter leverages the exception handling # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds. self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe) # you could also do any of the following: # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode) # self.converter = safe # the safe module
2023-12-20 22:57:39.310 | WARNING | molfeat.trans.base:__init__:51 - The 'SAFEMolTransformer' interaction has been superseded by a new class with id 0x2ad77d6a0\n
Let's use the GPT pooler, which uses the last non-padding token (often eos) since the model is GPT2-like. For other options, see: https://molfeat-docs.datamol.io/stable/api/molfeat.utils.html#pooling
# Let's use the GPT pooling method and only take the last hidden layer\nsafe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1])\nsafe_transformers\n# Let's use the GPT pooling method and only take the last hidden layer safe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1]) safe_transformers Out[116]:
SAFEMolTransformer(dtype=np.float32)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.SAFEMolTransformer
SAFEMolTransformer(dtype=np.float32)In\u00a0[117]: Copied!
mols = dm.data.freesolv().iloc[:10].smiles.values\nmols = dm.data.freesolv().iloc[:10].smiles.values In\u00a0[118]: Copied!
safe_transformers(mols)\nsafe_transformers(mols) Out[118]:
array([[ 0.05216356, 0.10754181, 0.07509107, ..., 0.04756968,\n -0.08228929, -0.11568106],\n [ 0.02449008, 0.04048932, 0.14489463, ..., 0.11410899,\n -0.02203353, 0.08706839],\n [-0.07425696, 0.11859665, 0.19010407, ..., 0.10526019,\n 0.08878426, -0.06609854],\n ...,\n [ 0.07867863, 0.19300285, 0.23054805, ..., -0.00737952,\n 0.07542405, 0.00289541],\n [ 0.12092628, -0.01785688, 0.19791883, ..., 0.13796932,\n 0.11520796, -0.15333697],\n [-0.02005584, 0.13946685, 0.18568742, ..., 0.07080407,\n 0.06991849, -0.07151204]], dtype=float32)In\u00a0[119]: Copied!
from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\n\ndf = dm.data.freesolv()\ndf[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode)\ndf = df.dropna(subset=\"safe\")\n# we have to remove the molecules that cannot be converted \n# (no breakable bonds with our default methodology)\nfrom sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline df = dm.data.freesolv() df[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode) df = df.dropna(subset=\"safe\") # we have to remove the molecules that cannot be converted # (no breakable bonds with our default methodology) In\u00a0[120]: Copied!
X, y = df[\"smiles\"].values, df[\"expt\"].values\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2)\n\n# The Molfeat transformer seamlessly integrates with Scikit-learn Pipeline!\npipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())])\nX, y = df[\"smiles\"].values, df[\"expt\"].values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2) # The Molfeat transformer seamlessly integrates with Scikit-learn Pipeline! pipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())]) In\u00a0[121]: Copied!
with dm.without_rdkit_log():\n pipe.fit(X_train, y_train)\n score = pipe.score(X_test, y_test)\n y_pred = pipe.predict(X_test)\nwith dm.without_rdkit_log(): pipe.fit(X_train, y_train) score = pipe.score(X_test, y_test) y_pred = pipe.predict(X_test) In\u00a0[122]: Copied!
print(\"R2 score:\", score)\nprint(\"R2 score:\", score)
R2 score: 0.4971483821661925\nIn\u00a0[123]: Copied!
import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots()\nax.scatter(y_test, y_pred)\nax.set_xlabel(\"Target\")\nax.set_ylabel(\"Preds\")\nimport matplotlib.pyplot as plt fig, ax = plt.subplots() ax.scatter(y_test, y_pred) ax.set_xlabel(\"Target\") ax.set_ylabel(\"Preds\") Out[123]:
Text(0, 0.5, 'Preds')
Not really a great result. Any other model in molfeat
would do better.
Because the SAFE model is not a standard HuggingFace transformers
model, we need to wrap it.
Why are we doing this? Because we want to leverage the structure of molfeat and not have to write our own pooling for the model. This can be done by using the huggingface molecule transformer PretrainedHFTransformer rather than the general-purpose pretrained model class PretrainedMolTransformer, where we would have to define our own _embed and _convert functions.
We have multiple options here: we can override the _convert method or even the _embed method, but the best thing about molfeat is how flexible it is and all the shortcuts it provides. In this case, we just need to change the custom converter.
"},{"location":"tutorials/extracting-representation-molfeat.html#so-really-we-just-need-our-custom-converter","title":"so really we just need our custom converter\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#basic-test","title":"Basic Test\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#tips","title":"Tips\u00b6","text":"None
molecules at some steps in the conversion to SAFE. This can happen if your slicing algorithm of choice is not working. In that case, please filter your datasets to remove molecules that fail the encoding steps first. You can always use the very robust safe.utils.convert_to_safe
, which augments the default BRICS slicing with some graph partitioning algorithm.import safe as sf\nimport datamol as dm\n\ncelecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\"\ncelecoxib_mol = dm.to_mol(celecoxib)\n\ndisplay(dm.to_image(celecoxib_mol))\nimport safe as sf import datamol as dm celecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\" celecoxib_mol = dm.to_mol(celecoxib) display(dm.to_image(celecoxib_mol)) In\u00a0[3]: Copied!
safe_str = sf.encode(celecoxib_mol)\n\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\nRepresentation using 4 fragments\n
SAFE strings are SMILES
Any SAFE string is a valid SMILES and can be read by RDKit without any decoding trick.
In\u00a0[4]: Copied!reconstructed = dm.to_mol(safe_str)\n\ndisplay(dm.to_image(reconstructed))\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nreconstructed = dm.to_mol(safe_str) display(dm.to_image(reconstructed)) assert dm.same_mol(celecoxib_mol, reconstructed)
SAFE supports randomization
You can generate randomized SAFE strings.
In\u00a0[5]: Copied!random_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True)\n\nprint(random_safe_str)\n\nreconstructed = dm.to_mol(safe_str)\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nrandom_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True) print(random_safe_str) reconstructed = dm.to_mol(safe_str) assert dm.same_mol(celecoxib_mol, reconstructed)
c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1\n
Fragment order in SAFE does not matter
Any permutation of the fragment order in a SAFE string preserves the molecule's identity
In\u00a0[6]: Copied!import numpy as np\n\nfragments = safe_str.split(\".\")\nrandomized_fragment_safe_str = np.random.permutation(fragments).tolist()\nrandomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str)\n\nprint(randomized_fragment_safe_str, safe_str)\nassert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)\nimport numpy as np fragments = safe_str.split(\".\") randomized_fragment_safe_str = np.random.permutation(fragments).tolist() randomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str) print(randomized_fragment_safe_str, safe_str) assert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)
c14ccc(S(N)(=O)=O)cc1.c15cc3nn14.Cc1ccc5cc1.C3(F)(F)F c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\n
Use your own slicing logic
By default, SAFE strings are generated using BRICS; however, the following slicing algorithms are also supported:
Hussain-Rea (hr)
RECAP (recap)
MMPA (mmpa)
attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atoms corresponding to the bonds to break.
In\u00a0[7]: Copied!def my_slicer(mol):\n \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\"\n for bond in mol.GetBonds():\n if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()):\n yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())\ndef my_slicer(mol): \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\" for bond in mol.GetBonds(): if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()): yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) In\u00a0[9]: Copied!
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer)
print(safe_str)
print(f"Representation using {len(safe_str.split('.'))} fragments")

c14cc(C(F)(F)F)nn13.c13ccc(S(N)(=O)=O)cc1.Cc1ccc4cc1
Representation using 3 fragments
Or simply use a SMARTS or a list of SMARTS.
# The above is equivalent to using the following SMARTS:
smart_slicer = ["[r]-;!@[r]"]
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer)
print(safe_str)
print(f"Representation using {len(safe_str.split('.'))} fragments")

c13cc(C(F)(F)F)nn14.c14ccc(S(N)(=O)=O)cc1.Cc1ccc3cc1
Representation using 3 fragments
safe_fragment = safe_str.split(".")
safe_fragment

['c13cc(C(F)(F)F)nn14', 'c14ccc(S(N)(=O)=O)cc1', 'Cc1ccc3cc1']
# the following will fail
dm.to_mol(safe_fragment[0])

[11:20:14] SMILES Parse Error: unclosed ring for input: 'c13cc(C(F)(F)F)nn14'
# while this works
sf.decode(safe_fragment[0], as_mol=True)

# if you want to keep the attachment points, then use remove_dummies=False
sf.decode(safe_fragment[0], as_mol=True, remove_dummies=False)

sf.to_image(safe_str)
There are 3 display modes for highlighting the fragments in a SAFE string. The difference between those modes is illustrated below using two different slicing algorithms.
Overlapping fragments
Note that because some fragments might match overlapping substructures of the molecule (for example, the same fragment appearing multiple times in the molecule), the highlighting might assign the same color to these fragments.
from IPython.display import display
from ipywidgets import widgets, HBox

def display_image(safe_str):
    image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode="lasso", legend="lasso mode").data.encode(), format='svg+xml')
    image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode="fill", legend="fill mode").data.encode(), format='svg+xml')
    image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode="color", legend="color mode").data.encode(), format='svg+xml')
    hbox = HBox([image_lasso, image_fill, image_color])
    display(hbox)
# display for brics
safe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer="brics")
display_image(safe_str_brics)

HBox(children=(Image(value=b'<svg xmlns="http://www.w3.org/2000/svg" ...', format='svg+xml'), Image(value=b'<s…
# display with MMPA
safe_str_mmpa = sf.encode(celecoxib_mol, canonical=True, slicer="mmpa")
display_image(safe_str_mmpa)

HBox(children=(Image(value=b'<svg xmlns="http://www.w3.org/2000/svg" ...', format='svg+xml'), Image(value=b'<s…
The End !
"},{"location":"tutorials/getting-started.html#getting-started-with-safe","title":"Getting Started with SAFE\u00b6","text":"The SAFE encoding format is a rewriting of SMILES to ensure that any molecule can be written as a sequence of fragments where atoms or tokens corresponding to given fragments form a substring (ontiguous sequence) in the line notation representation.
SAFE addresses some of the limitations of SMILES strings when it comes to generative design:
SAFE: native support for (sub)structure-constrained design. Others: different generative models for different generative tasks; extensive substructure matching for filtering after generation; multi-step generative processes (e.g. Liao et al. 2023); graph-based approaches with their limitations.
SAFE: any molecule generation as a simple NLP task (sequence completion or mask filling); a single autoregressive sequence model for both linker generation and scaffold decoration. Others: complex training and decoding schemes for scaffold-constrained generation (e.g. Ar\u00fas-Pous et al. 2020); complex sampling algorithms for scaffold-constrained generation (e.g. Langevin et al. 2020).
SAFE: SAFE strings are SMILES strings. Others: requires a different chemical language (e.g. Krenn et al. 2022)."},{"location":"tutorials/getting-started.html#using-safe","title":"Using SAFE\u00b6","text":"In the following, we highlight how to use SAFE and some of the properties of SAFE strings.
"},{"location":"tutorials/getting-started.html#encoding","title":"Encoding\u00b6","text":"SAFE represents fragments
SAFE represents molecules as a set of N fragments: [Fragment_1].[Fragment_i].[Fragment_N]
"},{"location":"tutorials/getting-started.html#decoding","title":"Decoding\u00b6","text":"Fragment order in SAFE does not matter
Each SAFE fragment
is a valid molecule itself; however, you need to use the decoder to recover molecules whose attachment points are not all fulfilled.
We provide a visualization module to display a SAFE string, with highlighting of all the fragments that compose it.
"},{"location":"tutorials/how-it-works.html","title":"How SAFE encoding works?","text":"In\u00a0[1]: Copied!import datamol as dm\n\nfrom rdkit import Chem\nfrom rdkit.Chem.Draw import rdDepictor\nfrom rdkit.Chem import rdChemReactions as rdr\nrdDepictor.SetPreferCoordGen(True)\nimport datamol as dm from rdkit import Chem from rdkit.Chem.Draw import rdDepictor from rdkit.Chem import rdChemReactions as rdr rdDepictor.SetPreferCoordGen(True) In\u00a0[2]: Copied!
smiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"]\nlegends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"]\ndm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True)\nsmiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"] legends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"] dm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True) Out[2]:
In the example above, we can see that anisole (methoxybenzene)
can be represented as two fragments that can be connected given proper attachment points.
To achieve this we are interested in attaching 2 fragments together (the methoxy
and the phenyl
groups). In RDKit, this can usually be achieved using chemical reactions. For convenience, we will prefer a standardized representation of attachment points that includes an atom mapping.
smiles = ['c1cc(*)ccc1.O(*)C', 'c1cc([*:1])ccc1.O([*:1])C']
dm.viz.to_image([dm.to_mol(x) for x in smiles], n_cols=len(smiles), use_svg=True)
To attach the two fragments, I can write a simple chemical transformation. Since SMARTS and SMILES syntax do not mix very well when it comes to *
I will assume an isotopic representation [1*]
instead of [*:1]
rxn = rdr.ReactionFromSmarts("[1*][*:1].[1*][*:2]>>[*:1][*:2]")
rxn

# replace atom map by isotopes
phenyl = "c1cc([*:1])ccc1".replace("[*:1]", "[1*]")
methoxy = "O([*:1])C".replace("[*:1]", "[1*]")

# run the reaction
prod = rxn.RunReactants((dm.to_mol(phenyl), dm.to_mol(methoxy)))
prod[0][0]
We can achieve the same result by using the RDKit API in a slightly more tedious way.
replacement_sub = Chem.MolFromSmarts("[1*]")
prod = Chem.ReplaceSubstructs(dm.to_mol(phenyl), replacement_sub, dm.to_mol(methoxy), replacementConnectionPoint=0)
prod = dm.remove_dummies(prod[0], dummy="[1*]")
prod

[11:14:08] WARNING: not removing hydrogen atom without neighbors
But wait, could we attach the fragments using only string operations on the SMILES?
It's not possible with a naive substring replacement, but recall that we just said numbers in SMILES represent connectivity points?
phenyl = \"c1cc([*:1])ccc1\"\nmethoxy = \"O([*:1])C\"\ncomposite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C\ncompo = dm.to_mol(composite)\nphenyl = \"c1cc([*:1])ccc1\" methoxy = \"O([*:1])C\" composite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C compo = dm.to_mol(composite)
Since the connectivity point 1 is already present in the phenyl group, we need to start by opening a new connectivity point: 2
attached_composite = composite.replace("[*:1]", "2")
dm.to_mol(attached_composite)

[11:14:10] SMILES Parse Error: syntax error while parsing: c1cc(2)ccc1.O(2)C
[11:14:10] SMILES Parse Error: Failed parsing SMILES 'c1cc(2)ccc1.O(2)C' for input: 'c1cc(2)ccc1.O(2)C'
The previous line does not work because it violates the SMILES syntax: we are not taking into account the branching brackets surrounding the attachment point.
We could try to regenerate the SMILES, or scan the sequence and remove the brackets where possible, but we want to limit the operations to str.replace
. So let's try again.
attached_composite = composite.replace("([*:1])", "2").replace("[*:1]", "2")
dm.to_image(attached_composite, legends=[attached_composite])
You can see that the anisole molecule is represented as two "fragments" [Fragment1].[Fragment2]
. That is what SAFE is about.
In summary, to build a SAFE string, we just need to follow the steps below:
The End !
"},{"location":"tutorials/how-it-works.html#how-safe-encoding-works","title":"How SAFE encoding works?\u00b6","text":"The intuition behind safe is quite simple: we want to represent any molecule as a set of connected fragments
.
Let's start first by revisiting some information about the SMILES syntax:
An asterisk *
in a SMILES is usually employed to indicate any atom OR an attachment point for any group. It's particularly useful for SMARTS matching.
Numbers in SMILES syntax indicate connectivity points between two atoms. Two-digit numbers need to be preceded by %
.
This is partially explained on the wikipedia ring section of SMILES.
.
in SMILES indicates the presence of additional fragments and is used to separate them. A good resource on the subject is the DAYLIGHT page.
We illustrate this information below!
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"\ud83e\uddba SAFE Sequential Attachment-based Fragment Embedding (SAFE) is a novel molecular line notation that represents molecules as an unordered sequence of fragment blocks to improve molecule design using generative models.Paper | Docs | \ud83e\udd17 Model | \ud83e\udd17 Training Dataset
"},{"location":"index.html#overview-of-safe","title":"Overview of SAFE","text":"
SAFE is a deep-learning-friendly molecular representation. It's an encoding that leverages a peculiarity in the decoding schemes of SMILES to allow the representation of molecules as a contiguous sequence of connected fragments. SAFE strings are valid SMILES strings, and thus preserve the same amount of information. The intuitive representation of molecules as an ordered sequence of connected fragments greatly simplifies the following tasks often encountered in molecular design:
The construction of a SAFE string requires defining a molecular fragmentation algorithm. By default, we use BRICS, but any other fragmentation algorithm can be used. The image below illustrates the process of building a SAFE string. The resulting string is a valid SMILES that can be read by datamol or RDKit.
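As a small illustration of this flexibility, here is a minimal sketch (reusing the celecoxib example and the slicer argument shown in the tutorials) that encodes the same molecule with the default BRICS slicer and with a custom SMARTS pattern:

import safe as sf

celecoxib = "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1"

brics_safe = sf.encode(celecoxib)                          # default BRICS slicing
custom_safe = sf.encode(celecoxib, slicer=["[r]-;!@[r]"])  # acyclic single bonds between two ring atoms

print(brics_safe)
print(custom_safe)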
"},{"location":"index.html#news","title":"News \ud83d\ude80","text":""},{"location":"index.html#20240115","title":"\ud83d\udca5 2024/01/15 \ud83d\udca5","text":"You can install safe
using pip:
pip install safe-mol\n
You can use conda/mamba:
mamba install -c conda-forge safe-mol\n
"},{"location":"index.html#datasets-and-models","title":"Datasets and Models","text":"Type Name Infos Size Comment Model datamol-io/safe-gpt 87M params 350M Default model Training Dataset datamol-io/safe-gpt 1.1B rows 250GB Training dataset Drug Benchmark Dataset datamol-io/safe-drugs 26 rows 20 kB Benchmarking dataset"},{"location":"index.html#usage","title":"Usage","text":"The tutorials in the documentation can help you get started with safe
and SAFE-GPT
.
We summarize some key functions provided by the safe
package below.
safe.encode
Translates a SMILES string into its corresponding SAFE string. safe.decode
Translates a SAFE string into its corresponding SMILES string. The SAFE decoder just augments RDKit's Chem.MolFromSmiles
with an optional correction argument to take care of missing hydrogen bonds. safe.split
Tokenizes a SAFE string to build a generative model."},{"location":"index.html#examples","title":"Examples","text":""},{"location":"index.html#translation-between-safe-and-smiles-representations","title":"Translation between SAFE and SMILES representations","text":"import safe\n\nibuprofen = \"CC(Cc1ccc(cc1)C(C(=O)O)C)C\"\n\n# SMILES -> SAFE -> SMILES translation\ntry:\n ibuprofen_sf = safe.encode(ibuprofen) # c12ccc3cc1.C3(C)C(=O)O.CC(C)C2\n ibuprofen_smi = safe.decode(ibuprofen_sf, canonical=True) # CC(C)Cc1ccc(C(C)C(=O)O)cc1\nexcept safe.EncoderError:\n pass\nexcept safe.DecoderError:\n pass\n\nibuprofen_tokens = list(safe.split(ibuprofen_sf))\n
"},{"location":"index.html#trainingfinetuning-a-new-model","title":"Training/Finetuning a (new) model","text":"A command line interface is available to train a new model, please run safe-train --help
. You can also provide an existing checkpoint to continue training or finetune on your own dataset.
For example:
safe-train --config <path to config> \\\n --model-path <path to model> \\\n --tokenizer <path to tokenizer> \\\n --dataset <path to dataset> \\\n --num_labels 9 \\\n --torch_compile True \\\n --optim \"adamw_torch\" \\\n --learning_rate 1e-5 \\\n --prop_loss_coeff 1e-3 \\\n --gradient_accumulation_steps 1 \\\n --output_dir \"<path to outputdir>\" \\\n --max_steps 5\n
"},{"location":"index.html#references","title":"References","text":"If you use this repository, please cite the following related paper:
@misc{noutahi2023gotta,\n title={Gotta be SAFE: A New Framework for Molecular Design},\n author={Emmanuel Noutahi and Cristian Gabellini and Michael Craig and Jonathan S. C Lim and Prudencio Tossou},\n year={2023},\n eprint={2310.10773},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n
"},{"location":"index.html#license","title":"License","text":"Note that all data and model weights of SAFE are exclusively licensed for research purposes. The accompanying dataset is licensed under CC BY 4.0, which permits solely non-commercial usage. See DATA_LICENSE for details.
This code base is licensed under the Apache-2.0 license. See LICENSE for details.
"},{"location":"index.html#development-lifecycle","title":"Development lifecycle","text":""},{"location":"index.html#setup-dev-environment","title":"Setup dev environment","text":"mamba create -n safe -f env.yml\nmamba activate safe\n\npip install --no-deps -e .\n
"},{"location":"index.html#tests","title":"Tests","text":"You can run tests locally with:
pytest\n
"},{"location":"cli.html","title":"CLI for model Training","text":"You can train a new SAFE
generative models using the provided CLI, which uses \ud83e\udd17 Transformers !
Usage:
safe-train [-h] [--model_path MODEL_PATH] [--config CONFIG] [--tokenizer TOKENIZER] [--num_labels NUM_LABELS]\n [--include_descriptors [INCLUDE_DESCRIPTORS]] [--no_include_descriptors] [--prop_loss_coeff PROP_LOSS_COEFF]\n [--wandb_project WANDB_PROJECT] [--wandb_watch {gradients,all}] [--cache_dir CACHE_DIR]\n [--torch_dtype {auto,bfloat16,float16,float32}] [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] [--model_max_length MODEL_MAX_LENGTH]\n [--dataset DATASET] [--is_tokenized [IS_TOKENIZED]] [--streaming [STREAMING]] [--text_column TEXT_COLUMN] --output_dir\n OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]\n [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}] [--prediction_loss_only [PREDICTION_LOSS_ONLY]]\n [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]\n [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]\n [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]\n [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]\n [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS]\n [--max_steps MAX_STEPS]\n [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}]\n [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}]\n [--log_level_replica {debug,info,warning,error,critical,passive}] [--log_on_each_node [LOG_ON_EACH_NODE]]\n [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]\n [--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]\n [--no_logging_nan_inf_filter] [--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT]\n [--save_safetensors [SAVE_SAFETENSORS]] [--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]]\n [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]\n [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL]\n [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}] [--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]]\n [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}] [--tpu_num_cores TPU_NUM_CORES]\n [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG [DEBUG ...]] [--dataloader_drop_last [DATALOADER_DROP_LAST]]\n [--eval_steps EVAL_STEPS] [--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME]\n [--disable_tqdm DISABLE_TQDM] [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns]\n [--label_names LABEL_NAMES [LABEL_NAMES ...]] [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]\n [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER] [--ignore_data_skip [IGNORE_DATA_SKIP]]\n [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]\n [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED]\n [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]\n [--optim 
{adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]\n [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]]\n [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]\n [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]\n [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] [--no_dataloader_pin_memory]\n [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics]\n [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] [--push_to_hub [PUSH_TO_HUB]]\n [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID]\n [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] [--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]]\n [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]\n [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]\n [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]\n [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO]\n [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT] [--torch_compile [TORCH_COMPILE]]\n [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]\n
Options:
-h, --help show this help message and exit\n--model_path MODEL_PATH\n Optional model path or model name to use as a starting point for the safe model (default: None)\n--config CONFIG Path to the default config file to use for the safe model (default: None)\n--tokenizer TOKENIZER\n--num_labels NUM_LABELS\n Optional number of labels for the descriptors (default: None)\n--include_descriptors [INCLUDE_DESCRIPTORS]\n Whether to train with descriptors if they are available or Not (default: True)\n--no_include_descriptors\n Whether to train with descriptors if they are available or Not (default: False)\n--prop_loss_coeff PROP_LOSS_COEFF\n coefficient for the propery loss (default: 0.01)\n--wandb_project WANDB_PROJECT\n Name of the wandb project to use to log the SAFE model parameter (default: safe-gpt2)\n--wandb_watch {gradients,all}\n Whether to watch the wandb models or not (default: None)\n--cache_dir CACHE_DIR\n Where do you want to store the pretrained models downloaded from s3 (default: None)\n--torch_dtype {auto,bfloat16,float16,float32}\n Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the dtype will be\n automatically derived from the model's weights. (default: None)\n--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]\n It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights\n are loaded.set True will benefit LLM loading time and RAM consumption. Only valid when loading a pretrained model\n (default: False)\n--model_max_length MODEL_MAX_LENGTH\n Maximum sequence length. Sequences will be right padded (and possibly truncated) up to that value. (default: 1024)\n--dataset DATASET Path to the preprocessed dataset to use for the safe model building (default: None)\n--is_tokenized [IS_TOKENIZED]\n whether the dataset submitted as input is already tokenized or not (default: False)\n--streaming [STREAMING]\n Whether to use a streaming dataset or not (default: False)\n--text_column TEXT_COLUMN\n Column containing text data to process. (default: inputs)\n--output_dir OUTPUT_DIR\n The output directory where the model predictions and checkpoints will be written. (default: None)\n--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]\n Overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint\n directory. (default: False)\n--do_train [DO_TRAIN]\n Whether to run training. (default: False)\n--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)\n--do_predict [DO_PREDICT]\n Whether to run predictions on the test set. (default: False)\n--evaluation_strategy {no,steps,epoch}\n The evaluation strategy to use. (default: no)\n--prediction_loss_only [PREDICTION_LOSS_ONLY]\n When performing evaluation and predictions, only returns the loss. (default: False)\n--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for training. (default: 8)\n--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for evaluation. (default: 8)\n--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE\n Deprecated, the use of `--per_device_train_batch_size` is preferred. Batch size per GPU/TPU core/CPU for training.\n (default: None)\n--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE\n Deprecated, the use of `--per_device_eval_batch_size` is preferred. 
Batch size per GPU/TPU core/CPU for evaluation.\n (default: None)\n--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS\n Number of updates steps to accumulate before performing a backward/update pass. (default: 1)\n--eval_accumulation_steps EVAL_ACCUMULATION_STEPS\n Number of predictions steps to accumulate before moving the tensors to the CPU. (default: None)\n--eval_delay EVAL_DELAY\n Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy.\n (default: 0)\n--learning_rate LEARNING_RATE\n The initial learning rate for AdamW. (default: 5e-05)\n--weight_decay WEIGHT_DECAY\n Weight decay for AdamW if we apply some. (default: 0.0)\n--adam_beta1 ADAM_BETA1\n Beta1 for AdamW optimizer (default: 0.9)\n--adam_beta2 ADAM_BETA2\n Beta2 for AdamW optimizer (default: 0.999)\n--adam_epsilon ADAM_EPSILON\n Epsilon for AdamW optimizer. (default: 1e-08)\n--max_grad_norm MAX_GRAD_NORM\n Max gradient norm. (default: 1.0)\n--num_train_epochs NUM_TRAIN_EPOCHS\n Total number of training epochs to perform. (default: 3.0)\n--max_steps MAX_STEPS\n If > 0: set total number of training steps to perform. Override num_train_epochs. (default: -1)\n--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}\n The scheduler type to use. (default: linear)\n--warmup_ratio WARMUP_RATIO\n Linear warmup over warmup_ratio fraction of total steps. (default: 0.0)\n--warmup_steps WARMUP_STEPS\n Linear warmup over warmup_steps. (default: 0)\n--log_level {debug,info,warning,error,critical,passive}\n Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning',\n 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults\n to 'passive'. (default: passive)\n--log_level_replica {debug,info,warning,error,critical,passive}\n Logger log level to use on replica nodes. Same choices and defaults as ``log_level`` (default: warning)\n--log_on_each_node [LOG_ON_EACH_NODE]\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: True)\n--no_log_on_each_node\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: False)\n--logging_dir LOGGING_DIR\n Tensorboard log dir. (default: None)\n--logging_strategy {no,steps,epoch}\n The logging strategy to use. (default: steps)\n--logging_first_step [LOGGING_FIRST_STEP]\n Log the first global_step (default: False)\n--logging_steps LOGGING_STEPS\n Log every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted as\n ratio of total training steps. (default: 500)\n--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]\n Filter nan and inf losses for logging. (default: True)\n--no_logging_nan_inf_filter\n Filter nan and inf losses for logging. (default: False)\n--save_strategy {no,steps,epoch}\n The checkpoint save strategy to use. (default: steps)\n--save_steps SAVE_STEPS\n Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be\n interpreted as ratio of total training steps. (default: 500)\n--save_total_limit SAVE_TOTAL_LIMIT\n If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. 
When\n `load_best_model_at_end` is enabled, the 'best' checkpoint according to `metric_for_best_model` will always be retained in\n addition to the most recent ones. For example, for `save_total_limit=5` and `load_best_model_at_end=True`, the four last\n checkpoints will always be retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,\n it is possible that two checkpoints are saved: the last one and the best one (if they are different). Default is unlimited\n checkpoints (default: None)\n--save_safetensors [SAVE_SAFETENSORS]\n Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: False)\n--save_on_each_node [SAVE_ON_EACH_NODE]\n When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one\n (default: False)\n--no_cuda [NO_CUDA] Do not use CUDA even when it is available (default: False)\n--use_mps_device [USE_MPS_DEVICE]\n This argument is deprecated. `mps` device will be used if available similar to `cuda` device. It will be removed in\n version 5.0 of \ud83e\udd17 Transformers (default: False)\n--seed SEED Random seed that will be set at the beginning of training. (default: 42)\n--data_seed DATA_SEED\n Random seed to be used with data samplers. (default: None)\n--jit_mode_eval [JIT_MODE_EVAL]\n Whether or not to use PyTorch jit trace for inference (default: False)\n--use_ipex [USE_IPEX]\n Use Intel extension for PyTorch when it is available, installation: 'https://github.com/intel/intel-extension-for-pytorch'\n (default: False)\n--bf16 [BF16] Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or using CPU\n (no_cuda). This is an experimental API and it may change. (default: False)\n--fp16 [FP16] Whether to use fp16 (mixed) precision instead of 32-bit (default: False)\n--fp16_opt_level FP16_OPT_LEVEL\n For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at\n https://nvidia.github.io/apex/amp.html (default: O1)\n--half_precision_backend {auto,cuda_amp,apex,cpu_amp}\n The backend to be used for half precision. (default: auto)\n--bf16_full_eval [BF16_FULL_EVAL]\n Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may change. (default: False)\n--fp16_full_eval [FP16_FULL_EVAL]\n Whether to use full float16 evaluation instead of 32-bit (default: False)\n--tf32 TF32 Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API and it may\n change. (default: None)\n--local_rank LOCAL_RANK\n For distributed training: local_rank (default: -1)\n--ddp_backend {nccl,gloo,mpi,ccl}\n The backend to be used for distributed training (default: None)\n--tpu_num_cores TPU_NUM_CORES\n TPU: Number of TPU cores (automatically passed by launcher script) (default: None)\n--tpu_metrics_debug [TPU_METRICS_DEBUG]\n Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics (default: False)\n--debug DEBUG [DEBUG ...]\n Whether or not to enable debug mode. Current options: `underflow_overflow` (Detect underflow and overflow in activations\n and weights), `tpu_metrics_debug` (print debug metrics on TPU). (default: None)\n--dataloader_drop_last [DATALOADER_DROP_LAST]\n Drop the last incomplete batch if it is not divisible by the batch size. (default: False)\n--eval_steps EVAL_STEPS\n Run an evaluation every X steps. 
Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted\n as ratio of total training steps. (default: None)\n--dataloader_num_workers DATALOADER_NUM_WORKERS\n Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.\n (default: 0)\n--past_index PAST_INDEX\n If >=0, uses the corresponding part of the output as the past state for next step. (default: -1)\n--run_name RUN_NAME An optional descriptor for the run. Notably used for wandb logging. (default: None)\n--disable_tqdm DISABLE_TQDM\n Whether or not to disable the tqdm progress bars. (default: None)\n--remove_unused_columns [REMOVE_UNUSED_COLUMNS]\n Remove columns not required by the model when using an nlp.Dataset. (default: True)\n--no_remove_unused_columns\n Remove columns not required by the model when using an nlp.Dataset. (default: False)\n--label_names LABEL_NAMES [LABEL_NAMES ...]\n The list of keys in your dictionary of inputs that correspond to the labels. (default: None)\n--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]\n Whether or not to load the best model found during training at the end of training. When this option is enabled, the best\n checkpoint will always be saved. See `save_total_limit` for more. (default: False)\n--metric_for_best_model METRIC_FOR_BEST_MODEL\n The metric to use to compare two different models. (default: None)\n--greater_is_better GREATER_IS_BETTER\n Whether the `metric_for_best_model` should be maximized or not. (default: None)\n--ignore_data_skip [IGNORE_DATA_SKIP]\n When resuming training, whether or not to skip the first epochs and batches to get to the same training data. (default:\n False)\n--sharded_ddp SHARDED_DDP\n Whether or not to use sharded DDP training (in distributed training only). The base option should be `simple`, `zero_dp_2`\n or `zero_dp_3` and you can add CPU-offload to `zero_dp_2` or `zero_dp_3` like this: zero_dp_2 offload` or `zero_dp_3\n offload`. You can add auto-wrap to `zero_dp_2` or `zero_dp_3` with the same syntax: zero_dp_2 auto_wrap` or `zero_dp_3\n auto_wrap`. (default: )\n--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training only). The base option\n should be `full_shard`, `shard_grad_op` or `no_shard` and you can add CPU-offload to `full_shard` or `shard_grad_op` like\n this: full_shard offload` or `shard_grad_op offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the\n same syntax: full_shard auto_wrap` or `shard_grad_op auto_wrap`. (default: )\n--fsdp_min_num_params FSDP_MIN_NUM_PARAMS\n This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp`\n field is passed). (default: 0)\n--fsdp_config FSDP_CONFIG\n Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either afsdp json config file (e.g.,\n `fsdp_config.json`) or an already loaded json file as `dict`. (default: None)\n--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP\n This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`,\n `T5Block` .... (useful only when `fsdp` flag is passed). (default: None)\n--deepspeed DEEPSPEED\n Enable deepspeed and pass the path to deepspeed json config file (e.g. 
ds_config.json) or an already loaded json file as a\n dict (default: None)\n--label_smoothing_factor LABEL_SMOOTHING_FACTOR\n The label smoothing epsilon to apply (zero means no label smoothing). (default: 0.0)\n--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}\n The optimizer to use. (default: adamw_hf)\n--optim_args OPTIM_ARGS\n Optional arguments to supply to optimizer. (default: None)\n--adafactor [ADAFACTOR]\n Whether or not to replace AdamW by Adafactor. (default: False)\n--group_by_length [GROUP_BY_LENGTH]\n Whether or not to group samples of roughly the same length together when batching. (default: False)\n--length_column_name LENGTH_COLUMN_NAME\n Column name with precomputed lengths to use when grouping by length. (default: length)\n--report_to REPORT_TO [REPORT_TO ...]\n The list of integrations to report the results and logs to. (default: None)\n--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS\n When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`.\n (default: None)\n--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB\n When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. (default:\n None)\n--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS\n When using distributed training, the value of the flag `broadcast_buffers` passed to `DistributedDataParallel`. (default:\n None)\n--dataloader_pin_memory [DATALOADER_PIN_MEMORY]\n Whether or not to pin memory for DataLoader. (default: True)\n--no_dataloader_pin_memory\n Whether or not to pin memory for DataLoader. (default: False)\n--skip_memory_metrics [SKIP_MEMORY_METRICS]\n Whether or not to skip adding of memory profiler reports to metrics. (default: True)\n--no_skip_memory_metrics\n Whether or not to skip adding of memory profiler reports to metrics. (default: False)\n--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]\n Whether or not to use the legacy prediction_loop in the Trainer. (default: False)\n--push_to_hub [PUSH_TO_HUB]\n Whether or not to upload the trained model to the model hub after training. (default: False)\n--resume_from_checkpoint RESUME_FROM_CHECKPOINT\n The path to a folder with a valid checkpoint for your model. (default: None)\n--hub_model_id HUB_MODEL_ID\n The name of the repository to keep in sync with the local `output_dir`. (default: None)\n--hub_strategy {end,every_save,checkpoint,all_checkpoints}\n The hub strategy to use when `--push_to_hub` is activated. (default: every_save)\n--hub_token HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--hub_private_repo [HUB_PRIVATE_REPO]\n Whether the model repository is private or not. (default: False)\n--gradient_checkpointing [GRADIENT_CHECKPOINTING]\n If True, use gradient checkpointing to save memory at the expense of slower backward pass. (default: False)\n--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]\n Whether or not the inputs will be passed to the `compute_metrics` function. (default: False)\n--fp16_backend {auto,cuda_amp,apex,cpu_amp}\n Deprecated. Use half_precision_backend instead (default: auto)\n--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID\n The name of the repository to which push the `Trainer`. 
(default: None)\n--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION\n The name of the organization in with to which push the `Trainer`. (default: None)\n--push_to_hub_token PUSH_TO_HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--mp_parameters MP_PARAMETERS\n Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer (default: )\n--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]\n Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory\n was reached (default: False)\n--full_determinism [FULL_DETERMINISM]\n Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training. Important: this\n will negatively impact the performance, so only use it for debugging. (default: False)\n--torchdynamo TORCHDYNAMO\n This argument is deprecated, use `--torch_compile_backend` instead. (default: None)\n--ray_scope RAY_SCOPE\n The scope to use when doing hyperparameter search with Ray. By default, `\"last\"` will be used. Ray will then use the last\n checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray\n documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for\n more options. (default: last)\n--ddp_timeout DDP_TIMEOUT\n Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)\n--torch_compile [TORCH_COMPILE]\n If set to `True`, the model will be wrapped in `torch.compile`. (default: False)\n--torch_compile_backend TORCH_COMPILE_BACKEND\n Which backend to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--torch_compile_mode TORCH_COMPILE_MODE\n Which mode to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--xpu_backend {mpi,ccl,gloo}\n The backend to be used for distributed training on Intel XPU. (default: None)\n
"},{"location":"data_license.html","title":"Data License","text":"# Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nThis work is licensed under the Creative Commons Attribution 4.0 International License.\n\nTo view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.\n
"},{"location":"license.html","title":"License","text":"Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n 1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n 2. Grant of Copyright License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n 3. Grant of Patent License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n 4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n 5. Submission of Contributions. 
Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n 6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n 7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n 8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n 9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\n END OF TERMS AND CONDITIONS\n\n APPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. 
We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\n Copyright 2023 Emmanuel Noutahi\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n
"},{"location":"api/safe.html","title":"SAFE","text":""},{"location":"api/safe.html#safe-encoder-decoder","title":"SAFE Encoder-Decoder","text":""},{"location":"api/safe.html#safe.converter.SAFEConverter","title":"SAFEConverter
","text":"Molecule line notation conversion from SMILES to SAFE
A SAFE representation is a string-based representation of a molecule decomposition into fragment components, separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by itself, unless explicitly corrected to add missing hydrogens.
Slicing algorithms
By default SAFE strings are generated using BRICS
, however, the following alternatives are supported:
Hussain-Rea (hr)
RECAP (recap)
RDKit's MMPA (mmpa)
Any possible attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atoms corresponding to the bonds to break.
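As a minimal usage sketch (based on the constructor and the encoder/decoder methods shown in the source listing below), a SAFEConverter can be instantiated with a named slicer, a list of SMARTS, or a custom callable:

from safe.converter import SAFEConverter

import datamol as dm

converter = SAFEConverter(slicer="brics")  # or a list of SMARTS / a custom callable returning atom pairs

celecoxib = "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1"
safe_str = converter.encoder(celecoxib)    # SMILES -> SAFE
smiles_back = converter.decoder(safe_str)  # SAFE -> SMILES

assert dm.same_mol(dm.to_mol(celecoxib), dm.to_mol(smiles_back))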
Source code in safe/converter.py
class SAFEConverter:\n \"\"\"Molecule line notation conversion from SMILES to SAFE\n\n A SAFE representation is a string based representation of a molecule decomposition into fragment components,\n separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by themselves,\n unless explicitely correct to add missing hydrogens.\n\n !!! note \"Slicing algorithms\"\n\n By default SAFE strings are generated using `BRICS`, however, the following alternative are supported:\n\n * [Hussain-Rea (`hr`)](https://pubs.acs.org/doi/10.1021/ci900450m)\n * [RECAP (`recap`)](https://pubmed.ncbi.nlm.nih.gov/9611787/)\n * [RDKit's MMPA (`mmpa`)](https://www.rdkit.org/docs/source/rdkit.Chem.rdMMPA.html)\n * Any possible attachment points (`attach`)\n\n Furthermore, you can also provide your own slicing algorithm, which should return a pair of atoms\n corresponding to the bonds to break.\n\n \"\"\"\n\n SUPPORTED_SLICERS = [\"hr\", \"rotatable\", \"recap\", \"mmpa\", \"attach\", \"brics\"]\n __SLICE_SMARTS = {\n \"hr\": [\"[*]!@-[*]\"], # any non ring single bond\n \"recap\": [\n \"[$([C;!$(C([#7])[#7])](=!@[O]))]!@[$([#7;+0;!D1])]\",\n \"[$(C=!@O)]!@[$([O;+0])]\",\n \"[$([N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*]))]-!@[$([*])]\",\n \"[$(C(=!@O)([#7;+0;D2,D3])!@[#7;+0;D2,D3])]!@[$([#7;+0;D2,D3])]\",\n \"[$([O;+0](-!@[#6!$(C=O)])-!@[#6!$(C=O)])]-!@[$([#6!$(C=O)])]\",\n \"C=!@C\",\n \"[N;+1;D4]!@[#6]\",\n \"[$([n;+0])]-!@C\",\n \"[$([O]=[C]-@[N;+0])]-!@[$([C])]\",\n \"c-!@c\",\n \"[$([#7;+0;D2,D3])]-!@[$([S](=[O])=[O])]\",\n ],\n \"mmpa\": [\"[#6+0;!$(*=,#[!#6])]!@!=!#[*]\"], # classical mmpa slicing smarts\n \"attach\": [\"[*]!@[*]\"], # any potential attachment point, including hydrogens when explicit\n \"rotatable\": [\"[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]\"],\n }\n\n def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n ):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n\n @staticmethod\n def randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = 
np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n\n @classmethod\n def _find_branch_number(cls, inp: str):\n \"\"\"Find the branch number and ring closure in the SMILES representation using regexp\n\n Args:\n inp: input smiles\n \"\"\"\n inp = re.sub(r\"\\[.*?\\]\", \"\", inp) # noqa\n matching_groups = re.findall(r\"((?<=%)\\d{2})|((?<!%)\\d+)(?![^\\[]*\\])\", inp)\n # first match is for multiple connection as multiple digits\n # second match is for single connections requiring 2 digits\n # SMILES does not support triple digits\n branch_numbers = []\n for m in matching_groups:\n if m[0] == \"\":\n branch_numbers.extend(int(mm) for mm in m[1])\n elif m[1] == \"\":\n branch_numbers.append(int(m[0].replace(\"%\", \"\")))\n return branch_numbers\n\n def _ensure_valid(self, inp: str):\n \"\"\"Ensure that the input SAFE string is valid by fixing the missing attachment points\n\n Args:\n inp: input SAFE string\n\n \"\"\"\n missing_tokens = [inp]\n branch_numbers = self._find_branch_number(inp)\n # only use the set that have exactly 1 element\n # any branch number that is not pairwise should receive a dummy atom to complete the attachment point\n branch_numbers = Counter(branch_numbers)\n for i, (bnum, bcount) in enumerate(branch_numbers.items()):\n if bcount % 2 != 0:\n bnum_str = str(bnum) if bnum < 10 else f\"%{bnum}\"\n _tk = f\"[*:{i+1}]{bnum_str}\"\n if self.use_original_opener_for_attach:\n bnum_digit = bnum_str.strip(\"%\") # strip out the % sign\n _tk = f\"[*:{bnum_digit}]{bnum_str}\"\n missing_tokens.append(_tk)\n return \".\".join(missing_tokens)\n\n def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n ):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. 
Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n\n def _fragment(self, mol: dm.Mol, allow_empty: bool = False):\n \"\"\"\n Perform bond cutting in place for the input molecule, given the slicing algorithm\n\n Args:\n mol: input molecule to split\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n Raises:\n SAFEFragmentationError: if the slicing algorithm return empty bonds\n \"\"\"\n\n if self.slicer is None:\n matching_bonds = []\n\n elif callable(self.slicer):\n matching_bonds = self.slicer(mol)\n matching_bonds = list(matching_bonds)\n\n elif self.slicer == \"brics\":\n matching_bonds = BRICS.FindBRICSBonds(mol)\n matching_bonds = [brics_match[0] for brics_match in matching_bonds]\n\n else:\n matches = set()\n for smarts in self.slicer:\n matches |= {\n tuple(sorted(match)) for match in mol.GetSubstructMatches(smarts, uniquify=True)\n }\n matching_bonds = list(matches)\n\n if matching_bonds is None or len(matching_bonds) == 0 and not allow_empty:\n raise SAFEFragmentationError(\n \"Slicing algorithms did not return any bonds that can be cut !\"\n )\n return matching_bonds or []\n\n def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n ):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. 
Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = 
re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.__init__","title":"__init__(slicer='brics', require_hs=None, use_original_opener_for_attach=True, ignore_stereo=False)
","text":"Constructor for the SAFE converter
Parameters:
Name Type Description Defaultslicer
Optional[Union[str, List[str], Callable]]
slicer algorithm to use for encoding. Can either be one of the supported slicing algorithms (SUPPORTED_SLICERS) or a custom callable that returns the ids of the bonds that can be sliced.
'brics'
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added. The attach
slicer requires adding hydrogens.
None
use_original_opener_for_attach
bool
whether to use the original branch opener digit when adding back mapping number to attachment points, or use simple enumeration.
True
ignore_stereo
bool
whether to ignore stereochemistry. RDKit does not support some SAFE strings when stereochemistry is defined.
False
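As an illustration of the constructor options above, this is a small sketch (not part of the original documentation) showing the kinds of `slicer` values that are accepted:

```python
import safe as sf

# Default converter: BRICS slicing.
conv = sf.SAFEConverter()

# Built-in alternative slicer; `attach` implicitly sets require_hs=True.
conv_attach = sf.SAFEConverter(slicer="attach")

# Custom SMARTS patterns: here, any non-ring single bond (same pattern as `hr`).
conv_smarts = sf.SAFEConverter(slicer=["[*]!@-[*]"])

# A callable returning pairs of atom indices (the bonds to break) is also accepted.
```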
Source code in safe/converter.py
def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.decoder","title":"decoder(inp, as_mol=False, canonical=False, fix=True, remove_dummies=True, remove_added_hs=True)
","text":"Convert input SAFE representation to smiles
Parameters:
Name Type Description Defaultinp
str
input SAFE representation to decode as a valid molecule or smiles
requiredas_mol
bool
whether to return a molecule object or a smiles string
False
canonical
bool
whether to return a canonical SMILES string
False
fix
bool
whether to fix the SAFE representation to take into account non-connected attachment points
True
remove_dummies
bool
whether to remove dummy atoms from the SAFE representation. Note that remove_dummies is incompatible with
True
remove_added_hs
bool
whether to remove all the added hydrogen atoms after applying dummy removal for recovery
True
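A short usage sketch for `decoder` (the aspirin SMILES and the intermediate `encoder` call are only illustrative assumptions):

```python
import safe as sf

conv = sf.SAFEConverter()

# Build a SAFE string first; aspirin is an arbitrary, BRICS-sliceable example.
safe_str = conv.encoder("O=C(C)Oc1ccccc1C(=O)O")

# Decode back either as a SMILES string or as an RDKit molecule object.
smiles = conv.decoder(safe_str, canonical=True)
mol = conv.decoder(safe_str, as_mol=True)
```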
Source code in safe/converter.py
def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.encoder","title":"encoder(inp, canonical=True, randomize=False, seed=None, constraints=None, allow_empty=False, rdkit_safe=True)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Defaultinp
Union[str, Mol]
input smiles
requiredcanonical
bool
whether to return canonical smiles string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the SAFE string encoding. This is ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when randomization of the SAFE encoding is enabled. Randomization happens in two steps: 1. at the original SMILES representation, by randomizing the atom order; 2. at the SAFE conversion, by randomizing the fragment order
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction. Bond slicing will only happen outside of substructures matching one of these patterns.
None
allow_empty
bool
whether to allow the slicing algorithm to return empty bonds
False
rdkit_safe
bool
whether to apply RDKit-compatible digit standardization to the output SAFE string.
True
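The following sketch shows canonical versus randomized encoding, plus a substructure constraint (the aspirin input and the benzene SMARTS are illustrative assumptions, not part of the original docs):

```python
import datamol as dm
import safe as sf

conv = sf.SAFEConverter(slicer="brics")
smiles = "O=C(C)Oc1ccccc1C(=O)O"  # arbitrary example molecule (aspirin)

# Canonical SAFE string.
safe_canonical = conv.encoder(smiles, canonical=True)

# Randomized SAFE string; bonds whose two atoms both fall inside the benzene
# substructure are preserved (never sliced).
safe_random = conv.encoder(
    smiles,
    canonical=False,
    randomize=True,
    seed=42,
    constraints=[dm.from_smarts("c1ccccc1")],
)
```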
Source code in safe/converter.py
def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # 
needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.randomize","title":"randomize(mol, rng=None)
staticmethod
","text":"Randomize the position of the atoms in a mol.
Parameters:
Name Type Description Defaultmol
Mol
molecule to randomize
requiredrng
Optional[int]
optional seed to use
None
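A minimal sketch of `randomize` usage (the input SMILES is an arbitrary example). An integer seed is converted internally to a NumPy `Generator`, and an existing `Generator` can be passed directly:

```python
import datamol as dm
import safe as sf

mol = dm.to_mol("CCOc1ccccc1")  # arbitrary example molecule

# Shuffle the atom ordering reproducibly with an integer seed.
shuffled = sf.SAFEConverter.randomize(mol, rng=42)
```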
Source code in safe/converter.py
@staticmethod\ndef randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n
"},{"location":"api/safe.html#safe.converter.encode","title":"encode(inp, canonical=True, randomize=False, seed=None, slicer=None, require_hs=None, constraints=None, ignore_stereo=False)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Defaultinp
Union[str, Mol]
input smiles
requiredcanonical
bool
whether to return a canonical SAFE string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the SAFE string encoding. This is ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding.
None
slicer
Optional[Union[List[str], str, Callable]]
slicer algorithm to use for encoding. Defaults to \"brics\".
None
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added.
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction.
None
ignore_stereo
Optional[bool]
whether to ignore stereochemistry. RDKit does not support some SAFE strings when stereochemistry is defined.
False
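A hedged usage sketch for the module-level `encode` helper (the example molecule and chosen slicer are assumptions for illustration):

```python
import safe as sf

smiles = "O=C(C)Oc1ccccc1C(=O)O"  # arbitrary example molecule

# Default: canonical SAFE string with BRICS slicing.
safe_brics = sf.encode(smiles)

# Randomized SAFE string using the rotatable-bond slicer and a fixed seed.
safe_rot = sf.encode(smiles, canonical=False, randomize=True, seed=0, slicer="rotatable")
```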
Source code in safe/converter.py
def encode(\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n slicer: Optional[Union[List[str], str, Callable]] = None,\n require_hs: Optional[bool] = None,\n constraints: Optional[List[dm.Mol]] = None,\n ignore_stereo: Optional[bool] = False,\n):\n \"\"\"\n Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical SAFE string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n slicer: slicer algorithm to use for encoding. Defaults to \"brics\".\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n constraints: List of molecules or pattern to preserve during the SAFE construction.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n \"\"\"\n if slicer is None:\n slicer = \"brics\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs, ignore_stereo=ignore_stereo)\n try:\n encoded = safe_obj.encoder(\n inp,\n canonical=canonical,\n randomize=randomize,\n constraints=constraints,\n seed=seed,\n )\n except SAFEFragmentationError as e:\n raise e\n except Exception as e:\n raise SAFEEncodeError(f\"Failed to encode {inp} with {slicer}\") from e\n return encoded\n
"},{"location":"api/safe.html#safe.converter.decode","title":"decode(safe_str, as_mol=False, canonical=False, fix=True, remove_added_hs=True, remove_dummies=True, ignore_errors=False)
","text":"Convert input SAFE representation to smiles Args: safe_str: input SAFE representation to decode as a valid molecule or smiles as_mol: whether to return a molecule object or a smiles string canonical: whether to return a canonical smiles or a randomized smiles fix: whether to fix the SAFE representation to take into account non-connected attachment points remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string. remove_dummies: whether to remove dummy atoms from the SAFE representation ignore_errors: whether to ignore error and return None on decoding failure or raise an error
Source code in safe/converter.py
def decode(\n safe_str: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_added_hs: bool = True,\n remove_dummies: bool = True,\n ignore_errors: bool = False,\n):\n \"\"\"Convert input SAFE representation to smiles\n Args:\n safe_str: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical smiles or a randomized smiles\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string.\n remove_dummies: whether to remove dummy atoms from the SAFE representation\n ignore_errors: whether to ignore error and return None on decoding failure or raise an error\n\n \"\"\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter()\n try:\n decoded = safe_obj.decoder(\n safe_str,\n as_mol=as_mol,\n canonical=canonical,\n fix=fix,\n remove_dummies=remove_dummies,\n remove_added_hs=remove_added_hs,\n )\n\n except Exception as e:\n if ignore_errors:\n return None\n raise SAFEDecodeError(f\"Failed to decode {safe_str}\") from e\n return decoded\n
"},{"location":"api/safe.html#safe-design","title":"SAFE Design","text":""},{"location":"api/safe.html#safe.sample.SAFEDesign","title":"SAFEDesign
","text":"Molecular generation using SAFE pretrained model
Source code in safe/sample.py
class SAFEDesign:\n \"\"\"Molecular generation using SAFE pretrained model\"\"\"\n\n _DEFAULT_MAX_LENGTH = 1024 # default max length used during training\n _DEFAULT_MODEL_PATH = \"datamol-io/safe-gpt\"\n\n def __init__(\n self,\n model: Union[SAFEDoubleHeadsModel, str],\n tokenizer: Union[str, SAFETokenizer],\n generation_config: Optional[Union[str, GenerationConfig]] = None,\n safe_encoder: Optional[sf.SAFEConverter] = None,\n verbose: bool = True,\n ):\n \"\"\"SAFEDesign constructor\n\n !!! info\n Design methods in SAFE are not deterministic when it comes to the token sampling step.\n If a method accepts a `random_seed`, it's for the SAFE-related algorithms and not the\n sampling from the autoregressive model. To ensure you get a deterministic sampling,\n please set the seed at the `transformers` package level.\n\n ```python\n import safe as sf\n import transformers\n my_seed = 100\n designer = sf.SAFEDesign(...)\n\n transformers.set_seed(100) # use this before calling a design function\n designer.linker_generation(...)\n ```\n\n\n Args:\n model: input SAFEDoubleHeadsModel to use for generation\n tokenizer: input SAFETokenizer to use for generation\n generation_config: input GenerationConfig to use for generation\n safe_encoder: custom safe encoder to use\n verbose: whether to print out logging information during generation\n \"\"\"\n if isinstance(model, (str, os.PathLike)):\n model = SAFEDoubleHeadsModel.from_pretrained(model)\n\n if isinstance(tokenizer, (str, os.PathLike)):\n tokenizer = SAFETokenizer.load(tokenizer)\n\n model.eval()\n self.model = model\n self.tokenizer = tokenizer\n if isinstance(generation_config, os.PathLike):\n generation_config = GenerationConfig.from_pretrained(generation_config)\n if generation_config is None:\n generation_config = GenerationConfig.from_model_config(model.config)\n self.generation_config = generation_config\n for special_token_id in [\"bos_token_id\", \"eos_token_id\", \"pad_token_id\"]:\n if getattr(self.generation_config, special_token_id) is None:\n setattr(\n self.generation_config, special_token_id, getattr(tokenizer, special_token_id)\n )\n\n self.verbose = verbose\n self.safe_encoder = safe_encoder or sf.SAFEConverter()\n\n @classmethod\n def load_default(\n cls, verbose: bool = False, model_dir: Optional[str] = None, device: str = None\n ) -> \"SAFEDesign\":\n \"\"\"Load default SAFEGenerator model\n\n Args:\n verbose: whether to print out logging information during generation\n model_dir: Optional path to model folder to use instead of the default one.\n If provided the tokenizer should be in the model_dir named as `tokenizer.json`\n device: optional device where to move the model\n \"\"\"\n if model_dir is None or not model_dir:\n model_dir = cls._DEFAULT_MODEL_PATH\n model = SAFEDoubleHeadsModel.from_pretrained(model_dir)\n tokenizer = SAFETokenizer.from_pretrained(model_dir)\n gen_config = GenerationConfig.from_pretrained(model_dir)\n if device is not None:\n model = model.to(device)\n return cls(model=model, tokenizer=tokenizer, generation_config=gen_config, verbose=verbose)\n\n def linker_generation(\n self,\n *groups: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform linker generation using the pretrained SAFE model.\n Linker generation is really just scaffold morphing underlying.\n\n 
Args:\n groups: list of fragments to link together, they are joined in the order provided\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n model_only: whether to use the model only ability and nothing more.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n side_chains = list(groups)\n\n if len(side_chains) != 2:\n raise ValueError(\n \"Linker generation only works when providing two groups as side chains\"\n )\n\n return self._fragment_linking(\n side_chains=side_chains,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=True,\n model_only=model_only,\n **kwargs,\n )\n\n def scaffold_morphing(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n return self._fragment_linking(\n side_chains=side_chains,\n mol=mol,\n core=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=False,\n **kwargs,\n )\n\n def _fragment_linking(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = False,\n random_seed: Optional[int] = None,\n is_linking: Optional[bool] = False,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold 
morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n is_linking: whether it's a linking task or not.\n For linking tasks, we use a different custom strategy of completing up to the attachment signal\n model_only: whether to use the model only ability and nothing more. Only relevant when doing linker generation\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n if side_chains is None:\n if mol is None and core is None:\n raise ValueError(\n \"Either side_chains OR mol+core should be provided for scaffold morphing\"\n )\n side_chains = sf.trainer.utils.compute_side_chains(mol, core)\n side_chains = (\n [dm.to_mol(x) for x in side_chains]\n if isinstance(side_chains, list)\n else [dm.to_mol(side_chains)]\n )\n\n side_chains = \".\".join([dm.to_smiles(x) for x in side_chains])\n\n if \"*\" not in side_chains and self.verbose:\n logger.warning(\n f\"Side chain {side_chains} does not contain any dummy atoms, this might not be what you want\"\n )\n\n rng = random.Random(random_seed)\n new_seed = rng.randint(1, 1000)\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n with dm.without_rdkit_log():\n context_mng = (\n sf.utils.attr_as(self.safe_encoder, \"slicer\", None)\n if do_not_fragment_further\n else suppress()\n )\n old_slicer = getattr(self.safe_encoder, \"slicer\", None)\n with context_mng:\n try:\n encoded_fragment = self.safe_encoder.encoder(\n side_chains,\n canonical=False,\n randomize=False,\n constraints=None,\n allow_empty=True,\n seed=new_seed,\n )\n\n except Exception as e:\n if self.verbose:\n logger.error(e)\n raise sf.SAFEEncodeError(f\"Failed to encode {side_chains}\") from e\n finally:\n if old_slicer is not None:\n self.safe_encoder.slicer = old_slicer\n\n fragments = encoded_fragment.split(\".\")\n missing_closure = Counter(self.safe_encoder._find_branch_number(encoded_fragment))\n missing_closure = [f\"{str(x)}\" for x in missing_closure if missing_closure[x] % 2 == 1]\n\n closure_pos = [\n m.start() for x in missing_closure for m in re.finditer(x, encoded_fragment)\n ]\n fragment_pos = [m.start() for m in re.finditer(r\"\\.\", encoded_fragment)]\n min_pos = 0\n while fragment_pos[min_pos] < closure_pos[0] and min_pos < len(fragment_pos):\n min_pos += 1\n min_pos += 1\n max_pos = len(fragment_pos)\n while fragment_pos[max_pos - 1] > closure_pos[-1] and max_pos > 0:\n max_pos -= 1\n\n split_index = rng.randint(min_pos, max_pos)\n prefix, suffixes 
= \".\".join(fragments[:split_index]), \".\".join(fragments[split_index:])\n\n missing_prefix_closure = Counter(self.safe_encoder._find_branch_number(prefix))\n missing_suffix_closure = Counter(self.safe_encoder._find_branch_number(suffixes))\n\n missing_prefix_closure = (\n [\".\"] + [x for x in missing_closure if int(x) not in missing_prefix_closure] + [\".\"]\n )\n missing_suffix_closure = (\n [\".\"] + [x for x in missing_closure if int(x) not in missing_suffix_closure] + [\".\"]\n )\n\n constraints_ids = []\n for permutation in itertools.permutations(missing_closure + [\".\"]):\n constraints_ids.append(\n self.tokenizer.encode(list(permutation), add_special_tokens=False)\n )\n\n # prefix_constraints_ids = self.tokenizer.encode(missing_prefix_closure, add_special_tokens=False)\n # suffix_constraints_ids = self.tokenizer.encode(missing_suffix_closure, add_special_tokens=False)\n\n # suffix_ids = self.tokenizer.encode([suffixes+self.tokenizer.tokenizer.eos_token], add_special_tokens=False)\n # prefix_ids = self.tokenizer.encode([prefix], add_special_tokens=False)\n\n prefix_kwargs = kwargs.copy()\n suffix_kwargs = prefix_kwargs.copy()\n\n if is_linking and model_only:\n for _kwargs in [prefix_kwargs, suffix_kwargs]:\n _kwargs.setdefault(\"how\", \"beam\")\n _kwargs.setdefault(\"num_beams\", n_samples_per_trial)\n _kwargs.setdefault(\"do_sample\", False)\n\n prefix_kwargs[\"constraints\"] = []\n suffix_kwargs[\"constraints\"] = []\n # prefix_kwargs[\"constraints\"] = [PhrasalConstraint(tkl) for tkl in suffix_constraints_ids]\n # suffix_kwargs[\"constraints\"] = [PhrasalConstraint(tkl) for tkl in prefix_constraints_ids]\n\n # we first generate a part of the fragment with for unique constraint that it should contain\n # the closure required to join something to the suffix.\n prefix_kwargs[\"constraints\"] += [\n DisjunctiveConstraint(tkl) for tkl in constraints_ids\n ]\n suffix_kwargs[\"constraints\"] += [\n DisjunctiveConstraint(tkl) for tkl in constraints_ids\n ]\n\n prefix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=prefix, **prefix_kwargs\n )\n suffix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=suffixes, **suffix_kwargs\n )\n\n prefix_sequences = [\n self._find_fragment_cut(x, prefix, missing_prefix_closure[1])\n for x in prefix_sequences\n ]\n suffix_sequences = [\n self._find_fragment_cut(x, suffixes, missing_suffix_closure[1])\n for x in suffix_sequences\n ]\n\n linkers = [x for x in set(prefix_sequences + suffix_sequences) if x]\n sequences = [f\"{prefix}.{linker}.{suffixes}\" for linker in linkers]\n sequences += self._decode_safe(sequences, canonical=True, remove_invalid=sanitize)\n\n else:\n mol_linker_slicer = sf.utils.MolSlicer(\n shortest_linker=(not is_linking), require_ring_system=(not is_linking)\n )\n prefix_smiles = sf.decode(prefix, remove_dummies=False, as_mol=False)\n suffix_smiles = sf.decode(suffixes, remove_dummies=False, as_mol=False)\n\n prefix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=prefix + \".\", **prefix_kwargs\n )\n suffix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=suffixes + \".\", **suffix_kwargs\n )\n\n prefix_sequences = self._decode_safe(\n prefix_sequences, canonical=True, remove_invalid=True\n )\n suffix_sequences = self._decode_safe(\n suffix_sequences, canonical=True, remove_invalid=True\n )\n sequences = self.__mix_sequences(\n prefix_sequences,\n suffix_sequences,\n prefix_smiles,\n suffix_smiles,\n n_samples_per_trial,\n 
mol_linker_slicer,\n )\n\n total_sequences.extend(sequences)\n\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(\n total_sequences, side_chains\n )\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def motif_extension(\n self,\n motif: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform motif extension using the pretrained SAFE model.\n Motif extension is really just scaffold decoration underlying.\n\n Args:\n motif: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n return self.scaffold_decoration(\n motif,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n add_dot=True,\n **kwargs,\n )\n\n def super_structure(\n self,\n core: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n attachment_point_depth: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform super structure generation using the pretrained SAFE model.\n\n To generate super-structure, we basically just create various attachment points to the input core,\n then perform scaffold decoration.\n\n Args:\n core: input substructure to use. 
We aim to generate super structures of this molecule\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of different attachment points to consider\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n attachment_point_depth: depth of opening the attachment points.\n Increasing this, means you increase the number of substitution point to consider.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n core = dm.to_mol(core)\n cores = sf.utils.list_individual_attach_points(core, depth=attachment_point_depth)\n # get the fully open mol, everytime too.\n cores.append(dm.to_smiles(dm.reactions.open_attach_points(core)))\n cores = list(set(cores))\n rng = random.Random(random_seed)\n rng.shuffle(cores)\n # now also get the single openining of an attachment point\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n core = cores[_ % len(cores)]\n old_verbose = self.verbose\n try:\n with sf.utils.attr_as(self, \"verbose\", False):\n out = self._completion(\n fragment=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=1,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n **kwargs,\n )\n total_sequences.extend(out)\n except Exception as e:\n if old_verbose:\n logger.error(e)\n\n finally:\n self.verbose = old_verbose\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def scaffold_decoration(\n self,\n scaffold: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = True,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = True,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold decoration using the pretrained SAFE model\n\n For scaffold decoration, we basically starts with a prefix with the attachment point.\n We first convert the prefix into valid safe string.\n\n Args:\n scaffold: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check if the scaffold is still present\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n total_sequences = self._completion(\n fragment=scaffold,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n add_dot=add_dot,\n **kwargs,\n )\n # if we require sanitization\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(total_sequences, scaffold)\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def 
de_novo_generation(\n self,\n n_samples_per_trial: int = 10,\n sanitize: bool = False,\n n_trials: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform de novo generation using the pretrained SAFE model.\n\n De novo generation is equivalent to not having any prefix.\n\n Args:\n n_samples_per_trial: number of new molecules to generate\n sanitize: whether to perform sanitization, aka, perform control to ensure what is asked is what is returned\n n_trials: number of randomization to perform\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n sequences = self._generate(n_samples=n_samples_per_trial, **kwargs)\n total_sequences.extend(sequences)\n total_sequences = self._decode_safe(\n total_sequences, canonical=True, remove_invalid=sanitize\n )\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def _find_fragment_cut(self, fragment: str, prefix_constraint: str, branching_id: str):\n \"\"\"\n Perform a cut on the input fragment in such a way that it could be joined with another fragments sharing the same\n branching id.\n\n Args:\n fragment: fragment to cut\n prefix_constraint: prefix constraint to use\n branching_id: branching id to use\n \"\"\"\n prefix_constraint = prefix_constraint.rstrip(\".\") + \".\"\n fragment = (\n fragment.replace(prefix_constraint, \"\", 1)\n if fragment.startswith(prefix_constraint)\n else fragment\n )\n fragments = fragment.split(\".\")\n i = 0\n for x in fragments:\n if branching_id in x:\n i += 1\n break\n return \".\".join(fragments[:i])\n\n def __mix_sequences(\n self,\n prefix_sequences: List[str],\n suffix_sequences: List[str],\n prefix: str,\n suffix: str,\n n_samples: int,\n mol_linker_slicer,\n ):\n \"\"\"Use generated prefix and suffix sequences to form new molecules\n that will be the merging of both. 
This is the two step scaffold morphing and linker generation scheme\n Args:\n prefix_sequences: list of prefix sequences\n suffix_sequences: list of suffix sequences\n prefix: decoded smiles of the prefix\n suffix: decoded smiles of the suffix\n n_samples: number of samples to generate\n \"\"\"\n prefix_linkers = []\n suffix_linkers = []\n prefix_query = dm.from_smarts(prefix)\n suffix_query = dm.from_smarts(suffix)\n\n for x in prefix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, prefix_query)\n prefix_linkers.append(out[1])\n for x in suffix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, suffix_query)\n suffix_linkers.append(out[1])\n n_linked = 0\n linked = []\n linkers = prefix_linkers + suffix_linkers\n linkers = [x for x in linkers if x is not None]\n for n_linked, linker in enumerate(linkers):\n linked.extend(mol_linker_slicer.link_fragments(linker, prefix, suffix))\n if n_linked > n_samples:\n break\n linked = [x for x in linked if x]\n return linked[:n_samples]\n\n def _decode_safe(\n self, sequences: List[str], canonical: bool = True, remove_invalid: bool = False\n ):\n \"\"\"Decode a safe sequence into a molecule\n\n Args:\n sequence: safe sequence to decode\n canonical: whether to return canonical sequence\n remove_invalid: whether to remove invalid safe strings or keep them\n \"\"\"\n\n def _decode_fn(x):\n return sf.decode(\n x,\n as_mol=False,\n fix=True,\n remove_added_hs=True,\n canonical=canonical,\n ignore_errors=True,\n remove_dummies=True,\n )\n\n if len(sequences) > 100:\n safe_strings = dm.parallelized(_decode_fn, sequences, n_jobs=-1)\n else:\n safe_strings = [_decode_fn(x) for x in sequences]\n if remove_invalid:\n safe_strings = [x for x in safe_strings if x is not None]\n\n return safe_strings\n\n def _completion(\n self,\n fragment: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = False,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = False,\n is_safe: Optional[bool] = False,\n **kwargs,\n ):\n \"\"\"Perform sentence completion using a prefix fragment\n\n Args:\n fragment: fragment (with attachment points)\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n is_safe: whether the smiles is already encoded as a safe string\n add_dot: whether to add a dot at the end of the fragments to signal to the model that we want to generate a distinct fragment.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... 
for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n # Step 1: we conver the fragment into the relevant safe string format\n # we use the provided safe encoder with the slicer that was expected\n\n rng = random.Random(random_seed)\n new_seed = rng.randint(1, 1000)\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n if is_safe:\n encoded_fragment = fragment\n else:\n with dm.without_rdkit_log():\n context_mng = (\n sf.utils.attr_as(self.safe_encoder, \"slicer\", None)\n if do_not_fragment_further\n else suppress()\n )\n old_slicer = getattr(self.safe_encoder, \"slicer\", None)\n with context_mng:\n try:\n encoded_fragment = self.safe_encoder.encoder(\n fragment,\n canonical=False,\n randomize=True,\n constraints=None,\n allow_empty=True,\n seed=new_seed,\n )\n\n except Exception as e:\n if self.verbose:\n logger.error(e)\n raise sf.SAFEEncodeError(f\"Failed to encode {fragment}\") from e\n finally:\n if old_slicer is not None:\n self.safe_encoder.slicer = old_slicer\n\n if add_dot and encoded_fragment.count(\"(\") == encoded_fragment.count(\")\"):\n encoded_fragment = encoded_fragment.rstrip(\".\") + \".\"\n\n sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=encoded_fragment, **kwargs\n )\n\n sequences = self._decode_safe(sequences, canonical=True, remove_invalid=sanitize)\n total_sequences.extend(sequences)\n\n return total_sequences\n\n def _generate(\n self,\n n_samples: int = 1,\n safe_prefix: Optional[str] = None,\n max_length: Optional[int] = 100,\n how: Optional[str] = \"random\",\n num_beams: Optional[int] = None,\n num_beam_groups: Optional[int] = None,\n do_sample: Optional[bool] = None,\n **kwargs,\n ):\n \"\"\"Sample a new sequence using the underlying hugging face model.\n This emulates the izanagi sampling models, if you wish to retain the hugging face generation\n behaviour, either call the hugging face functions directly or overwrite this function\n\n ??? note \"Generation Parameters\"\n From the hugging face documentation:\n\n * `greedy decoding` if how=\"greedy\" and num_beams=1 and do_sample=False.\n * `multinomial sampling` if num_beams=1 and do_sample=True.\n * `beam-search decoding` if how=\"beam\" and num_beams>1 and do_sample=False.\n * `beam-search multinomial` sampling by calling if beam=True, num_beams>1 and do_sample=True or how=\"random\" and num_beams>1\n * `diverse beam-search decoding` if num_beams>1 and num_beam_groups>1\n\n It's also possible to ignore the 'how' shortcut and directly call the underlying generation methods using the proper arguments.\n Learn more here: https://huggingface.co/docs/transformers/v4.32.0/en/main_classes/text_generation#transformers.GenerationConfig\n Under the hood, the following will be applied depending on the arguments:\n\n * greedy decoding by calling greedy_search() if num_beams=1 and do_sample=False\n * contrastive search by calling contrastive_search() if penalty_alpha>0. 
and top_k>1\n * multinomial sampling by calling sample() if num_beams=1 and do_sample=True\n * beam-search decoding by calling beam_search() if num_beams>1 and do_sample=False\n * beam-search multinomial sampling by calling beam_sample() if num_beams>1 and do_sample=True\n * diverse beam-search decoding by calling group_beam_search(), if num_beams>1 and num_beam_groups>1\n * constrained beam-search decoding by calling constrained_beam_search(), if constraints!=None or force_words_ids!=None\n * assisted decoding by calling assisted_decoding(), if assistant_model is passed to .generate()\n\n Args:\n n_samples: number of sequences to return\n safe_prefix: Prefix to use in sampling, should correspond to a safe fragment\n max_length : maximum length of sampled sequence\n how: which sampling method to use: \"beam\", \"greedy\" or \"random\". Can be used to control other parameters by setting defaults\n num_beams: number of beams for beam search. 1 means no beam search, unless beam is specified then max(n_samples, num_beams) is used\n num_beam_groups: number of beam groups for diverse beam search\n do_sample: whether to perform random sampling or not, equivalent to setting random to True\n kwargs: any additional keyword argument to pass to the underlying sampling `generate` from hugging face transformer\n\n Returns:\n samples: list of sampled molecules, including failed validation\n\n \"\"\"\n pretrained_tk = self.tokenizer.get_pretrained()\n if getattr(pretrained_tk, \"model_max_length\") is None:\n setattr(\n pretrained_tk,\n \"model_max_length\",\n self._DEFAULT_MAX_LENGTH, # this was the defaul\n )\n\n input_ids = safe_prefix\n if isinstance(safe_prefix, str):\n # EN: should we address the special token issues\n input_ids = pretrained_tk(\n safe_prefix,\n return_tensors=\"pt\",\n )\n\n num_beams = num_beams or None\n do_sample = do_sample or False\n\n if how == \"random\":\n do_sample = True\n\n elif how is not None and \"beam\" in how:\n num_beams = max((num_beams or 0), n_samples)\n\n is_greedy = how == \"greedy\" or (num_beams in [0, 1, None]) and do_sample is False\n\n kwargs[\"do_sample\"] = do_sample\n if num_beams is not None:\n kwargs[\"num_beams\"] = num_beams\n if num_beam_groups is not None:\n kwargs[\"num_beam_groups\"] = num_beam_groups\n kwargs[\"output_scores\"] = True\n kwargs[\"return_dict_in_generate\"] = True\n kwargs[\"num_return_sequences\"] = n_samples\n kwargs[\"max_length\"] = max_length\n kwargs.setdefault(\"early_stopping\", True)\n # EN we don't do anything with the score that the model might return on generate ...\n if not isinstance(input_ids, Mapping):\n input_ids = {\"inputs\": None}\n else:\n # EN: we remove the EOS token added before running the prediction\n # because the model output nonsense when we keep it.\n for k in input_ids:\n input_ids[k] = input_ids[k][:, :-1]\n\n for k, v in input_ids.items():\n if torch.is_tensor(v):\n input_ids[k] = v.to(self.model.device)\n\n # we remove the token_type_ids to support more model type than just GPT2\n input_ids.pop(\"token_type_ids\", None)\n\n if is_greedy:\n kwargs[\"num_return_sequences\"] = 1\n if num_beams is not None and num_beams > 1:\n raise ValueError(\"Cannot set num_beams|num_beam_groups > 1 for greedy\")\n # under greedy decoding there can only be a single solution\n # we just duplicate the solution several time for efficiency\n outputs = self.model.generate(\n **input_ids,\n generation_config=self.generation_config,\n **kwargs,\n )\n sequences = [\n pretrained_tk.decode(outputs.sequences.squeeze(), 
skip_special_tokens=True)\n ] * n_samples\n\n else:\n outputs = self.model.generate(\n **input_ids,\n generation_config=self.generation_config,\n **kwargs,\n )\n sequences = pretrained_tk.batch_decode(outputs.sequences, skip_special_tokens=True)\n return sequences\n
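As a usage-level illustration of the `how` shortcut described in `_generate` above, the sketch below shows how the same design call switches between multinomial sampling and beam search. It is a minimal sketch: it assumes the default pretrained model (via `load_default`, documented later on this page) and simply forwards `how`, `do_sample` and `num_beams` to the underlying Hugging Face `generate` call.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# multinomial sampling: num_beams=1, do_sample=True
mols_random = designer.de_novo_generation(n_samples_per_trial=5, how="random")

# beam-search multinomial sampling: num_beams is raised to at least n_samples_per_trial
mols_beam = designer.de_novo_generation(
    n_samples_per_trial=5, how="beam", do_sample=True, num_beams=10
)
```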
"},{"location":"api/safe.html#safe.sample.SAFEDesign.__init__","title":"__init__(model, tokenizer, generation_config=None, safe_encoder=None, verbose=True)
","text":"SAFEDesign constructor
Info
Design methods in SAFE are not deterministic when it comes to the token sampling step. If a method accepts a random_seed, it applies to the SAFE-related algorithms and not to the sampling from the autoregressive model. To ensure you get deterministic sampling, set the seed at the transformers package level.
import safe as sf\nimport transformers\nmy_seed = 100\ndesigner = sf.SAFEDesign(...)\n\ntransformers.set_seed(100) # use this before calling a design function\ndesigner.linker_generation(...)\n
Parameters:
- model (Union[SAFEDoubleHeadsModel, str]): input SAFEDoubleHeadsModel to use for generation. Required.
- tokenizer (Union[str, SAFETokenizer]): input SAFETokenizer to use for generation. Required.
- generation_config (Optional[Union[str, GenerationConfig]]): input GenerationConfig to use for generation. Default: None.
- safe_encoder (Optional[SAFEConverter]): custom SAFE encoder to use. Default: None.
- verbose (bool): whether to print out logging information during generation. Default: True.
Source code in safe/sample.py
def __init__(\n self,\n model: Union[SAFEDoubleHeadsModel, str],\n tokenizer: Union[str, SAFETokenizer],\n generation_config: Optional[Union[str, GenerationConfig]] = None,\n safe_encoder: Optional[sf.SAFEConverter] = None,\n verbose: bool = True,\n):\n \"\"\"SAFEDesign constructor\n\n !!! info\n Design methods in SAFE are not deterministic when it comes to the token sampling step.\n If a method accepts a `random_seed`, it's for the SAFE-related algorithms and not the\n sampling from the autoregressive model. To ensure you get a deterministic sampling,\n please set the seed at the `transformers` package level.\n\n ```python\n import safe as sf\n import transformers\n my_seed = 100\n designer = sf.SAFEDesign(...)\n\n transformers.set_seed(100) # use this before calling a design function\n designer.linker_generation(...)\n ```\n\n\n Args:\n model: input SAFEDoubleHeadsModel to use for generation\n tokenizer: input SAFETokenizer to use for generation\n generation_config: input GenerationConfig to use for generation\n safe_encoder: custom safe encoder to use\n verbose: whether to print out logging information during generation\n \"\"\"\n if isinstance(model, (str, os.PathLike)):\n model = SAFEDoubleHeadsModel.from_pretrained(model)\n\n if isinstance(tokenizer, (str, os.PathLike)):\n tokenizer = SAFETokenizer.load(tokenizer)\n\n model.eval()\n self.model = model\n self.tokenizer = tokenizer\n if isinstance(generation_config, os.PathLike):\n generation_config = GenerationConfig.from_pretrained(generation_config)\n if generation_config is None:\n generation_config = GenerationConfig.from_model_config(model.config)\n self.generation_config = generation_config\n for special_token_id in [\"bos_token_id\", \"eos_token_id\", \"pad_token_id\"]:\n if getattr(self.generation_config, special_token_id) is None:\n setattr(\n self.generation_config, special_token_id, getattr(tokenizer, special_token_id)\n )\n\n self.verbose = verbose\n self.safe_encoder = safe_encoder or sf.SAFEConverter()\n
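A minimal construction sketch, assuming locally saved artifacts; the paths below are placeholders, and both `model` and `tokenizer` may instead be passed as already-instantiated objects, as described in the constructor arguments above.
```python
import safe as sf
from transformers import GenerationConfig

designer = sf.SAFEDesign(
    model="./my_safe_model",                     # placeholder: loaded with SAFEDoubleHeadsModel.from_pretrained
    tokenizer="./my_safe_model/tokenizer.json",  # placeholder: loaded with SAFETokenizer.load
    generation_config=GenerationConfig(max_length=100),  # optional; derived from the model config if omitted
    verbose=True,
)
```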
"},{"location":"api/safe.html#safe.sample.SAFEDesign.__mix_sequences","title":"__mix_sequences(prefix_sequences, suffix_sequences, prefix, suffix, n_samples, mol_linker_slicer)
","text":"Use generated prefix and suffix sequences to form new molecules that will be the merging of both. This is the two step scaffold morphing and linker generation scheme Args: prefix_sequences: list of prefix sequences suffix_sequences: list of suffix sequences prefix: decoded smiles of the prefix suffix: decoded smiles of the suffix n_samples: number of samples to generate
Source code in safe/sample.py
def __mix_sequences(\n self,\n prefix_sequences: List[str],\n suffix_sequences: List[str],\n prefix: str,\n suffix: str,\n n_samples: int,\n mol_linker_slicer,\n):\n \"\"\"Use generated prefix and suffix sequences to form new molecules\n that will be the merging of both. This is the two step scaffold morphing and linker generation scheme\n Args:\n prefix_sequences: list of prefix sequences\n suffix_sequences: list of suffix sequences\n prefix: decoded smiles of the prefix\n suffix: decoded smiles of the suffix\n n_samples: number of samples to generate\n \"\"\"\n prefix_linkers = []\n suffix_linkers = []\n prefix_query = dm.from_smarts(prefix)\n suffix_query = dm.from_smarts(suffix)\n\n for x in prefix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, prefix_query)\n prefix_linkers.append(out[1])\n for x in suffix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, suffix_query)\n suffix_linkers.append(out[1])\n n_linked = 0\n linked = []\n linkers = prefix_linkers + suffix_linkers\n linkers = [x for x in linkers if x is not None]\n for n_linked, linker in enumerate(linkers):\n linked.extend(mol_linker_slicer.link_fragments(linker, prefix, suffix))\n if n_linked > n_samples:\n break\n linked = [x for x in linked if x]\n return linked[:n_samples]\n
"},{"location":"api/safe.html#safe.sample.SAFEDesign.de_novo_generation","title":"de_novo_generation(n_samples_per_trial=10, sanitize=False, n_trials=None, **kwargs)
","text":"Perform de novo generation using the pretrained SAFE model.
De novo generation is equivalent to not having any prefix.
Parameters:
- n_samples_per_trial (int): number of new molecules to generate. Default: 10.
- sanitize (bool): whether to perform sanitization, i.e. check that what is returned matches what was asked for. Default: False.
- n_trials (Optional[int]): number of randomizations to perform. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def de_novo_generation(\n self,\n n_samples_per_trial: int = 10,\n sanitize: bool = False,\n n_trials: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform de novo generation using the pretrained SAFE model.\n\n De novo generation is equivalent to not having any prefix.\n\n Args:\n n_samples_per_trial: number of new molecules to generate\n sanitize: whether to perform sanitization, aka, perform control to ensure what is asked is what is returned\n n_trials: number of randomization to perform\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n sequences = self._generate(n_samples=n_samples_per_trial, **kwargs)\n total_sequences.extend(sequences)\n total_sequences = self._decode_safe(\n total_sequences, canonical=True, remove_invalid=sanitize\n )\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
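A minimal de novo generation sketch, assuming the default pretrained designer; with `sanitize=True`, only validly decoded molecules are kept.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

generated = designer.de_novo_generation(
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
)
print(len(generated), generated[:3])  # list of SMILES strings
```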
"},{"location":"api/safe.html#safe.sample.SAFEDesign.linker_generation","title":"linker_generation(*groups, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, model_only=False, **kwargs)
","text":"Perform linker generation using the pretrained SAFE model. Linker generation is really just scaffold morphing underlying.
Parameters:
- groups (Union[str, Mol]): list of fragments to link together; they are joined in the order provided. Default: ().
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- model_only (Optional[bool]): whether to use the model's own ability only and nothing more. Default: False.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def linker_generation(\n self,\n *groups: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform linker generation using the pretrained SAFE model.\n Linker generation is really just scaffold morphing underlying.\n\n Args:\n groups: list of fragments to link together, they are joined in the order provided\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n model_only: whether to use the model only ability and nothing more.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n side_chains = list(groups)\n\n if len(side_chains) != 2:\n raise ValueError(\n \"Linker generation only works when providing two groups as side chains\"\n )\n\n return self._fragment_linking(\n side_chains=side_chains,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=True,\n model_only=model_only,\n **kwargs,\n )\n
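A minimal linker generation sketch, assuming the default pretrained designer; the two side-chain SMILES below are illustrative placeholders with open attachment points (`[*]`), and exactly two groups must be provided.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

linked = designer.linker_generation(
    "[*]c1ccccc1",   # illustrative fragment 1
    "[*]C1CCNCC1",   # illustrative fragment 2
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
)
```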
"},{"location":"api/safe.html#safe.sample.SAFEDesign.load_default","title":"load_default(verbose=False, model_dir=None, device=None)
classmethod
","text":"Load default SAFEGenerator model
Parameters:
- verbose (bool): whether to print out logging information during generation. Default: False.
- model_dir (Optional[str]): optional path to a model folder to use instead of the default one. If provided, the tokenizer should be located in model_dir and named tokenizer.json. Default: None.
- device (str): optional device where to move the model. Default: None.
Source code in safe/sample.py
@classmethod\ndef load_default(\n cls, verbose: bool = False, model_dir: Optional[str] = None, device: str = None\n) -> \"SAFEDesign\":\n \"\"\"Load default SAFEGenerator model\n\n Args:\n verbose: whether to print out logging information during generation\n model_dir: Optional path to model folder to use instead of the default one.\n If provided the tokenizer should be in the model_dir named as `tokenizer.json`\n device: optional device where to move the model\n \"\"\"\n if model_dir is None or not model_dir:\n model_dir = cls._DEFAULT_MODEL_PATH\n model = SAFEDoubleHeadsModel.from_pretrained(model_dir)\n tokenizer = SAFETokenizer.from_pretrained(model_dir)\n gen_config = GenerationConfig.from_pretrained(model_dir)\n if device is not None:\n model = model.to(device)\n return cls(model=model, tokenizer=tokenizer, generation_config=gen_config, verbose=verbose)\n
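A minimal loading sketch; `device` is optional, and `model_dir` can point at your own artifacts (with the tokenizer saved as `tokenizer.json` in that folder). The custom path below is a placeholder.
```python
import safe as sf

# default pretrained artifacts
designer = sf.SAFEDesign.load_default(verbose=True, device="cpu")

# or, with a custom folder (placeholder path)
custom_designer = sf.SAFEDesign.load_default(model_dir="./my_safe_model", device="cuda")
```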
"},{"location":"api/safe.html#safe.sample.SAFEDesign.motif_extension","title":"motif_extension(motif, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
","text":"Perform motif extension using the pretrained SAFE model. Motif extension is really just scaffold decoration underlying.
Parameters:
- motif (Union[str, Mol]): scaffold (with attachment points) to decorate. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules and check that the motif is still present. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def motif_extension(\n self,\n motif: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform motif extension using the pretrained SAFE model.\n Motif extension is really just scaffold decoration underlying.\n\n Args:\n motif: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n return self.scaffold_decoration(\n motif,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n add_dot=True,\n **kwargs,\n )\n
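A minimal motif extension sketch, assuming the default pretrained designer; the motif SMILES is an illustrative placeholder with an open attachment point.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

extended = designer.motif_extension(
    "[*]c1ccc(CN)cc1",   # illustrative motif
    n_samples_per_trial=10,
    sanitize=True,
)
```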
"},{"location":"api/safe.html#safe.sample.SAFEDesign.scaffold_decoration","title":"scaffold_decoration(scaffold, n_samples_per_trial=10, n_trials=1, do_not_fragment_further=True, sanitize=False, random_seed=None, add_dot=True, **kwargs)
","text":"Perform scaffold decoration using the pretrained SAFE model
For scaffold decoration, we start from a prefix that carries the attachment point(s) and first convert that prefix into a valid SAFE string.
Parameters:
- scaffold (Union[str, Mol]): scaffold (with attachment points) to decorate. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules and check that the scaffold is still present. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def scaffold_decoration(\n self,\n scaffold: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = True,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = True,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform scaffold decoration using the pretrained SAFE model\n\n For scaffold decoration, we basically starts with a prefix with the attachment point.\n We first convert the prefix into valid safe string.\n\n Args:\n scaffold: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check if the scaffold is still present\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n total_sequences = self._completion(\n fragment=scaffold,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n add_dot=add_dot,\n **kwargs,\n )\n # if we require sanitization\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(total_sequences, scaffold)\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
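A minimal scaffold decoration sketch, assuming the default pretrained designer; the scaffold below is an illustrative placeholder with two numbered attachment points, and `sanitize=True` additionally filters out generations that no longer contain the scaffold.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

scaffold = "[1*]Nc1ccc(C(=O)N[2*])cc1"   # illustrative scaffold with attachment points
decorated = designer.scaffold_decoration(
    scaffold,
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
    random_seed=42,
)
```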
"},{"location":"api/safe.html#safe.sample.SAFEDesign.scaffold_morphing","title":"scaffold_morphing(side_chains=None, mol=None, core=None, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
","text":"Perform scaffold morphing decoration using the pretrained SAFE model
For scaffold morphing, we try to replace the core with a new one. If side_chains are provided, we use them directly. If a combination of molecule and core is provided instead, we use them to extract the side chains and then perform the scaffold morphing.
Finding the side chains
The algorithm that finds the side chains from the core assumes that the input core has attachment points. Those attachment points are never considered part of the substructure query; rather, they define where the side chains attach. See ~sf.utils.compute_side_chains for more information.
Parameters:
- side_chains (Optional[Union[Mol, str, List[Union[str, Mol]]]]): side chains to use for scaffold morphing (the set of fragments is joined as well as possible). Default: None.
- mol (Optional[Union[Mol, str]]): input molecule when side_chains are not provided. Default: None.
- core (Optional[Union[Mol, str]]): core to morph into another scaffold. Default: None.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def scaffold_morphing(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n return self._fragment_linking(\n side_chains=side_chains,\n mol=mol,\n core=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=False,\n **kwargs,\n )\n
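A minimal scaffold morphing sketch, assuming the default pretrained designer. The side chains below are an illustrative placeholder (two fragments with numbered attachment points given as a dotted SMILES); alternatively, `mol` and `core` can be passed so the side chains are extracted automatically.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

morphed = designer.scaffold_morphing(
    side_chains="[1*]N1CCCC1.[2*]c1ccccc1O",   # illustrative side chains
    n_samples_per_trial=10,
    sanitize=True,
)
```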
"},{"location":"api/safe.html#safe.sample.SAFEDesign.super_structure","title":"super_structure(core, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, attachment_point_depth=None, **kwargs)
","text":"Perform super structure generation using the pretrained SAFE model.
To generate super structures, we create various attachment points on the input core and then perform scaffold decoration.
Parameters:
- core (Union[str, Mol]): input substructure to use; we aim to generate super structures of this molecule. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of different attachment points to consider. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- attachment_point_depth (Optional[int]): depth of opening the attachment points; increasing this increases the number of substitution points to consider. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def super_structure(\n self,\n core: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n attachment_point_depth: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform super structure generation using the pretrained SAFE model.\n\n To generate super-structure, we basically just create various attachment points to the input core,\n then perform scaffold decoration.\n\n Args:\n core: input substructure to use. We aim to generate super structures of this molecule\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of different attachment points to consider\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n attachment_point_depth: depth of opening the attachment points.\n Increasing this, means you increase the number of substitution point to consider.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n core = dm.to_mol(core)\n cores = sf.utils.list_individual_attach_points(core, depth=attachment_point_depth)\n # get the fully open mol, everytime too.\n cores.append(dm.to_smiles(dm.reactions.open_attach_points(core)))\n cores = list(set(cores))\n rng = random.Random(random_seed)\n rng.shuffle(cores)\n # now also get the single openining of an attachment point\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n core = cores[_ % len(cores)]\n old_verbose = self.verbose\n try:\n with sf.utils.attr_as(self, \"verbose\", False):\n out = self._completion(\n fragment=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=1,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n **kwargs,\n )\n total_sequences.extend(out)\n except Exception as e:\n if old_verbose:\n logger.error(e)\n\n finally:\n self.verbose = old_verbose\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
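A minimal super structure sketch, assuming the default pretrained designer; the core below is an illustrative placeholder without attachment points, which the method opens automatically up to `attachment_point_depth`.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

supers = designer.super_structure(
    core="c1ccc2ccccc2c1",     # illustrative core (naphthalene)
    n_samples_per_trial=10,
    n_trials=2,
    attachment_point_depth=3,
    sanitize=True,
)
```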
"},{"location":"api/safe.html#safe-tokenizer","title":"SAFE Tokenizer","text":""},{"location":"api/safe.html#safe.tokenizer.SAFESplitter","title":"SAFESplitter
","text":"Standard Splitter for SAFE string
Source code in safe/tokenizer.py
class SAFESplitter:\n \"\"\"Standard Splitter for SAFE string\"\"\"\n\n REGEX_PATTERN = r\"\"\"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])\"\"\"\n\n name = \"safe\"\n\n def __init__(self, pattern: Optional[str] = None):\n # do not use this as raw strings (not r before)\n if pattern is None:\n pattern = self.REGEX_PATTERN\n self.regex = re.compile(pattern)\n\n def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n\n def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n\n def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n\n def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
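A minimal tokenization sketch with the splitter; the input string is illustrative, and the regex-based split is lossless, so `detokenize` recovers the original string.
```python
from safe.tokenizer import SAFESplitter

splitter = SAFESplitter()

tokens = splitter.tokenize("c1ccccc1C(=O)O")   # illustrative SAFE/SMILES-like string
print(tokens)                                  # ['c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(', '=', 'O', ')', 'O']
print(splitter.detokenize(tokens))             # 'c1ccccc1C(=O)O'
```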
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.detokenize","title":"detokenize(chars)
","text":"Detokenize SAFE notation
Source code in safe/tokenizer.py
def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.pre_tokenize","title":"pre_tokenize(pretok)
","text":"Pretokenize using an input pretokenizer object from the tokenizer library
Source code in safe/tokenizer.py
def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.split","title":"split(n, normalized)
","text":"Perform splitting for pretokenization
Source code in safe/tokenizer.py
def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.tokenize","title":"tokenize(line)
","text":"Tokenize a safe string into characters.
Source code in safe/tokenizer.py
def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer","title":"SAFETokenizer
","text":" Bases: PushToHubMixin
Class to initialize and train a tokenizer for SAFE strings. Once trained, you can convert the tokenizer to a Hugging Face PreTrainedTokenizerFast.
Source code in safe/tokenizer.py
class SAFETokenizer(PushToHubMixin):\n \"\"\"\n Class to initialize and train a tokenizer for SAFE string\n Once trained, you can use the converted version of the tokenizer to an HuggingFace PreTrainedTokenizerFast\n \"\"\"\n\n vocab_files_names: str = \"tokenizer.json\"\n\n def __init__(\n self,\n tokenizer_type: str = \"bpe\",\n splitter: Optional[str] = \"safe\",\n trainer_args=None,\n decoder_args=None,\n token_model_args=None,\n ):\n super().__init__()\n self.tokenizer_type = tokenizer_type\n self.trainer_args = trainer_args or {}\n self.decoder_args = decoder_args or {}\n self.token_model_args = token_model_args or {}\n if tokenizer_type is not None and tokenizer_type.startswith(\"bpe\"):\n self.model = BPE(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = BpeTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n else:\n self.model = WordLevel(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = WordLevelTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n self.tokenizer = Tokenizer(self.model)\n self.splitter = None\n if splitter == \"safe\":\n self.splitter = SAFESplitter()\n self.tokenizer.pre_tokenizer = PreTokenizer.custom(self.splitter)\n self.tokenizer.post_processor = TemplateProcessing(\n single=TEMPLATE_SINGLE,\n pair=TEMPLATE_PAIR,\n special_tokens=TEMPLATE_SPECIAL_TOKENS,\n )\n self.tokenizer.decoder = decoders.BPEDecoder(**self.decoder_args)\n self.tokenizer = self.set_special_tokens(self.tokenizer)\n\n @property\n def bos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.bos_token)\n\n @property\n def pad_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.pad_token)\n\n @property\n def eos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.eos_token)\n\n @classmethod\n def set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n ):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n\n def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n\n def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n\n def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if 
use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n\n def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n\n def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n\n def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n\n def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n\n def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n\n def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n\n @classmethod\n def from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n\n @classmethod\n def load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has 
a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n\n def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n ) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n\n def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n\n def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n ) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. 
Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n\n @classmethod\n def from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n ):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived 
class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
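A minimal training sketch for a fresh tokenizer; the toy corpus and output path are placeholders, and training on a handful of strings is only meant to show the API surface (`train_from_iterator`, `save`, `get_pretrained`).
```python
from safe.tokenizer import SAFETokenizer

corpus = ["c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O", "CCO"]   # toy corpus, placeholder

tokenizer = SAFETokenizer(tokenizer_type="bpe", splitter="safe")
tokenizer.train_from_iterator(corpus)
tokenizer.save("my_safe_tokenizer.json")   # placeholder path

# convert to a Hugging Face fast tokenizer when training transformer models
fast_tok = tokenizer.get_pretrained()
```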
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.bos_token_id","title":"bos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.eos_token_id","title":"eos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.pad_token_id","title":"pad_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__getstate__","title":"__getstate__()
","text":"Getting state to allow pickling
Source code in safe/tokenizer.py
def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__len__","title":"__len__()
","text":"Gets the count of tokens in vocab along with special tokens.
Source code in safe/tokenizer.py
def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__setstate__","title":"__setstate__(d)
","text":"Setting state during reloading pickling
Source code in safe/tokenizer.py
def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.decode","title":"decode(ids, skip_special_tokens=True, ignore_stops=False, stop_token_ids=None)
","text":"Decodes a list of ids to molecular representation in the format in which this tokenizer was created.
Parameters:
- ids (list): list of IDs. Required.
- skip_special_tokens (bool): whether to skip all special tokens when encountering them. Default: True.
- ignore_stops (bool): whether to ignore the stop tokens, thus decoding until the end. Default: False.
- stop_token_ids (Optional[List[int]]): optional list of stop token ids to use. Default: None.
Returns:
- sequence (str): string representation of the molecule.
Source code in safe/tokenizer.py
def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.encode","title":"encode(sample_str, ids_only=True, **kwargs)
","text":"Encodes a given molecule string once training is done
Parameters:
- sample_str (str): sample string of the molecule to encode. Required.
- ids_only (bool): whether to return only the ids or the encoding object. Default: True.
Returns:
- object (list): encoded list of IDs.
Source code in safe/tokenizer.py
def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n
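A minimal encode/decode round-trip sketch, assuming the pretrained tokenizer from the hub (see the `from_pretrained` examples below); the input SMILES is illustrative.
```python
from safe.tokenizer import SAFETokenizer

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")

ids = tokenizer.encode("c1ccccc1", ids_only=True)           # list of token ids
decoded = tokenizer.decode(ids, skip_special_tokens=True)   # typically recovers the input string
```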
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_dict","title":"from_dict(data)
classmethod
","text":"Load tokenizer from dict
Parameters:
- data (dict): dictionary containing the tokenizer info. Required.
Source code in safe/tokenizer.py
@classmethod\ndef from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_pretrained","title":"from_pretrained(pretrained_model_name_or_path, cache_dir=None, force_download=False, local_files_only=False, token=None, return_fast_tokenizer=False, proxies=None, **kwargs)
classmethod
","text":"Instantiate a [~tokenization_utils_base.PreTrainedTokenizerBase
] (or a derived class) from a predefined tokenizer.
Parameters:
- pretrained_model_name_or_path (Union[str, PathLike]): can be either:
  - a string, the model id of a predefined tokenizer hosted inside a model repo on huggingface.co, located at the root level, like bert-base-uncased, or namespaced under a user or organization name, like dbmdz/bert-base-german-cased;
  - a path to a directory containing the vocabulary files required by the tokenizer, for instance saved using the [~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained] method, e.g. ./my_model_directory/;
  - (deprecated, not applicable to all derived classes) a path or URL to a single saved vocabulary file (if and only if the tokenizer only requires a single vocabulary file like BERT or XLNet), e.g. ./my_model_directory/vocab.txt.
  Required.
- cache_dir (Optional[Union[str, PathLike]]): path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. Default: None.
- force_download (bool): whether or not to force the (re-)download of the vocabulary files and override the cached versions if they exist. Default: False.
- proxies (Optional[Dict[str, str]]): a dictionary of proxy servers to use by protocol or endpoint, e.g. {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. Default: None.
- token (Optional[Union[str, bool]]): the token to use as HTTP bearer authorization for remote files. If True, will use the token generated when running huggingface-cli login (stored in ~/.huggingface). Default: None.
- local_files_only (bool): whether or not to only rely on local files and not attempt to download any files. Default: False.
- return_fast_tokenizer (Optional[bool]): whether to return a fast tokenizer or not. Default: False.
Examples:
# We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n
Source code in safe/tokenizer.py
@classmethod\ndef from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.get_pretrained","title":"get_pretrained(**kwargs)
","text":"Get a pretrained tokenizer from this tokenizer
Returns:
Type Description PreTrainedTokenizerFast
Returns a pre-trained fast tokenizer for Hugging Face models.
Source code in safe/tokenizer.py
def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n
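A brief usage sketch (not part of the upstream docstring): `get_pretrained` wraps the underlying tokenizer into a `PreTrainedTokenizerFast` with the special tokens copied over, so it can be used like any Hugging Face tokenizer. The checkpoint name below is the one shown in the `from_pretrained` example above.

```python
# Illustrative sketch; assumes the "datamol-io/safe-gpt" checkpoint is reachable.
from safe.tokenizer import SAFETokenizer

safe_tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
fast_tokenizer = safe_tokenizer.get_pretrained()  # transformers.PreTrainedTokenizerFast

ids = fast_tokenizer("c1ccccc1").input_ids  # encode a simple SMILES/SAFE string
print(fast_tokenizer.convert_ids_to_tokens(ids))
```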
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.load","title":"load(file_name)
classmethod
","text":"Load the current tokenizer from file
Source code in safe/tokenizer.py
@classmethod\ndef load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.push_to_hub","title":"push_to_hub(repo_id, use_temp_dir=None, commit_message=None, private=None, token=None, max_shard_size='10GB', create_pr=False, safe_serialization=False, **deprecated_kwargs)
","text":"Upload the tokenizer to the \ud83e\udd17 Model Hub.
Parameters:
Name Type Description Defaultrepo_id
str
The name of the repository you want to push your {object} to. It should contain your organization name when pushing to a given organization.
requireduse_temp_dir
Optional[bool]
Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub. Will default to True
if there is no directory named like repo_id
, False
otherwise.
None
commit_message
Optional[str]
Message to commit while pushing. Will default to \"Upload {object}\"
.
None
private
Optional[bool]
Whether or not the repository created should be private.
None
token
Optional[Union[bool, str]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
). Will default to True
if repo_url
is not specified.
None
max_shard_size
Optional[Union[int, str]]
Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size lower than this size. If expressed as a string, needs to be digits followed by a unit (like \"5MB\"
).
'10GB'
create_pr
bool
Whether or not to create a PR with the uploaded files or directly commit.
False
safe_serialization
bool
Whether or not to convert the model weights in safetensors format for safer serialization.
False
Source code in safe/tokenizer.py
def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. 
Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save","title":"save(file_name=None)
","text":"Saves the :class:~tokenizers.Tokenizer
to the file at the given path.
Parameters:
Name Type Description Defaultfile_name
str
File where to save tokenizer
None
Source code in safe/tokenizer.py
def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n
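For context, a minimal round-trip sketch combining `save` with the `load` classmethod shown earlier (the file name below is arbitrary):

```python
# Serialize the tokenizer to a JSON file, then reload it.
from safe.tokenizer import SAFETokenizer

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
tokenizer.save("safe_tokenizer.json")              # writes the to_dict() payload as JSON
reloaded = SAFETokenizer.load("safe_tokenizer.json")
```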
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save_pretrained","title":"save_pretrained(*args, **kwargs)
","text":"Save pretrained tokenizer
Source code in safe/tokenizer.py
def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.set_special_tokens","title":"set_special_tokens(tokenizer, bos_token=CLS_TOKEN, eos_token=SEP_TOKEN)
classmethod
","text":"Set special tokens for a tokenizer
Parameters:
Name Type Description Defaulttokenizer
Tokenizer
tokenizer for which special tokens will be set
requiredbos_token
str
Optional bos token to use
CLS_TOKEN
eos_token
str
Optional eos token to use
SEP_TOKEN
Source code in safe/tokenizer.py
@classmethod\ndef set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.to_dict","title":"to_dict(**kwargs)
","text":"Convert tokenizer to dict
Source code insafe/tokenizer.py
def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train","title":"train(files, **kwargs)
","text":"This is to train a new tokenizer from either a list of file or some input data
Args files (str): file in which your molecules are separated by new line kwargs (dict): optional args for the tokenizer train
safe/tokenizer.py
def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train_from_iterator","title":"train_from_iterator(data, **kwargs)
","text":"Train the Tokenizer using the provided iterator.
You can provide anything that is a Python Iterator:
* A list of sequences (List[str])
* A generator that yields str or List[str]
* A NumPy array of strings
Parameters:
Name Type Description Defaultdata
Iterator
data iterator
required**kwargs
Any
additional keyword argument for the tokenizer train_from_iterator
{}
Source code in safe/tokenizer.py
def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n
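As an illustration only: the `SAFETokenizer` constructor is not documented in this section, so the default construction below is an assumption; adapt it to the actual signature. Training from an in-memory list of strings could look like this:

```python
from safe.tokenizer import SAFETokenizer

corpus = [
    "c1ccccc1",
    "CC(=O)Nc1ccc(O)cc1",
    "CC(C)Cc1ccc(cc1)C(C)C(=O)O",
]  # toy corpus; in practice this would be a large iterator of SAFE strings

tokenizer = SAFETokenizer()  # assumed default constructor
tokenizer.train_from_iterator(corpus)
print(list(tokenizer.to_dict().keys()))  # inspect the serializable payload
```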
"},{"location":"api/safe.html#utils","title":"Utils","text":""},{"location":"api/safe.html#safe.utils.MolSlicer","title":"MolSlicer
","text":"Slice a molecule into head-linker-tail
Source code in safe/utils.py
class MolSlicer:\n \"\"\"Slice a molecule into head-linker-tail\"\"\"\n\n BOND_SPLITTERS = [\n # two atoms connected by a non ring single bond, one of each is not in a ring and at least two heavy neighbor\n \"[R:1]-&!@[!R;!D1:2]\",\n # two atoms in different rings linked by a non-ring single bond\n \"[R:1]-&!@[R:2]\",\n ]\n _BOND_BUFFER = 1 # buffer around substructure match size.\n MAX_CUTS = 2 # maximum number of cuts. Here we need two cuts for head-linker-tail.\n\n _MERGING_RXN = dm.reactions.rxn_from_smarts(\n \"[#0][*:1].[#0][*:4].([#0][*:2].[#0][*:3])>>([*:1][*:2].[*:3][*:4])\"\n )\n\n def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n ):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n\n def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n\n def _bond_selection_from_max_cuts(self, bond_list: List[int], dist_mat: np.ndarray):\n \"\"\"Select bonds based on maximum number of cuts allowed\"\"\"\n # for now we are just implementing to 2 max cuts algorithms\n if self.MAX_CUTS != 2:\n raise ValueError(f\"Only MAX_CUTS=2 is supported, got {self.MAX_CUTS}\")\n\n bond_pdist = np.full((len(bond_list), len(bond_list)), -1)\n for i in range(len(bond_list)):\n for j in range(i, len(bond_list)):\n # we get the minimum topological distance between bond to cut\n bond_pdist[i, j] = bond_pdist[j, i] = min(\n [dist_mat[a1, a2] for a1, a2 in itertools.product(bond_list[i], bond_list[j])]\n )\n\n masked_bond_pdist = np.ma.masked_less_equal(bond_pdist, self.min_linker_size)\n\n if self.shortest_linker:\n return np.unravel_index(np.ma.argmin(masked_bond_pdist), bond_pdist.shape)\n return np.unravel_index(np.ma.argmax(masked_bond_pdist), bond_pdist.shape)\n\n def _get_bonds_to_cut(self, mol: dm.Mol):\n \"\"\"Get possible bond to cuts\n\n Args:\n mol: input molecule\n \"\"\"\n # use this if you want to enumerate yourself the possible cuts\n\n ring_systems = self.get_ring_system(mol)\n candidate_bonds = []\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n\n for query in self.bond_splitters:\n bonds = mol.GetSubstructMatches(query, uniquify=True)\n cur_unique_bonds = [set(cbond) for cbond in candidate_bonds]\n # do not accept bonds part of the same ring system or already known\n for b in bonds:\n bond_id = mol.GetBondBetweenAtoms(*b).GetIdx()\n bond_cut = Chem.GetMolFrags(\n Chem.FragmentOnBonds(mol, [bond_id], addDummies=False), 
asMols=True\n )\n can_add = not self.require_ring_system or all(\n len(frag.GetAtomsMatchingQuery(ring_query)) > 0 for frag in bond_cut\n )\n if can_add and not (\n set(b) in cur_unique_bonds or any(x.issuperset(set(b)) for x in ring_systems)\n ):\n candidate_bonds.append(b)\n return candidate_bonds\n\n def _fragment_mol(self, mol: dm.Mol, bonds: List[dm.Bond]):\n \"\"\"Fragment molecules on bonds and return head, linker, tail combination\n\n Args:\n mol: input molecule\n bonds: list of bonds to cut\n \"\"\"\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in bonds])\n _frags = list(Chem.GetMolFrags(tmp, asMols=True))\n # linker is the one with 2 dummy atoms\n linker_pos = 0\n for pos, _frag in enumerate(_frags):\n if sum([at.GetSymbol() == \"*\" for at in _frag.GetAtoms()]) == 2:\n linker_pos = pos\n break\n linker = _frags.pop(linker_pos)\n head, tail = _frags\n return (head, linker, tail)\n\n def _compute_linker_score(self, linker: dm.Mol):\n \"\"\"Compute the score of a linker to help select between linkers\"\"\"\n\n # we need to take into account\n # case where we require the linker to have a ring system\n # case where we want the linker to be longuest or shortest\n\n # find shortest path\n attach1, attach2, *_ = [at.GetIdx() for at in linker.GetAtoms() if at.GetSymbol() == \"*\"]\n score = len(Chem.rdmolops.GetShortestPath(linker, attach1, attach2))\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n linker_ring_count = len(linker.GetAtomsMatchingQuery(ring_query))\n if self.require_ring_system:\n score *= int(linker_ring_count > 0)\n if score == 0:\n return float(\"inf\")\n if not self.shortest_linker:\n score = 1 / score\n return score\n\n def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. 
It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n\n @classmethod\n def link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n ):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__call__","title":"__call__(mol, expected_head=None)
","text":"Perform slicing of the input molecule
Parameters:
Name Type Description Defaultmol
Union[Mol, str]
input molecule
requiredexpected_head
Union[Mol, str]
substructure that should be part of the head. The small fragment containing this substructure would be kept as head
None
Source code in safe/utils.py
def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__init__","title":"__init__(shortest_linker=False, min_linker_size=0, require_ring_system=True, verbose=False)
","text":"Constructor of bond slicer.
Parameters:
Name Type Description Defaultshortest_linker
bool
whether to consider the longest or shortest linker. Does not have any effect when an expected_head group is provided during splitting
False
min_linker_size
int
minimum linker size
0
require_ring_system
bool
whether all fragments need to have a ring system
True
verbose
bool
whether to allow verbosity in logging
False
Source code in safe/utils.py
def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.get_ring_system","title":"get_ring_system(mol)
","text":"Get the list of ring system from a molecule
Parameters:
Name Type Description Defaultmol
Mol
input molecule for which we are computing the ring system
required Source code in safe/utils.py
def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.link_fragments","title":"link_fragments(linker, head, tail)
classmethod
","text":"Link fragments together using the provided linker
Parameters:
Name Type Description Defaultlinker
Union[Mol, str]
linker to use
requiredhead
Union[Mol, str]
head fragment
requiredtail
Union[Mol, str]
tail fragment
required Source code in safe/utils.py
@classmethod\ndef link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.attr_as","title":"attr_as(obj, field, value)
","text":"Temporary replace the value of an object
Parameters:
Name Type Description Defaultobj
Any
object to temporarily patch
requiredfield
str
name of the key to change
requiredvalue
Any
value to temporarily set the key to
required Source code in safe/utils.py
@contextmanager\ndef attr_as(obj: Any, field: str, value: Any):\n \"\"\"Temporary replace the value of an object\n\n Args:\n obj: object to temporary patch\n field: name of the key to change\n value: value of key to be temporary changed\n \"\"\"\n old_value = getattr(obj, field, None)\n setattr(obj, field, value)\n yield\n with suppress(TypeError):\n setattr(obj, field, old_value)\n
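A short self-contained sketch of the context manager; this mirrors how the tokenizer code above temporarily swaps in a Whitespace pre-tokenizer.

```python
from safe.utils import attr_as

class Settings:
    verbose = False

settings = Settings()
with attr_as(settings, "verbose", True):
    print(settings.verbose)  # True inside the context
print(settings.verbose)      # restored to False once the context exits
```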
"},{"location":"api/safe.html#safe.utils.compute_side_chains","title":"compute_side_chains(mol, core, label_by_index=False)
","text":"Compute the side chain of a molecule given a core
Finding the side chains
The algorithm to find the side chains from a core assumes that the core we get as input has attachment points. Those attachment points are never considered as part of the query; rather, they are used to define the attachment points on the side chains. Removing the attachment points from the core is exactly the same as keeping them.
mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\ncore0 = \"CC1(C)CN2C(CC2=O)S1\"\ncore1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\ncore2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\nside_chain = compute_side_chain(core=core0, mol=mol)\ndm.to_image([side_chain, core0, mol])\n
In the example above, core0 and core1 are equivalent for the molecule mol, but core2 is not.
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredcore
Mol
core to use for deriving the side chains
required Source code in safe/utils.py
def compute_side_chains(mol: dm.Mol, core: dm.Mol, label_by_index: bool = False):\n \"\"\"Compute the side chain of a molecule given a core\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points\n on the side chains. Removing the attachment points from the core is exactly the same as keeping them.\n\n ```python\n mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\n core0 = \"CC1(C)CN2C(CC2=O)S1\"\n core1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\n core2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\n side_chain = compute_side_chain(core=core0, mol=mol)\n dm.to_image([side_chain, core0, mol])\n ```\n Therefore on the above, core0 and core1 are equivalent for the molecule `mol`, but core2 is not.\n\n Args:\n mol: molecule to split\n core: core to use for deriving the side chains\n \"\"\"\n\n if isinstance(mol, str):\n mol = dm.to_mol(mol)\n if isinstance(core, str):\n core = dm.to_mol(core)\n core_query_param = AdjustQueryParameters()\n core_query_param.makeDummiesQueries = True\n core_query_param.adjustDegree = False\n core_query_param.aromatizeIfPossible = True\n core_query_param.makeBondsGeneric = False\n core_query = AdjustQueryProperties(core, core_query_param)\n return ReplaceCore(\n mol, core_query, labelByIndex=label_by_index, replaceDummies=False, requireDummyMatch=False\n )\n
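A usage sketch reusing the molecule and core from the note above; the return value is an RDKit molecule whose disconnected fragments are the side chains, with dummy atoms marking the attachment points.

```python
import datamol as dm
from safe.utils import compute_side_chains

mol = dm.to_mol("CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O")
core = dm.to_mol("CC1(C)CN2C(CC2=O)S1")

side_chains = compute_side_chains(mol, core)
print(dm.to_smiles(side_chains))  # dot-separated fragments with attachment dummies
```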
"},{"location":"api/safe.html#safe.utils.convert_to_safe","title":"convert_to_safe(mol, canonical=False, randomize=False, seed=1, slicer='brics', split_fragment=True, fraction_hs=None, resolution=0.5)
","text":"Convert a molecule to a safe representation
Parameters:
Name Type Description Defaultmol
Mol
molecule to convert
requiredcanonical
bool
whether to use canonical encoding
False
randomize
bool
whether to randomize the encoding
False
seed
Optional[int]
random seed
1
slicer
str
the slicer to use for fragmentation
'brics'
split_fragment
bool
whether to split fragments
True
fraction_hs
bool
proportion of random atoms to which we will add explicit hydrogens
None
resolution
Optional[float]
resolution for the partitioning algorithm
0.5
Source code in safe/utils.py
def convert_to_safe(\n mol: dm.Mol,\n canonical: bool = False,\n randomize: bool = False,\n seed: Optional[int] = 1,\n slicer: str = \"brics\",\n split_fragment: bool = True,\n fraction_hs: bool = None,\n resolution: Optional[float] = 0.5,\n):\n \"\"\"Convert a molecule to a safe representation\n\n Args:\n mol: molecule to convert\n canonical: whether to use canonical encoding\n randomize: whether to randomize the encoding\n seed: random seed\n slicer: the slicer to use for fragmentation\n split_fragment: whether to split fragments\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n resolution: resolution for the partitioning algorithm\n seed: random seed\n \"\"\"\n x = None\n try:\n x = sf.encode(mol, canonical=canonical, randomize=randomize, slicer=slicer, seed=seed)\n except sf.SAFEFragmentationError:\n if split_fragment:\n if \".\" in mol:\n return None\n try:\n x = sf.encode(\n mol,\n canonical=False,\n randomize=randomize,\n seed=seed,\n slicer=partial(\n fragment_aware_spliting,\n fraction_hs=fraction_hs,\n resolution=resolution,\n seed=seed,\n ),\n )\n except (sf.SAFEEncodeError, sf.SAFEFragmentationError):\n # logger.exception(e)\n return x\n # we need to resplit using attachment point but here we are only adding\n except sf.SAFEEncodeError:\n return x\n return x\n
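A minimal sketch of the helper: it returns the SAFE string on success, and `None` when encoding fails even after the fragment-aware fallback.

```python
from safe.utils import convert_to_safe

smiles = "CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O"
safe_string = convert_to_safe(smiles, slicer="brics")
print(safe_string)  # None if both the BRICS and fallback fragmentations fail
```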
"},{"location":"api/safe.html#safe.utils.filter_by_substructure_constraints","title":"filter_by_substructure_constraints(sequences, substruct, n_jobs=-1)
","text":"Check whether the input substructures are present in each of the molecule in the sequences
Parameters:
Name Type Description Defaultsequences
List[Union[str, Mol]]
list of molecules to validate
requiredsubstruct
Union[str, Mol]
substructure to use as query
requiredn_jobs
int
number of jobs to use for parallelization
-1
Source code in safe/utils.py
def filter_by_substructure_constraints(\n sequences: List[Union[str, dm.Mol]], substruct: Union[str, dm.Mol], n_jobs: int = -1\n):\n \"\"\"Check whether the input substructures are present in each of the molecule in the sequences\n\n Args:\n sequences: list of molecules to validate\n substruct: substructure to use as query\n n_jobs: number of jobs to use for parallelization\n\n \"\"\"\n\n if isinstance(substruct, str):\n substruct = standardize_attach(substruct)\n substruct = dm.from_smarts(substruct)\n\n def _check_match(mol):\n with suppress(Exception):\n mol = dm.to_mol(mol)\n return mol.HasSubstructMatch(substruct)\n return False\n\n matches = dm.parallelized(_check_match, sequences, n_jobs=n_jobs)\n return list(compress(sequences, matches))\n
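A small sketch: keep only the candidate molecules that contain a substituted-benzene query (the attachment point in the query string is normalized by `standardize_attach` first).

```python
from safe.utils import filter_by_substructure_constraints

candidates = ["Oc1ccccc1", "Nc1ccccc1", "CCO"]
kept = filter_by_substructure_constraints(candidates, "[*]c1ccccc1", n_jobs=1)
print(kept)  # the aromatic candidates; "CCO" has no match and is dropped
```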
"},{"location":"api/safe.html#safe.utils.find_partition_edges","title":"find_partition_edges(G, partition)
","text":"Find the edges connecting the subgraphs in a given partition of a graph.
Parameters:
Name Type Description DefaultG
Graph
The original graph.
requiredpartition
list of list of nodes
The partition of the graph where each element is a list of nodes representing a subgraph.
required Returns:
Name Type Description list
List[Tuple]
A list of edges connecting the subgraphs in the partition.
Source code in safe/utils.py
def find_partition_edges(G: nx.Graph, partition: List[List]) -> List[Tuple]:\n \"\"\"\n Find the edges connecting the subgraphs in a given partition of a graph.\n\n Args:\n G (networkx.Graph): The original graph.\n partition (list of list of nodes): The partition of the graph where each element is a list of nodes representing a subgraph.\n\n Returns:\n list: A list of edges connecting the subgraphs in the partition.\n \"\"\"\n partition_edges = []\n for subgraph1, subgraph2 in combinations(partition, 2):\n edges = nx.edge_boundary(G, subgraph1, subgraph2)\n partition_edges.extend(edges)\n return partition_edges\n
"},{"location":"api/safe.html#safe.utils.fragment_aware_spliting","title":"fragment_aware_spliting(mol, fraction_hs=None, **kwargs)
","text":"Custom splitting algorithm for dataset building.
This slicing strategy will cut any bond, including bonds to hydrogens. However, only one cut per atom is allowed.
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredfraction_hs
Optional[bool]
proportion of random atoms to which we will add explicit hydrogens
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
def fragment_aware_spliting(mol: dm.Mol, fraction_hs: Optional[bool] = None, **kwargs: Any):\n \"\"\"Custom splitting algorithm for dataset building.\n\n This slicing strategy will cut any bond including bonding with hydrogens\n However, only one cut per atom is allowed\n\n Args:\n mol: molecule to split\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n kwargs: additional arguments to pass to the partitioning algorithm\n \"\"\"\n random.seed(kwargs.get(\"seed\", 1))\n mol = dm.to_mol(mol, remove_hs=False)\n mol = _selective_add_hs(mol, fraction_hs=fraction_hs)\n graph = dm.graph.to_graph(mol)\n d = mol_partition(mol, **kwargs)\n q = deque(d)\n partition = q.pop()\n return find_partition_edges(graph, partition)\n
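For illustration, the slicer is typically passed to `sf.encode` through `functools.partial`, mirroring the fallback path in `convert_to_safe` above. Whether a given molecule encodes cleanly depends on the resulting fragmentation, so treat this as a sketch.

```python
from functools import partial

import safe as sf
from safe.utils import fragment_aware_spliting

slicer = partial(fragment_aware_spliting, fraction_hs=0.5, resolution=0.5, seed=1)
safe_string = sf.encode(
    "CC(=O)Nc1ccc(O)cc1", canonical=False, randomize=False, seed=1, slicer=slicer
)
print(safe_string)
```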
"},{"location":"api/safe.html#safe.utils.list_individual_attach_points","title":"list_individual_attach_points(mol, depth=None)
","text":"List all individual attachement points.
We do not allow multiple attachment points per substitution position.
Parameters:
Name Type Description Defaultmol
Mol
molecule for which we need to open the attachment points
required Source code in safe/utils.py
def list_individual_attach_points(mol: dm.Mol, depth: Optional[int] = None):\n \"\"\"List all individual attachement points.\n\n We do not allow multiple attachment points per substitution position.\n\n Args:\n mol: molecule for which we need to open the attachment points\n\n \"\"\"\n ATTACHING_RXN = ReactionFromSmarts(\"[*;h;!$([*][#0]):1]>>[*:1][*]\")\n mols = [mol]\n curated_prods = set()\n num_attachs = len(mol.GetSubstructMatches(dm.from_smarts(\"[*;h:1]\"), uniquify=True))\n depth = depth or 1\n depth = min(max(depth, 1), num_attachs)\n while depth > 0:\n prods = set()\n for mol in mols:\n mol = dm.to_mol(mol)\n for p in ATTACHING_RXN.RunReactants((mol,)):\n try:\n m = dm.sanitize_mol(p[0])\n sm = dm.to_smiles(m, canonical=True)\n sm = dm.reactions.add_brackets_to_attachment_points(sm)\n prods.add(dm.reactions.convert_attach_to_isotope(sm, as_smiles=True))\n except Exception as e:\n logger.error(e)\n curated_prods.update(prods)\n mols = prods\n depth -= 1\n return list(curated_prods)\n
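A sketch of the enumeration: each returned SMILES has one open attachment point encoded as an isotope-labeled dummy atom.

```python
import datamol as dm
from safe.utils import list_individual_attach_points

mol = dm.to_mol("Oc1ccccc1")  # phenol
variants = list_individual_attach_points(mol, depth=1)
print(len(variants))
print(variants[:3])
```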
"},{"location":"api/safe.html#safe.utils.mol_partition","title":"mol_partition(mol, query=None, seed=None, **kwargs)
","text":"Partition a molecule into fragments using a bond query
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredquery
Optional[Mol]
bond query to use for splitting
None
seed
Optional[int]
random seed
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
@py_random_state(\"seed\")\ndef mol_partition(\n mol: dm.Mol, query: Optional[dm.Mol] = None, seed: Optional[int] = None, **kwargs: Any\n):\n \"\"\"Partition a molecule into fragments using a bond query\n\n Args:\n mol: molecule to split\n query: bond query to use for splitting\n seed: random seed\n kwargs: additional arguments to pass to the partitioning algorithm\n\n \"\"\"\n resolution = kwargs.get(\"resolution\", 1.0)\n threshold = kwargs.get(\"threshold\", 1e-7)\n weight = kwargs.get(\"weight\", \"weight\")\n\n if query is None:\n query = __mmpa_query\n\n G = dm.graph.to_graph(mol)\n bond_partition = [\n tuple(sorted(match)) for match in mol.GetSubstructMatches(query, uniquify=True)\n ]\n\n def get_relevant_edges(e1, e2):\n return tuple(sorted([e1, e2])) not in bond_partition\n\n subgraphs = nx.subgraph_view(G, filter_edge=get_relevant_edges)\n\n partition = [{u} for u in G.nodes()]\n inner_partition = sorted(nx.connected_components(subgraphs), key=lambda x: min(x))\n mod = nx.algorithms.community.modularity(\n G, inner_partition, resolution=resolution, weight=weight\n )\n is_directed = G.is_directed()\n graph = G.__class__()\n graph.add_nodes_from(G)\n graph.add_weighted_edges_from(G.edges(data=weight, default=1))\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n m = graph.size(weight=\"weight\")\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, inner_partition, resolution, is_directed, seed\n )\n improvement = True\n while improvement:\n # gh-5901 protect the sets in the yielded list from further manipulation here\n yield [s.copy() for s in partition]\n new_mod = nx.algorithms.community.modularity(\n graph, inner_partition, resolution=resolution, weight=\"weight\"\n )\n if new_mod - mod <= threshold:\n return\n mod = new_mod\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, partition, resolution, is_directed, seed\n )\n
"},{"location":"api/safe.html#safe.utils.standardize_attach","title":"standardize_attach(inputs, standard_attach='[*]')
","text":"Standardize the attachment points of a molecule
Parameters:
Name Type Description Defaultinputs
str
input molecule
requiredstandard_attach
str
standard attachment point to use
'[*]'
Source code in safe/utils.py
def standardize_attach(inputs: str, standard_attach: str = \"[*]\"):\n \"\"\"Standardize the attachment points of a molecule\n\n Args:\n inputs: input molecule\n standard_attach: standard attachment point to use\n \"\"\"\n\n for attach_regex in _SMILES_ATTACHMENT_POINTS:\n inputs = re.sub(attach_regex, standard_attach, inputs)\n return inputs\n
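A small sketch; exactly which attachment notations are rewritten depends on the library's `_SMILES_ATTACHMENT_POINTS` patterns, so the inputs and expected outputs below are assumptions.

```python
from safe.utils import standardize_attach

print(standardize_attach("[1*]CC(=O)O"))                         # expected: "[*]CC(=O)O"
print(standardize_attach("[1*]CC(=O)O", standard_attach="[*:1]"))  # custom attachment token
```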
"},{"location":"api/safe.models.html","title":"Model training","text":""},{"location":"api/safe.models.html#config-file","title":"Config File","text":"The input config file for training a SAFE
model is very similar to the GPT2 config file, with the addition of an optional num_labels
attribute for training with descriptors regularization.
{\n \"activation_function\": \"gelu_new\",\n \"attn_pdrop\": 0.1,\n \"bos_token_id\": 10000,\n \"embd_pdrop\": 0.1,\n \"eos_token_id\": 1,\n \"initializer_range\": 0.02,\n \"layer_norm_epsilon\": 1e-05,\n \"model_type\": \"gpt2\",\n \"n_embd\": 768,\n \"n_head\": 12,\n \"n_inner\": null,\n \"n_layer\": 12,\n \"n_positions\": 1024,\n \"reorder_and_upcast_attn\": false,\n \"resid_pdrop\": 0.1,\n \"scale_attn_by_inverse_layer_idx\": false,\n \"scale_attn_weights\": true,\n \"summary_activation\": \"tanh\",\n \"summary_first_dropout\": 0.1,\n \"summary_proj_to_labels\": true,\n \"summary_type\": \"cls_index\",\n \"summary_hidden_size\": 128,\n \"summary_use_proj\": true,\n \"transformers_version\": \"4.31.0\",\n \"use_cache\": true,\n \"vocab_size\": 10000,\n \"num_labels\": 9\n}\n
"},{"location":"api/safe.models.html#safe-model","title":"SAFE Model","text":""},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead","title":"PropertyHead
","text":" Bases: Module
Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Defaultconfig
[`PretrainedConfig`]
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual config class of your model for the default values it uses):
str
) -- The method to use to make this summary. Accepted values are:- \"last\"
-- Take the last token hidden state (like XLNet) - \"first\"
-- Take the first token hidden state (like Bert) - \"mean\"
-- Take the mean of all tokens hidden states - \"cls_index\"
-- Supply a Tensor of classification token position (GPT/GPT-2)
Optional[str]
) -- Set to \"tanh\"
to add a tanh activation to the output, another string, or None
to add no activation. Source code in safe/trainer/model.py
class PropertyHead(torch.nn.Module):\n r\"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n config ([`PretrainedConfig`]):\n The config used by the model. Relevant arguments in the config class of the model are (refer to the actual\n config class of your model for the default values it uses):\n\n - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:\n\n - `\"last\"` -- Take the last token hidden state (like XLNet)\n - `\"first\"` -- Take the first token hidden state (like Bert)\n - `\"mean\"` -- Take the mean of all tokens hidden states\n - `\"cls_index\"` -- Supply a Tensor of classification token position (GPT/GPT-2)\n\n - **summary_activation** (`Optional[str]`) -- Set to `\"tanh\"` to add a tanh activation to the output,\n another string, or `None` to add no activation.\n \"\"\"\n\n def __init__(self, config: PretrainedConfig):\n super().__init__()\n\n self.summary_type = getattr(config, \"summary_type\", \"cls_index\")\n self.summary = torch.nn.Identity()\n last_hidden_size = config.hidden_size\n\n if getattr(config, \"summary_hidden_size\", None) and config.summary_hidden_size > 0:\n self.summary = nn.Linear(config.hidden_size, config.summary_hidden_size)\n last_hidden_size = config.summary_hidden_size\n\n activation_string = getattr(config, \"summary_activation\", None)\n self.activation: Callable = (\n get_activation(activation_string) if activation_string else nn.Identity()\n )\n\n self.out = torch.nn.Identity()\n if getattr(config, \"num_labels\", None) and config.num_labels > 0:\n num_labels = config.num_labels\n self.out = nn.Linear(last_hidden_size, num_labels)\n\n def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n ) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead.forward","title":"forward(hidden_states, cls_index=None)
","text":"Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Defaulthidden_states
FloatTensor
torch.FloatTensor
of shape [batch_size, seq_len, hidden_size]
) The hidden states of the last layer.
cls_index
Optional[LongTensor]
torch.LongTensor
of shape [batch_size]
or [batch_size, ...]
where ... are optional leading dimensions of hidden_states
, optional Used if summary_type == \"cls_index\"
and takes the last token of the sequence as classification token.
None
Returns:
Type Description FloatTensor
torch.FloatTensor
: The summary of the sequence hidden states.
Source code in safe/trainer/model.py
def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
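A shape-check sketch for the head. The config values below are assumptions chosen to match the example config file above (128-dimensional summary projection, 9 labels); the import path follows the documented module safe.trainer.model.

```python
import torch
from transformers import GPT2Config
from safe.trainer.model import PropertyHead

config = GPT2Config(summary_type="cls_index", summary_hidden_size=128,
                    summary_activation="tanh", num_labels=9)
head = PropertyHead(config)

hidden_states = torch.randn(2, 16, config.hidden_size)  # (batch, seq_len, hidden)
cls_index = torch.tensor([15, 7])                        # last non-padding token per sequence
print(head(hidden_states, cls_index).shape)              # torch.Size([2, 9])
```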
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel","title":"SAFEDoubleHeadsModel
","text":" Bases: GPT2DoubleHeadsModel
The SAFE model is a dual-head GPT2 model with a language modeling head and an optional multi-task regression head
Source code in safe/trainer/model.py
class SAFEDoubleHeadsModel(GPT2DoubleHeadsModel):\n \"\"\"The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head\"\"\"\n\n def __init__(self, config):\n self.num_labels = getattr(config, \"num_labels\", None)\n super().__init__(config)\n self.config.num_labels = self.num_labels\n del self.multiple_choice_head\n self.multiple_choice_head = PropertyHead(config)\n\n @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\n def forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel.forward","title":"forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, labels=None, mc_labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, inputs=None, encoder_hidden_states=None, **kwargs)
","text":"Parameters:
Name Type Description Defaultmc_token_ids
`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input
Index of the classification token in each input sequence. Selected in the range [0, input_ids.size(-1) - 1]
.
None
labels
`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*
Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids
. Indices are selected in [-100, 0, ..., config.vocab_size - 1]
. All labels set to -100
are ignored (masked), the loss is only computed for labels in [0, ..., config.vocab_size - 1]
None
mc_labels
`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*
Labels for computing the supervised loss for regularization.
None
inputs
Optional[Any]
List of inputs, put here because the trainer removes information not in signature
None
Returns: output (GPT2DoubleHeadsModelOutput): output of the model
Source code in safe/trainer/model.py
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\ndef forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 
1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
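As a quick illustration of the dual heads, here is a minimal, hypothetical usage sketch (not taken from the library's own examples): it loads the public datamol-io/safe-gpt checkpoint, tokenizes a single string, and runs a forward pass with language-modeling labels. The property (mc) head only contributes when the config defines num_labels and mc_labels are passed.
from safe.tokenizer import SAFETokenizer
from safe.trainer.model import SAFEDoubleHeadsModel

model = SAFEDoubleHeadsModel.from_pretrained("datamol-io/safe-gpt")
tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt").get_pretrained()

batch = tokenizer(["c1ccccc1"], return_tensors="pt")
out = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["input_ids"],  # LM labels are shifted inside the model
)
print(out.loss, out.logits.shape)  # scalar LM loss and (batch, seq_len, vocab_size) logits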
"},{"location":"api/safe.models.html#trainer","title":"Trainer","text":""},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer","title":"SAFETrainer
","text":" Bases: Trainer
Custom trainer for training the SAFE model.
This custom trainer changes the loss function to support the property head.
Source code in safe/trainer/trainer_utils.py
class SAFETrainer(Trainer):\n \"\"\"\n Custom trainer for training SAFE model.\n\n This custom trainer changes the loss function to support the property head\n\n \"\"\"\n\n def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):\n super().__init__(*args, **kwargs)\n self.prop_loss_coeff = prop_loss_coeff\n\n def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
"},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer.compute_loss","title":"compute_loss(model, inputs, return_outputs=False)
","text":"How the loss is computed by Trainer. By default, all models return the loss in the first element.
Source code in safe/trainer/trainer_utils.py
def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
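In other words, when the model returns an mc_loss, the trainer simply adds it to the language-modeling loss with the prop_loss_coeff weight passed to the constructor. A tiny numeric sketch (the loss values are made up):
import torch

lm_loss = torch.tensor(2.31)   # CrossEntropy loss from the LM head
mc_loss = torch.tensor(0.85)   # MSE loss from the property head
prop_loss_coeff = 1e-3         # e.g. SAFETrainer(..., prop_loss_coeff=1e-3)

total_loss = lm_loss + prop_loss_coeff * mc_loss  # what compute_loss returns
print(total_loss)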
"},{"location":"api/safe.models.html#data-collator","title":"Data Collator","text":""},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator","title":"SAFECollator
","text":"Collate function for language modelling tasks
Note
The collate function is based on the default DataCollatorForLanguageModeling from HuggingFace; see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py
Source code in safe/trainer/collator.py
class SAFECollator:\n \"\"\"Collate function for language modelling tasks\n\n\n !!! note\n The collate function is based on the default DataCollatorForLanguageModeling in huggingface\n see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py\n \"\"\"\n\n def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n ):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n\n @functools.lru_cache()\n def get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n\n def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__call__","title":"__call__(samples)
","text":"Call collate function
Parameters:
Name Type Description Default samples
List[Union[List[int], Any, Dict[str, Any]]]
list of examples
required Source code in safe/trainer/collator.py
def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__init__","title":"__init__(tokenizer, pad_to_multiple_of=None, input_key='inputs', label_key='labels', property_key='descriptors', include_descriptors=False, max_length=None)
","text":"Default collator for huggingface transformers in izanagi.
Parameters:
Name Type Description Default tokenizer
Tokenizer
Huggingface tokenizer
required input_key
str
key to use for input ids
'inputs'
label_key
str
key to use for labels
'labels'
property_key
str
key to use for properties
'descriptors'
include_descriptors
bool
whether to include training on descriptors or not
False
pad_to_multiple_of
Optional[int]
pad to multiple of this value
None
Source code in safe/trainer/collator.py
def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.get_tokenizer","title":"get_tokenizer()
cached
","text":"Get underlying tokenizer
Source code in safe/trainer/collator.py
@functools.lru_cache()\ndef get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n
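A minimal usage sketch (the samples below are just illustrative SAFE strings): the collator tokenizes the inputs key, pads the batch, and copies input_ids into labels with padding tokens masked to -100.
from safe.tokenizer import SAFETokenizer
from safe.trainer.collator import SAFECollator

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
collator = SAFECollator(tokenizer, max_length=128)

samples = [
    {"inputs": "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"},
    {"inputs": "c1ccccc1"},
]
batch = collator(samples)
print(batch["input_ids"].shape, batch["labels"].shape)  # both share the padded shape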
"},{"location":"api/safe.models.html#data-utils","title":"Data Utils","text":""},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset","title":"get_dataset(data_path, name=None, tokenizer=None, cache_dir=None, streaming=True, use_auth_token=False, tokenize_column='inputs', property_column='descriptors', max_length=None, num_shards=1024)
","text":"Get the datasets from the config file
Source code in safe/trainer/data_utils.py
def get_dataset(\n data_path,\n name: Optional[str] = None,\n tokenizer: Optional[Callable] = None,\n cache_dir: Optional[str] = None,\n streaming: bool = True,\n use_auth_token: bool = False,\n tokenize_column: Optional[str] = \"inputs\",\n property_column: Optional[str] = \"descriptors\",\n max_length: Optional[int] = None,\n num_shards=1024,\n):\n \"\"\"Get the datasets from the config file\"\"\"\n raw_datasets = {}\n if data_path is not None:\n data_path = upath.UPath(str(data_path))\n\n if data_path.exists():\n # then we need to load from disk\n data_path = str(data_path)\n # for some reason, the datasets package is not able to load the dataset\n # because the split where not originally proposed\n raw_datasets = datasets.load_from_disk(data_path)\n\n if streaming:\n if isinstance(raw_datasets, datasets.DatasetDict):\n previous_num_examples = {k: len(dt) for k, dt in raw_datasets.items()}\n raw_datasets = datasets.IterableDatasetDict(\n {\n k: dt.to_iterable_dataset(num_shards=num_shards)\n for k, dt in raw_datasets.items()\n }\n )\n for k, dt in raw_datasets.items():\n if previous_num_examples[k] is not None:\n setattr(dt, \"num_examples\", previous_num_examples[k])\n else:\n num_examples = len(raw_datasets)\n raw_datasets = raw_datasets.to_iterable_dataset(num_shards=num_shards)\n setattr(raw_datasets, \"num_examples\", num_examples)\n\n else:\n data_path = str(data_path)\n raw_datasets = datasets.load_dataset(\n data_path,\n name=name,\n cache_dir=cache_dir,\n use_auth_token=True if use_auth_token else None,\n streaming=streaming,\n )\n # that means we need to return a tokenized version of the dataset\n\n if property_column not in [\"mc_labels\", None]:\n raw_datasets = raw_datasets.rename_column(property_column, \"mc_labels\")\n\n columns_to_remove = None\n if tokenize_column is not None:\n columns_to_remove = [\n x\n for x in (get_dataset_column_names(raw_datasets) or [])\n if x not in [tokenize_column, \"mc_labels\"] and \"label\" not in x\n ] or None\n\n if tokenizer is None:\n if columns_to_remove is not None:\n raw_datasets = raw_datasets.remove_columns(columns_to_remove)\n return raw_datasets\n\n return raw_datasets.map(\n partial(\n tokenize_fn,\n tokenizer=tokenizer,\n tokenize_column=tokenize_column,\n max_length=max_length,\n ),\n batched=True,\n remove_columns=columns_to_remove,\n )\n
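A hypothetical usage sketch (the dataset path is a placeholder, and the dataset is assumed to expose an inputs column of SAFE strings):
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import get_dataset

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
dataset = get_dataset(
    "path/to/my_dataset",   # placeholder: a folder saved with datasets.save_to_disk or a hub dataset id
    tokenizer=tokenizer,
    tokenize_column="inputs",
    property_column=None,   # set to "descriptors" if the dataset carries property targets
    streaming=False,
    max_length=128,
)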
"},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset_column_names","title":"get_dataset_column_names(dataset)
","text":"Get the column names in a dataset
Parameters:
Name Type Description Default dataset
Union[Dataset, IterableDataset, Mapping]
dataset to get the column names from
required Source code in safe/trainer/data_utils.py
def get_dataset_column_names(dataset: Union[datasets.Dataset, datasets.IterableDataset, Mapping]):\n \"\"\"Get the column names in a dataset\n\n Args:\n dataset: dataset to get the column names from\n\n \"\"\"\n if isinstance(dataset, (datasets.IterableDatasetDict, Mapping)):\n column_names = {split: dataset[split].column_names for split in dataset}\n else:\n column_names = dataset.column_names\n if isinstance(column_names, dict):\n column_names = list(column_names.values())[0]\n return column_names\n
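A tiny sketch with an in-memory dataset (hypothetical data):
import datasets
from safe.trainer.data_utils import get_dataset_column_names

ds = datasets.Dataset.from_dict({"inputs": ["c1ccccc1", "CCO"], "descriptors": [[1.0], [2.0]]})
print(get_dataset_column_names(ds))  # ['inputs', 'descriptors']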
"},{"location":"api/safe.models.html#safe.trainer.data_utils.take","title":"take(n, iterable)
","text":"Return first n items of the iterable as a list
Source code in safe/trainer/data_utils.py
def take(n, iterable):\n \"Return first n items of the iterable as a list\"\n return list(itertools.islice(iterable, n))\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.tokenize_fn","title":"tokenize_fn(row, tokenizer, tokenize_column='inputs', max_length=None, padding=False)
","text":"Perform the tokenization of a row Args: row: row to tokenize tokenizer: tokenizer to use tokenize_column: column to tokenize max_length: maximum size of the tokenized sequence padding: whether to pad the sequence
Source code in safe/trainer/data_utils.py
def tokenize_fn(\n row: Dict[str, Any],\n tokenizer: Callable,\n tokenize_column: str = \"inputs\",\n max_length: Optional[int] = None,\n padding: bool = False,\n):\n \"\"\"Perform the tokenization of a row\n Args:\n row: row to tokenize\n tokenizer: tokenizer to use\n tokenize_column: column to tokenize\n max_length: maximum size of the tokenized sequence\n padding: whether to pad the sequence\n \"\"\"\n # there's probably a way to do this with the tokenizer settings\n # but again, gotta move fast\n\n fast_tokenizer = (\n tokenizer.get_pretrained() if isinstance(tokenizer, SAFETokenizer) else tokenizer\n )\n\n return fast_tokenizer(\n row[tokenize_column],\n truncation=(max_length is not None),\n max_length=max_length,\n padding=padding,\n return_tensors=None,\n )\n
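A sketch of how tokenize_fn can be applied with datasets.map, mirroring what get_dataset does internally (the data here is made up):
from functools import partial

import datasets
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import tokenize_fn

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
ds = datasets.Dataset.from_dict({"inputs": ["c1ccccc1", "CCO"]})
ds = ds.map(
    partial(tokenize_fn, tokenizer=tokenizer, tokenize_column="inputs", max_length=64),
    batched=True,
)
print(ds.column_names)  # "inputs" plus the tokenizer outputs (e.g. input_ids, attention_mask)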
"},{"location":"api/safe.viz.html","title":"Visualization","text":""},{"location":"api/safe.viz.html#safe.viz.to_image","title":"to_image(safe_str, fragments=None, legend=None, mol_size=(300, 300), use_svg=True, highlight_mode='lasso', highlight_bond_width_multiplier=12, **kwargs)
","text":"Display a safe string by highlighting the fragments that make it.
Parameters:
Name Type Description Default safe_str
str
the safe string to display
required fragments
Optional[Union[str, Mol]]
list of fragments to highlight on the molecule. If None, the SAFE decomposition of the molecule will be used.
None
legend
Union[str, None]
A string to use as the legend under the molecule.
None
mol_size
Union[Tuple[int, int], int]
The size of the image to be returned
(300, 300)
use_svg
Optional[bool]
Whether to return an svg or png image
True
highlight_mode
Optional[str]
the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown
'lasso'
highlight_bond_width_multiplier
int
the multiplier to use for the bond width when using the 'fill' mode
12
**kwargs
Any
Additional arguments to pass to the drawing function. See RDKit documentation related to MolDrawOptions
for more details at https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.
{}
Source code in safe/viz.py
def to_image(\n safe_str: str,\n fragments: Optional[Union[str, dm.Mol]] = None,\n legend: Union[str, None] = None,\n mol_size: Union[Tuple[int, int], int] = (300, 300),\n use_svg: Optional[bool] = True,\n highlight_mode: Optional[str] = \"lasso\",\n highlight_bond_width_multiplier: int = 12,\n **kwargs: Any,\n):\n \"\"\"Display a safe string by highlighting the fragments that make it.\n\n Args:\n safe_str: the safe string to display\n fragments: list of fragment to highlight on the molecules. If None, will use safe decomposition of the molecule.\n legend: A string to use as the legend under the molecule.\n mol_size: The size of the image to be returned\n use_svg: Whether to return an svg or png image\n highlight_mode: the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown\n highlight_bond_width_multiplier: the multiplier to use for the bond width when using the 'fill' mode\n **kwargs: Additional arguments to pass to the drawing function. See RDKit\n documentation related to `MolDrawOptions` for more details at\n https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.\n\n \"\"\"\n\n kwargs[\"legends\"] = legend\n kwargs[\"mol_size\"] = mol_size\n kwargs[\"use_svg\"] = use_svg\n if highlight_bond_width_multiplier is not None:\n kwargs[\"highlightBondWidthMultiplier\"] = highlight_bond_width_multiplier\n\n if highlight_mode == \"color\":\n kwargs[\"continuousHighlight\"] = False\n kwargs[\"circleAtoms\"] = kwargs.get(\"circleAtoms\", False) or False\n\n if isinstance(fragments, (str, dm.Mol)):\n fragments = [fragments]\n\n if fragments is None and highlight_mode is not None:\n fragments = [\n sf.decode(x, as_mol=False, remove_dummies=False, ignore_errors=False)\n for x in safe_str.split(\".\")\n ]\n elif fragments and len(fragments) > 0:\n parsed_fragments = []\n for fg in fragments:\n if isinstance(fg, str) and dm.to_mol(fg) is None:\n fg = sf.decode(fg, as_mol=False, remove_dummies=False, ignore_errors=False)\n parsed_fragments.append(fg)\n fragments = parsed_fragments\n else:\n fragments = []\n mol = dm.to_mol(safe_str, remove_hs=False)\n cm = plt.get_cmap(\"gist_rainbow\")\n current_colors = [cm(1.0 * i / len(fragments)) for i in range(len(fragments))]\n\n if highlight_mode == \"lasso\":\n return dm.viz.lasso_highlight_image(mol, fragments, **kwargs)\n\n atom_indices = []\n bond_indices = []\n atom_colors = {}\n bond_colors = {}\n\n for i, frag in enumerate(fragments):\n frag = dm.from_smarts(frag)\n atom_matches, bond_matches = dm.substructure_matching_bonds(mol, frag)\n atom_matches = list(itertools.chain(*atom_matches))\n bond_matches = list(itertools.chain(*bond_matches))\n atom_indices.extend(atom_matches)\n bond_indices.extend(bond_matches)\n atom_colors.update({x: current_colors[i] for x in atom_matches})\n bond_colors.update({x: current_colors[i] for x in bond_matches})\n\n return dm.viz.to_image(\n mol,\n highlight_atom=[atom_indices],\n highlight_bond=[bond_indices],\n highlightAtomColors=[atom_colors],\n highlightBondColors=[bond_colors],\n **kwargs,\n )\n
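A short usage sketch (the SMILES is the celecoxib example used elsewhere in these docs): encode a molecule to SAFE and render it with one of the highlight modes.
import datamol as dm
import safe as sf

celecoxib = dm.to_mol("Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1")
safe_str = sf.encode(celecoxib)
sf.to_image(safe_str, highlight_mode="fill", legend="SAFE fragments", mol_size=(400, 300))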
"},{"location":"tutorials/design-with-safe.html","title":"Molecular design","text":"In\u00a0[2]: Copied! import os\n\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n\n\nimport safe as sf\nimport datamol as dm\nimport os os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\" import safe as sf import datamol as dm
Load the default pretrained Safe model.
We will use this single model for all the downstream molecular design tasks.
In\u00a0[3]: Copied!designer = sf.SAFEDesign.load_default(verbose=True)\n\ndesigner.model\ndesigner = sf.SAFEDesign.load_default(verbose=True) designer.model Out[3]:
SAFEDoubleHeadsModel(\n (transformer): GPT2Model(\n (wte): Embedding(1880, 768)\n (wpe): Embedding(1024, 768)\n (drop): Dropout(p=0.1, inplace=False)\n (h): ModuleList(\n (0-11): 12 x GPT2Block(\n (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (attn): GPT2Attention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n )\n (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (mlp): GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n )\n )\n )\n (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n )\n (lm_head): Linear(in_features=768, out_features=1880, bias=False)\n (multiple_choice_head): PropertyHead(\n (summary): Linear(in_features=768, out_features=64, bias=True)\n (activation): ReLU()\n (out): Linear(in_features=64, out_features=1, bias=True)\n )\n)
Let's start with the molecule below.
In\u00a0[4]: Copied!candidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\"\ncandidate_mol = dm.to_mol(candidate_smiles)\n\ndm.to_image(candidate_mol)\ncandidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\" candidate_mol = dm.to_mol(candidate_smiles) dm.to_image(candidate_mol) Out[4]: In\u00a0[6]: Copied!
generated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12)\n\ngenerated_smiles[:5]\ngenerated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12) generated_smiles[:5]
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:25.393 | INFO | safe.sample:de_novo_generation:581 - After sanitization, 82 / 100 (82.00 %) generated molecules are valid !\nOut[6]:
['CCCCOc1c(Br)cc(C)cc1-c1nc(C2(CC)CCN(C(C)C)CC2)cn2nc(C)nc12',\n 'CC(C)(C)OC(=O)Nc1ccc(C[NH+]2CC[C@@H]3OCCC[C@H]3C2)cn1',\n 'Cc1ccc(Br)c(NCCC(C)C(C)C)c1',\n 'CCOC(=O)C1=C(C)N=c2s/c(=C/c3c(C)[nH]c4ccccc34)c(=O)n2[C@@H]1c1ccc(OC)cc1',\n 'CCc1ccccc1-n1cc(O)c(C(=O)Nc2ccc(Cl)c(F)c2)n1']In\u00a0[7]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[7]: In\u00a0[8]: Copied!
scaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\"\n\ndm.to_image(scaffold)\nscaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\" dm.to_image(scaffold) Out[8]: In\u00a0[9]: Copied!
generated_smiles = designer.scaffold_decoration(\n scaffold=scaffold,\n n_samples_per_trial=12,\n n_trials=2,\n sanitize=True,\n do_not_fragment_further=True,\n)\n\ngenerated_mols = [dm.to_mol(x) for x in generated_smiles]\ngenerated_smiles = designer.scaffold_decoration( scaffold=scaffold, n_samples_per_trial=12, n_trials=2, sanitize=True, do_not_fragment_further=True, ) generated_mols = [dm.to_mol(x) for x in generated_smiles]
0%| | 0/2 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:48.620 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 21 / 24 (87.50 %) generated molecules are valid !\nIn\u00a0[10]: Copied!
dm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1)\ndm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1) Out[10]: In\u00a0[11]: Copied!
superstructure = \"c1ccc2ncncc2c1\"\n\ndm.to_image(superstructure)\nsuperstructure = \"c1ccc2ncncc2c1\" dm.to_image(superstructure) Out[11]: In\u00a0[12]: Copied!
generated_smiles = designer.super_structure(\n core=superstructure,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n attachment_point_depth=3,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.super_structure( core=superstructure, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, attachment_point_depth=3, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:38:24.884 | INFO | safe.sample:super_structure:496 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[12]:
['c1ncc2c(N3CCOCC3)ccc(N3CCNCC3)c2n1',\n 'N[C@H](CNc1ccc(C(F)(F)F)c2ncncc12)C(F)(F)F',\n 'C=CCCCNC(=S)Nc1ccc(C(F)(F)F)c2cncnc12',\n 'O=C(N[C@@H](CO)CCF)c1ccc(C(=O)[O-])c2ncncc12',\n 'O=C(CC=Nc1ccc(OC(F)(F)F)c2ncncc12)C(F)(F)F',\n 'NC(=Nc1ccc([N+](=O)[O-])c2cncnc12)C(F)(F)F',\n 'O=C(CCC(F)=C(F)F)Nc1ccc(C(F)(F)F)c2ncncc12',\n 'O=S(=O)(CCC(F)(F)F)Nc1cccc2cncnc12',\n 'O=S(=O)(Cl)c1ccc(C(F)(F)F)c2ncncc12',\n 'c1ncc2c(N3CCCCCC3)ccc(-c3cn[nH]c3)c2n1',\n 'NC(=O)CSCC(=O)Nc1ccc(C(=O)[O-])c2ncncc12',\n 'c1ncc2c(-n3cncn3)ccc(C3CCCCN3)c2n1']In\u00a0[14]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[14]: In\u00a0[15]: Copied!
motif = \"[*]-N1CCCCC1\"\n\ndm.to_image(motif)\nmotif = \"[*]-N1CCCCC1\" dm.to_image(motif) Out[15]: In\u00a0[26]: Copied!
# let's make some long sequence\ngenerated_smiles = designer.motif_extension(\n motif=motif,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n min_length=25,\n max_length=80,\n)\n\ngenerated_smiles\n# let's make some long sequence generated_smiles = designer.motif_extension( motif=motif, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, min_length=25, max_length=80, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:41:52.959 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 10 / 12 (83.33 %) generated molecules are valid !\nOut[26]:
['C1CCN([C@@H]2CCCC[C@@H]2[NH+]2CCOCC2)CC1',\n 'FC(F)(F)C(F)(F)CN1CCCCC1',\n 'O=NN(/C(=C/N1CCCCC1)N1CCCCC1)c1ccccc1',\n 'C1CCC(CC2(CC3CCCC3)CCCCC2C2CCCCCC2N2CCCCC2)CC1',\n '[Na+].[Na+].[O-]S(=S)(=S)N1CCCCC1',\n 'NC(CS)C(O)=NC(O)C(=O)N1CCCCC1',\n 'O=P(O)(O)CCOCCOP(=O)(O)SCCN1CCCCC1',\n 'C1CCN(N=c2nn[nH][nH]2)CC1.O.O',\n 'N.N#CC1C=CCN1N1CCCCC1',\n 'O=C1CCCCC1.O=C1COCCCN1N1CCCCC1']In\u00a0[27]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[27]: In\u00a0[28]: Copied!
side_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\"\n\ndm.to_image(side_chains)\nside_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\" dm.to_image(side_chains) Out[28]: In\u00a0[29]: Copied!
generated_smiles = designer.scaffold_morphing(\n side_chains=side_chains,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ndm.to_image(generated_smiles[:12], mol_size=(350, 200))\ngenerated_smiles = designer.scaffold_morphing( side_chains=side_chains, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) dm.to_image(generated_smiles[:12], mol_size=(350, 200))
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:05.888 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[29]: In\u00a0[30]: Copied!
linker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"]\n\ndm.to_image(linker_generation)\nlinker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"] dm.to_image(linker_generation) Out[30]: In\u00a0[31]: Copied!
generated_smiles = designer.linker_generation(\n *linker_generation,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.linker_generation( *linker_generation, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:14.034 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[31]:
['O=C(Oc1cccc(-c2nc(N3CCCCC3)nc3c2CCN3)c1)c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'O=C(Oc1cccc(-c2nc(-c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)nc3c2CCN3)c1)N1CCCCC1',\n 'N=C(N)NCCCN1C(=O)N(CN2CCCCC2)C(=O)C2CC(c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)CC21',\n 'N=C(N)NCCCN1C(=O)N(Cc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)C(=O)C2CC(N3CCCCC3)CC21',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc5c4oc4c6ccccc6c(Nc6cccc(N7CCCCC7)c6)cc54)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc(Nc5cc6c7cccc(N8CCCCC8)c7oc6c6ccccc56)c4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cc(-c5nc6n(n5)CC=C[C@H]6N5CCCCC5)ncn4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc([C@@H]4C=CCn5nc(-c6cc(N7CCCCC7)ncn6)nc54)cc23)c1',\n 'O=C1C[C@@H]2C[C@H]3[C@H](N4CCCCC4)CC4COCCC42O[C@@H]3CC(CCc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)O1',\n 'O=C1C[C@@H]2C[C@@H]3[C@@H](CC(CCN4CCCCC4)O1)OC21CCOCC1C[C@H]3c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'Brc1cccc(Nc2ncnc3ccc(NNc4ccc(SCCCCCCc5ccc(N6CCCCC6)cc5)cc4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4ccc(CCCCCCSc5ccc(NNN6CCCCC6)cc5)cc4)cc23)c1']In\u00a0[32]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[32]:
The End !
"},{"location":"tutorials/design-with-safe.html#de-novo-generation","title":"De novo generation\u00b6","text":"Generation of novel molecules without any constraints.
"},{"location":"tutorials/design-with-safe.html#scaffold-decoration","title":"Scaffold Decoration\u00b6","text":"For scaffold decoration, we wish to generate new molecules that would contain a given scaffold as core. Usually, the attachment point on the scaffold should dictate where the new vectors will be added.
"},{"location":"tutorials/design-with-safe.html#super-structure-generation","title":"Super structure generation\u00b6","text":"In super structure generation, we just want to generate superstructure of a molecular subgraph
"},{"location":"tutorials/design-with-safe.html#motif-extension","title":"Motif Extension\u00b6","text":"In motif extension, we are interested in generating a molecule containing a given motif as starting point.
"},{"location":"tutorials/design-with-safe.html#scaffold-morphing","title":"Scaffold Morphing\u00b6","text":"In scaffold morphing, we wish to replace a scaffold by another one in a molecule. The process requires as input that the user provides either the side chains or the input molecules and the core
"},{"location":"tutorials/design-with-safe.html#linker-generation","title":"Linker generation\u00b6","text":"Linker generation is mostly the same thing as scaffold morphing ...
"},{"location":"tutorials/extracting-representation-molfeat.html","title":"so really we just need our custom converter","text":"In\u00a0[1]: Copied!%load_ext autoreload\n%autoreload 2\n%load_ext autoreload %autoreload 2 In\u00a0[2]: Copied!
import safe\nimport torch\nimport datamol as dm\nimport types\nfrom molfeat.trans.pretrained import PretrainedMolTransformer\nfrom molfeat.trans.pretrained import PretrainedHFTransformer\n\nfrom molfeat.trans.pretrained.hf_transformers import HFModel\nfrom safe.trainer.model import SAFEDoubleHeadsModel\nfrom safe.tokenizer import SAFETokenizer\nimport safe import torch import datamol as dm import types from molfeat.trans.pretrained import PretrainedMolTransformer from molfeat.trans.pretrained import PretrainedHFTransformer from molfeat.trans.pretrained.hf_transformers import HFModel from safe.trainer.model import SAFEDoubleHeadsModel from safe.tokenizer import SAFETokenizer In\u00a0[3]: Copied!
safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")
We now need to build the molfeat
's HFModel
instance by wrapping our model.
safe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\nsafe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())
You can put the above process in the __init__
of the SAFEMolTransformer
if you wish as we will be doing below.
class SAFEMolTransformer(PretrainedHFTransformer):\n \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is \n how we convert the input molecules into the safe format\"\"\"\n def __init__(self, kind=None, notation=\"safe\", **kwargs):\n if kind is None:\n # we load the default SAFE model if the exact SAFE GPT model \n # to use is not provided\n safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\n safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\n super().__init__(kind, notation=None, **kwargs)\n # now we change the internal converter\n # overriding the internal converter of SmilesConverter leverages the exception handling\n # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS\n # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds.\n self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe)\n # you could also do any of the following:\n # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode)\n # self.converter = safe # the safe module\nclass SAFEMolTransformer(PretrainedHFTransformer): \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is how we convert the input molecules into the safe format\"\"\" def __init__(self, kind=None, notation=\"safe\", **kwargs): if kind is None: # we load the default SAFE model if the exact SAFE GPT model # to use is not provided safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\") kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained()) super().__init__(kind, notation=None, **kwargs) # now we change the internal converter # overriding the internal converter of SmilesConverter leverages the exception handling # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds. self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe) # you could also do any of the following: # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode) # self.converter = safe # the safe module
2023-12-20 22:57:39.310 | WARNING | molfeat.trans.base:__init__:51 - The 'SAFEMolTransformer' interaction has been superseded by a new class with id 0x2ad77d6a0\n
Let's use the GPT pooler, which uses the last non-padding token (often eos) since the model is GPT2-like. For other options, see: https://molfeat-docs.datamol.io/stable/api/molfeat.utils.html#pooling
# Let's use the GPT pooling method and only take the last hidden layer\nsafe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1])\nsafe_transformers\n# Let's use the GPT pooling method and only take the last hidden layer safe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1]) safe_transformers Out[116]:
SAFEMolTransformer(dtype=np.float32)
SAFEMolTransformer(dtype=np.float32)In\u00a0[117]: Copied!
mols = dm.data.freesolv().iloc[:10].smiles.values\nmols = dm.data.freesolv().iloc[:10].smiles.values In\u00a0[118]: Copied!
safe_transformers(mols)\nsafe_transformers(mols) Out[118]:
array([[ 0.05216356, 0.10754181, 0.07509107, ..., 0.04756968,\n -0.08228929, -0.11568106],\n [ 0.02449008, 0.04048932, 0.14489463, ..., 0.11410899,\n -0.02203353, 0.08706839],\n [-0.07425696, 0.11859665, 0.19010407, ..., 0.10526019,\n 0.08878426, -0.06609854],\n ...,\n [ 0.07867863, 0.19300285, 0.23054805, ..., -0.00737952,\n 0.07542405, 0.00289541],\n [ 0.12092628, -0.01785688, 0.19791883, ..., 0.13796932,\n 0.11520796, -0.15333697],\n [-0.02005584, 0.13946685, 0.18568742, ..., 0.07080407,\n 0.06991849, -0.07151204]], dtype=float32)In\u00a0[119]: Copied!
from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\n\ndf = dm.data.freesolv()\ndf[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode)\ndf = df.dropna(subset=\"safe\")\n# we have to remove the molecules that cannot be converted \n# (no breakable bonds with our default methodology)\nfrom sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline df = dm.data.freesolv() df[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode) df = df.dropna(subset=\"safe\") # we have to remove the molecules that cannot be converted # (no breakable bonds with our default methodology) In\u00a0[120]: Copied!
X, y = df[\"smiles\"].values, df[\"expt\"].values\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2)\n\n# The Molfeat transformer seemingly integrates with Scikit-learn Pipeline!\npipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())])\nX, y = df[\"smiles\"].values, df[\"expt\"].values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2) # The Molfeat transformer seemingly integrates with Scikit-learn Pipeline! pipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())]) In\u00a0[121]: Copied!
with dm.without_rdkit_log():\n pipe.fit(X_train, y_train)\n score = pipe.score(X_test, y_test)\n y_pred = pipe.predict(X_test)\nwith dm.without_rdkit_log(): pipe.fit(X_train, y_train) score = pipe.score(X_test, y_test) y_pred = pipe.predict(X_test) In\u00a0[122]: Copied!
print(\"R2 score:\", score)\nprint(\"R2 score:\", score)
R2 score: 0.4971483821661925\nIn\u00a0[123]: Copied!
import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots()\nax.scatter(y_test, y_pred)\nax.set_xlabel(\"Target\")\nax.set_ylabel(\"Preds\")\nimport matplotlib.pyplot as plt fig, ax = plt.subplots() ax.scatter(y_test, y_pred) ax.set_xlabel(\"Target\") ax.set_ylabel(\"Preds\") Out[123]:
Text(0, 0.5, 'Preds')
Not really a great result. Any other model in molfeat
would do better.
Because the SAFE model is not a standard HuggingFace transformers
model, we need to wrap it.
Why are we doing this? Because we want to leverage the structure of molfeat
and not have to write our own pooling for the model. This can be done by using the huggingface molecule transformer PretrainedHFTransformer
rather than the general purpose pretrained model class PretrainedMolTransformer
where we will have to define our own _embed
and _convert
functions.
We have multiple options here: we can override the _convert
method or even the _embed
method, but the best thing about molfeat
is how flexible it is and all the shortcuts it provides.
In this case, we just need to change the custom converter.
"},{"location":"tutorials/extracting-representation-molfeat.html#so-really-we-just-need-our-custom-converter","title":"so really we just need our custom converter\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#basic-test","title":"Basic Test\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#tips","title":"Tips\u00b6","text":"None
molecules at some steps in the conversion to SAFE. This can happen if your slicing algorithm of choice is not working. In that case, please filter your datasets to remove molecules that fail the encoding steps first. You can always use the very robust safe.utils.convert_to_safe
, which augment default BRICS slicing with some graph partitioning algorithm.import safe as sf\nimport datamol as dm\n\ncelecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\"\ncelecoxib_mol = dm.to_mol(celecoxib)\n\ndisplay(dm.to_image(celecoxib_mol))\nimport safe as sf import datamol as dm celecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\" celecoxib_mol = dm.to_mol(celecoxib) display(dm.to_image(celecoxib_mol)) In\u00a0[3]: Copied!
safe_str = sf.encode(celecoxib_mol)\n\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\nRepresentation using 4 fragments\n
SAFE string are SMILES
Any SAFE string is a valid SMILES and can be read by RDKit without any decoding trick.
In\u00a0[4]: Copied!reconstructed = dm.to_mol(safe_str)\n\ndisplay(dm.to_image(reconstructed))\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nreconstructed = dm.to_mol(safe_str) display(dm.to_image(reconstructed)) assert dm.same_mol(celecoxib_mol, reconstructed)
SAFE supports randomization
You can generate randomized SAFE strings.
In\u00a0[5]: Copied!random_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True)\n\nprint(random_safe_str)\n\nreconstructed = dm.to_mol(safe_str)\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nrandom_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True) print(random_safe_str) reconstructed = dm.to_mol(safe_str) assert dm.same_mol(celecoxib_mol, reconstructed)
c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1\n
Fragment order in SAFE does not matter
Any permutation of the fragment order in a SAFE string preserve the molecule identity
In\u00a0[6]: Copied!import numpy as np\n\nfragments = safe_str.split(\".\")\nrandomized_fragment_safe_str = np.random.permutation(fragments).tolist()\nrandomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str)\n\nprint(randomized_fragment_safe_str, safe_str)\nassert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)\nimport numpy as np fragments = safe_str.split(\".\") randomized_fragment_safe_str = np.random.permutation(fragments).tolist() randomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str) print(randomized_fragment_safe_str, safe_str) assert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)
c14ccc(S(N)(=O)=O)cc1.c15cc3nn14.Cc1ccc5cc1.C3(F)(F)F c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\n
Use your own slicing logic
By default SAFE strings are generated using BRICS
, however, the following slicers are also supported: hr, recap, mmpa, and attach.
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atom indices corresponding to the bonds to break.
In\u00a0[7]: Copied!def my_slicer(mol):\n \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\"\n for bond in mol.GetBonds():\n if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()):\n yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())\ndef my_slicer(mol): \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\" for bond in mol.GetBonds(): if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()): yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) In\u00a0[9]: Copied!
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer)\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14cc(C(F)(F)F)nn13.c13ccc(S(N)(=O)=O)cc1.Cc1ccc4cc1\nRepresentation using 3 fragments\n
Or simply use a SMARTS or a list of SMARTS.
In\u00a0[11]: Copied!# The above is equivalent to using the following SMARTS:\nsmart_slicer = [\"[r]-;!@[r]\"]\nsafe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer)\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\n# The above is equivalent to using the following SMARTS: smart_slicer = [\"[r]-;!@[r]\"] safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c13cc(C(F)(F)F)nn14.c14ccc(S(N)(=O)=O)cc1.Cc1ccc3cc1\nRepresentation using 3 fragments\nIn\u00a0[13]: Copied!
safe_fragment = safe_str.split(\".\")\nsafe_fragment\nsafe_fragment = safe_str.split(\".\") safe_fragment Out[13]:
['c13cc(C(F)(F)F)nn14', 'c14ccc(S(N)(=O)=O)cc1', 'Cc1ccc3cc1']In\u00a0[14]: Copied!
# the following will fail\ndm.to_mol(safe_fragment[0])\n# the following will fail dm.to_mol(safe_fragment[0])
[11:20:14] SMILES Parse Error: unclosed ring for input: 'c13cc(C(F)(F)F)nn14'\nIn\u00a0[15]: Copied!
# while this works\nsf.decode(safe_fragment[0], as_mol=True)\n# while this works sf.decode(safe_fragment[0], as_mol=True) Out[15]: In\u00a0[16]: Copied!
# if you want to keep the attachment points, then use remove_dummies=False\nsf.decode(safe_fragment[0], as_mol=True, remove_dummies=False)\n# if you want to keep the attachment points, then use remove_dummies=False sf.decode(safe_fragment[0], as_mol=True, remove_dummies=False) Out[16]: In\u00a0[17]: Copied!
sf.to_image(safe_str)\nsf.to_image(safe_str) Out[17]:
There are 3 display modes for highlighting the fragments in a SAFE string. The difference between those modes is highlighted below using two different slicing algorithms.
Overlapping fragments
Note that because some fragments might match overlapping substructures of the molecule (for example, the same fragment appearing multiple times in the molecule), the highlighting might assign the same color to these fragments.
In\u00a0[18]: Copied!from IPython.display import display\nfrom ipywidgets import widgets, HBox\n\ndef display_image(safe_str):\n image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"lasso\", legend=\"lasso mode\").data.encode(), format='svg+xml')\n image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"fill\", legend=\"fill mode\").data.encode(), format='svg+xml')\n image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"color\", legend=\"color mode\").data.encode(), format='svg+xml')\n hbox = HBox([image_lasso, image_fill, image_color])\n display(hbox)\nfrom IPython.display import display from ipywidgets import widgets, HBox def display_image(safe_str): image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"lasso\", legend=\"lasso mode\").data.encode(), format='svg+xml') image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"fill\", legend=\"fill mode\").data.encode(), format='svg+xml') image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"color\", legend=\"color mode\").data.encode(), format='svg+xml') hbox = HBox([image_lasso, image_fill, image_color]) display(hbox) In\u00a0[19]: Copied!
# display for brics\nsafe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer=\"brics\")\ndisplay_image(safe_str_brics)\n# display for brics safe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer=\"brics\") display_image(safe_str_brics)
HBox(children=(Image(value=b'<svg xmlns=\"http://www.w3.org/2000/svg\" ...', format='svg+xml'), Image(value=b'<s\u2026In\u00a0[20]: Copied!
# display with HR\nsafe_str_hr = sf.encode(celecoxib_mol, canonical=True, slicer=\"mmpa\")\ndisplay_image(safe_str_hr)\n# display with HR safe_str_hr = sf.encode(celecoxib_mol, canonical=True, slicer=\"mmpa\") display_image(safe_str_hr)
HBox(children=(Image(value=b'<svg xmlns=\"http://www.w3.org/2000/svg\" ...', format='svg+xml'), Image(value=b'<s\u2026
The End !
"},{"location":"tutorials/getting-started.html#getting-started-with-safe","title":"Getting Started with SAFE\u00b6","text":"The SAFE encoding format is a rewriting of SMILES to ensure that any molecule can be written as a sequence of fragments where atoms or tokens corresponding to given fragments form a substring (ontiguous sequence) in the line notation representation.
SAFE addresses some of the limitations of SMILES strings when it comes to generative design:
Safe Others - native support for (sub)structure-constrained design - different generative models for different generative tasks - extensive substructure matching for filtering after generation - multiple steps generative process (e.g Liao et al. 2023 ) - graph based approaches with their limitations - any molecule generation as a simple NLP task (sequence completion or mask filling) - a single autoregressive sequence model for both linker generation and scaffold decoration. - complex training and decoding schemes for scaffold-constrained generation (e.g Ar\u00fas-Pous et al. 2020 ) - complex sampling algorithms for scaffold-constrained generation (e.g Langevin et al. 2020) - SAFE strings are SMILES strings - requires a different chemical language (e.g Krenn et al. 2022)"},{"location":"tutorials/getting-started.html#using-safe","title":"Using SAFE\u00b6","text":"In the following we will highlight how to use SAFE and some of the properties of SAFE strings.
"},{"location":"tutorials/getting-started.html#encoding","title":"Encoding\u00b6","text":"SAFE represents fragments
SAFE represents molecules as a set of N fragments written as [Fragment_1].[Fragment_i].[Fragment_N]
"},{"location":"tutorials/getting-started.html#decoding","title":"Decoding\u00b6","text":"Fragment order in SAFE does not matter
Each SAFE fragment
is a valid molecule itself; however, you need to use the decoder to recover molecules whose attachment points are not all fulfilled.
We provide a visualization module to display a safe string, with highlighting of all the fragments that compose it.
"},{"location":"tutorials/how-it-works.html","title":"How SAFE encoding works?","text":"In\u00a0[1]: Copied!import datamol as dm\n\nfrom rdkit import Chem\nfrom rdkit.Chem.Draw import rdDepictor\nfrom rdkit.Chem import rdChemReactions as rdr\nrdDepictor.SetPreferCoordGen(True)\nimport datamol as dm from rdkit import Chem from rdkit.Chem.Draw import rdDepictor from rdkit.Chem import rdChemReactions as rdr rdDepictor.SetPreferCoordGen(True) In\u00a0[2]: Copied!
smiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"]\nlegends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"]\ndm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True)\nsmiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"] legends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"] dm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True) Out[2]:
In the example above, we can see that anisole (methoxybenzene) can be represented as two fragments that can be connected given the proper attachment points. To achieve this, we want to attach two fragments together (the methoxy and the phenyl groups). In RDKit, this is usually done with chemical reactions. For convenience, we will prefer a standardized representation of attachment points that includes an atom mapping.
smiles = ['c1cc(*)ccc1.O(*)C', 'c1cc([*:1])ccc1.O([*:1])C']
dm.viz.to_image([dm.to_mol(x) for x in smiles], n_cols=len(smiles), use_svg=True)
To attach the two fragments, we can write a simple chemical transformation. Since SMARTS and SMILES syntax do not mix very well when it comes to *, we will use the isotopic representation [1*] instead of the atom-mapped [*:1].
rxn = rdr.ReactionFromSmarts(\"[1*][*:1].[1*][*:2]>>[*:1][*:2]\")\nrxn\nrxn = rdr.ReactionFromSmarts(\"[1*][*:1].[1*][*:2]>>[*:1][*:2]\") rxn Out[4]: In\u00a0[5]: Copied!
# replace the atom maps by isotopes
phenyl = "c1cc([*:1])ccc1".replace("[*:1]", "[1*]")
methoxy = "O([*:1])C".replace("[*:1]", "[1*]")

# run the reaction
prod = rxn.RunReactants((dm.to_mol(phenyl), dm.to_mol(methoxy)))
prod[0][0]
We can achieve the same result with the RDKit API in a slightly more tedious way.
replacement_sub = Chem.MolFromSmarts("[1*]")
prod = Chem.ReplaceSubstructs(dm.to_mol(phenyl), replacement_sub, dm.to_mol(methoxy), replacementConnectionPoint=0)
prod = dm.remove_dummies(prod[0], dummy="[1*]")
prod
But wait, could we attach the fragments using only string operations on the SMILES? It is not possible with a naive substring replacement alone, but recall that we just said numbers in SMILES represent connectivity points?
phenyl = \"c1cc([*:1])ccc1\"\nmethoxy = \"O([*:1])C\"\ncomposite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C\ncompo = dm.to_mol(composite)\nphenyl = \"c1cc([*:1])ccc1\" methoxy = \"O([*:1])C\" composite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C compo = dm.to_mol(composite)
Since the connectivity point 1 is already used in the phenyl group, we need to open a new connectivity point: 2.
attached_composite = composite.replace("[*:1]", "2")
dm.to_mol(attached_composite)
[11:14:10] SMILES Parse Error: syntax error while parsing: c1cc(2)ccc1.O(2)C
[11:14:10] SMILES Parse Error: Failed parsing SMILES 'c1cc(2)ccc1.O(2)C' for input: 'c1cc(2)ccc1.O(2)C'
The previous line does not work because it violates the SMILES syntax: we are not taking into account the branching parentheses surrounding the attachment point. We could regenerate the SMILES, or scan the sequence and remove the parentheses where possible, but we want to limit ourselves to str.replace operations. So let's try again.
attached_composite = composite.replace("([*:1])", "2").replace("[*:1]", "2")
dm.to_image(attached_composite, legends=[attached_composite])
You can see that the anisole molecule is now represented as two "fragments", [Fragment1].[Fragment2]. That is what SAFE is about.
In summary, to build a SAFE string, we just need to follow the steps below (a minimal sketch putting them together follows):
1. Write each fragment with explicit attachment points (e.g. [*:1]).
2. Concatenate the fragments with the . separator.
3. Replace each pair of matching attachment points with an unused ring-closure number, dropping the surrounding branching parentheses.
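Here is a minimal sketch stringing those steps together with nothing but str.replace; naive_safe_join is a hypothetical helper written for this tutorial, not part of the safe package, and it only handles a single shared attachment point:

import datamol as dm

def naive_safe_join(frag_a: str, frag_b: str, closure: str = "2") -> str:
    """Attach two fragments that share the attachment point [*:1]."""
    # step 2: concatenate the fragments with the '.' separator
    composite = frag_a + "." + frag_b
    # step 3: replace the attachment points with an unused ring-closure number,
    # dropping the branching parentheses when present
    return composite.replace("([*:1])", closure).replace("[*:1]", closure)

safe_like = naive_safe_join("c1cc([*:1])ccc1", "O([*:1])C")
print(safe_like)                           # c1cc2ccc1.O2C
print(dm.to_smiles(dm.to_mol(safe_like)))  # canonical SMILES of anisole (e.g. COc1ccccc1)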
The End!
"},{"location":"tutorials/how-it-works.html#how-safe-encoding-works","title":"How SAFE encoding works?\u00b6","text":"The intuition behind safe is quite simple: we want to represent any molecule as a set of connected fragments
.
Let's start by revisiting some points about the SMILES syntax:
- An asterisk * in a SMILES string is usually employed to indicate any atom or an attachment point for any group. It is particularly useful for SMARTS matching.
- A number in the SMILES syntax indicates a connectivity point between two atoms; two-digit numbers must be preceded by %. This is partially explained in the ring section of the Wikipedia article on SMILES.
- A . in SMILES indicates the presence of additional fragments and is used to separate them. A good resource on the subject is the DAYLIGHT page.
We illustrate these points below!
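For instance, a quick sketch of these three rules with RDKit/datamol, complementing the notebook cells of this page:

import datamol as dm
from rdkit import Chem

dm.to_mol("C1CCCCC1")              # the two '1' digits bond the first and last carbons into a ring (cyclohexane)
dm.to_mol("C%10CCCCC%10")          # two-digit ring-closure numbers are written with '%'
dm.to_mol("c1ccccc1.OC")           # '.' separates two disconnected fragments (benzene and methanol)
Chem.MolFromSmarts("c1cc(*)ccc1")  # '*' marks a wildcard atom / attachment point, handy for SMARTS matching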
"}]} \ No newline at end of file diff --git a/main/sitemap.xml.gz b/main/sitemap.xml.gz index ac67dee..c3ca0da 100644 Binary files a/main/sitemap.xml.gz and b/main/sitemap.xml.gz differ