SAFEDesign
Molecular generation using SAFE pretrained model
Source code in `safe/sample.py`
__init__(model, tokenizer, generation_config=None, safe_encoder=None, verbose=True)
SAFEDesign constructor

Info

Design methods in SAFE are not deterministic at the token sampling step. If a method accepts a `random_seed`, that seed applies to the SAFE-related algorithms, not to the sampling from the autoregressive model. To get deterministic sampling, please set the seed at the `transformers` package level.
```python
import safe as sf
import transformers

my_seed = 100
designer = sf.SAFEDesign(...)

transformers.set_seed(my_seed)  # use this before calling a design function
designer.linker_generation(...)
```
Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `model` | `Union[SAFEDoubleHeadsModel, str]` | input SAFEDoubleHeadsModel to use for generation | required |
| `tokenizer` | `Union[str, SAFETokenizer]` | input SAFETokenizer to use for generation | required |
| `generation_config` | `Optional[Union[str, GenerationConfig]]` | input GenerationConfig to use for generation | `None` |
| `safe_encoder` | `Optional[SAFEConverter]` | custom safe encoder to use | `None` |
| `verbose` | `bool` | whether to print out logging information during generation | `True` |
Source code in `safe/sample.py`
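For illustration, a minimal constructor sketch (not taken from the library's own examples): it assumes that passing strings for `model` and `tokenizer` is enough to load them, and reuses the `datamol-io/safe-gpt` identifier listed in the datasets and models table further down this page.

```python
import safe as sf

# Illustrative only: both `model` and `tokenizer` accept either loaded objects
# or strings (see the Union types above). "datamol-io/safe-gpt" is the default
# model referenced elsewhere in these docs; swap in your own checkpoint or path.
designer = sf.SAFEDesign(
    model="datamol-io/safe-gpt",      # Union[SAFEDoubleHeadsModel, str]
    tokenizer="datamol-io/safe-gpt",  # Union[str, SAFETokenizer]
    generation_config=None,           # optional GenerationConfig
    safe_encoder=None,                # optional custom SAFEConverter
    verbose=True,
)
```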
__mix_sequences(prefix_sequences, suffix_sequences, prefix, suffix, n_samples, mol_linker_slicer)
Use the generated prefix and suffix sequences to form new molecules that merge both. This is the two-step scheme behind scaffold morphing and linker generation.

Args:

- `prefix_sequences`: list of prefix sequences
- `suffix_sequences`: list of suffix sequences
- `prefix`: decoded SMILES of the prefix
- `suffix`: decoded SMILES of the suffix
- `n_samples`: number of samples to generate
Source code in `safe/sample.py`
de_novo_generation(n_samples_per_trial=10, sanitize=False, n_trials=None, **kwargs)
Perform de novo generation using the pretrained SAFE model.

De novo generation is equivalent to not having any prefix.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `n_samples_per_trial` | `int` | number of new molecules to generate | `10` |
| `sanitize` | `bool` | whether to perform sanitization, i.e., check that what is returned matches what was asked | `False` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
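A short usage sketch, assuming the default pretrained model loaded through `load_default` (documented below) and assuming the call returns a list of generated SMILES strings.

```python
import safe as sf

# Load the default pretrained designer (see `load_default` below).
designer = sf.SAFEDesign.load_default(verbose=False)

# Three independent trials of 10 molecules each; sanitize the outputs.
generated = designer.de_novo_generation(
    n_samples_per_trial=10,
    n_trials=3,
    sanitize=True,
)
print(len(generated), generated[:3])
```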
linker_generation(*groups, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, model_only=False, **kwargs)
Perform linker generation using the pretrained SAFE model.

Linker generation is essentially scaffold morphing under the hood.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `groups` | `Union[str, Mol]` | list of fragments to link together; they are joined in the order provided | `()` |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `model_only` | `Optional[bool]` | whether to use only the model's generation ability and nothing more | `False` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
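A hedged sketch of linking two fragments. The fragment SMILES are arbitrary placeholders (depending on your setup they may need explicit attachment points), and the seed only affects the SAFE-side randomization, as noted in the constructor's Info box.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Placeholder fragments, joined in the order they are passed.
frag1 = "c1ccccc1C(=O)N"  # benzamide-like fragment (illustrative)
frag2 = "C1CCNCC1"        # piperidine fragment (illustrative)

linked = designer.linker_generation(
    frag1,
    frag2,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
    random_seed=42,  # seeds the SAFE-side algorithms, not the token sampling
)
```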
load_default(verbose=False, model_dir=None, device=None)
classmethod

Load the default SAFEGenerator model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `verbose` | `bool` | whether to print out logging information during generation | `False` |
| `model_dir` | `Optional[str]` | optional path to a model folder to use instead of the default one; if provided, the tokenizer should also be present in `model_dir` | `None` |
| `device` | `str` | optional device where to move the model | `None` |
Source code in `safe/sample.py`
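A short sketch of the classmethod; the local checkpoint path is hypothetical and only shows the shape of the call, and the device string follows the usual torch convention.

```python
import safe as sf

# Default pretrained model, moved to the GPU.
designer = sf.SAFEDesign.load_default(verbose=False, device="cuda")

# Or load from a local folder (hypothetical path) that also contains the tokenizer.
local_designer = sf.SAFEDesign.load_default(model_dir="./my_safe_checkpoint", device="cpu")
```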
motif_extension(motif, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
Perform motif extension using the pretrained SAFE model.

Motif extension is essentially scaffold decoration under the hood.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `motif` | `Union[str, Mol]` | scaffold (with attachment points) to decorate | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
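A sketch of motif extension; the motif SMILES with its `[*]` attachment point is an illustrative placeholder, not an official example.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative motif: a naphthalene fragment with one open attachment point.
motif = "[*]c1ccc2ccccc2c1"

extended = designer.motif_extension(
    motif,
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
    random_seed=0,
)
```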
scaffold_decoration(scaffold, n_samples_per_trial=10, n_trials=1, do_not_fragment_further=True, sanitize=False, random_seed=None, add_dot=True, **kwargs)
Perform scaffold decoration using the pretrained SAFE model.

For scaffold decoration, we basically start from a prefix carrying the attachment points and first convert that prefix into a valid SAFE string.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `scaffold` | `Union[str, Mol]` | scaffold (with attachment points) to decorate | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules and check that the scaffold is still present | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
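A sketch of scaffold decoration; the scaffold SMILES with two `[*]` attachment points is made up for illustration.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative scaffold with two open attachment points.
scaffold = "[*]c1ccc(CC(=O)N[*])cc1"

decorated = designer.scaffold_decoration(
    scaffold,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,   # also checks that the scaffold is still present
    random_seed=42,
)
```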
scaffold_morphing(side_chains=None, mol=None, core=None, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
Perform scaffold morphing decoration using the pretrained SAFE model.

For scaffold morphing, we try to replace the core with a new one. If side chains are provided, we use them directly; if a combination of molecule and core is provided instead, we use them to extract the side chains before performing the scaffold morphing.

Finding the side chains

The algorithm that finds the side chains from the core assumes the input core has attachment points. Those attachment points are never considered part of the query; they only define where the side chains attach. See `~sf.utils.compute_side_chains` for more information.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `side_chains` | `Optional[Union[Mol, str, List[Union[str, Mol]]]]` | side chains to use for scaffold morphing (the set of fragments is joined as well as possible) | `None` |
| `mol` | `Optional[Union[Mol, str]]` | input molecule when side_chains are not provided | `None` |
| `core` | `Optional[Union[Mol, str]]` | core to morph into another scaffold | `None` |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of randomizations to perform | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
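A sketch of scaffold morphing from a molecule/core pair; both SMILES are placeholders, and the core carries the attachment points required by the side-chain extraction described above.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Placeholder inputs: a full molecule and the core (with attachment points)
# to be replaced; the side chains are extracted internally from this pair.
mol = "CC(=O)Nc1ccc(OCCN2CCCC2)cc1"
core = "[*]c1ccc([*])cc1"

morphed = designer.scaffold_morphing(
    mol=mol,
    core=core,
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
)
```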
super_structure(core, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, attachment_point_depth=None, **kwargs)
Perform super structure generation using the pretrained SAFE model.

To generate a super structure, we basically just create various attachment points on the input core, then perform scaffold decoration.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `core` | `Union[str, Mol]` | input substructure to use; we aim to generate super structures of this molecule | required |
| `n_samples_per_trial` | `int` | number of new molecules to generate for each randomization | `10` |
| `n_trials` | `Optional[int]` | number of different attachment points to consider | `1` |
| `do_not_fragment_further` | `Optional[bool]` | whether to prevent further fragmentation of the scaffold | `True` |
| `sanitize` | `bool` | whether to sanitize the generated molecules | `False` |
| `random_seed` | `Optional[int]` | random seed to use | `None` |
| `attachment_point_depth` | `Optional[int]` | depth of opening the attachment points; increasing it increases the number of substitution points to consider | `None` |
| `kwargs` | `Optional[Dict[Any, Any]]` | any argument to provide to the underlying generation function | `{}` |
Source code in `safe/sample.py`
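A sketch of super structure generation around a small core; the quinoline core and the `attachment_point_depth` value are illustrative choices.

```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# Illustrative core: we want generated molecules to contain this substructure.
core = "c1ccc2ncccc2c1"  # quinoline (placeholder)

supers = designer.super_structure(
    core,
    n_samples_per_trial=10,
    n_trials=2,                # number of attachment points to consider
    sanitize=True,
    attachment_point_depth=3,  # illustrative depth for opening attachment points
)
```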
Paper | Docs | 🤗 Model | 🤗 Training Dataset
"},{"location":"index.html#overview-of-safe","title":"Overview of SAFE","text":"
SAFE is the deep learning molecular representation. It's an encoding leveraging a peculiarity in the decoding schemes of SMILES, to allow representation of molecules as a contiguous sequence of connected fragments. SAFE strings are valid SMILES strings, and thus are able to preserve the same amount of information. The intuitive representation of molecules as an ordered sequence of connected fragments greatly simplifies the following tasks often encountered in molecular design:
The construction of a SAFE strings requires defining a molecular fragmentation algorithm. By default, we use [BRICS], but any other fragmentation algorithm can be used. The image below illustrates the process of building a SAFE string. The resulting string is a valid SMILES that can be read by datamol or RDKit.
"},{"location":"index.html#news","title":"News \ud83d\ude80","text":""},{"location":"index.html#20240115","title":"\ud83d\udca5 2024/01/15 \ud83d\udca5","text":"You can install safe
using pip:
pip install safe-mol\n
You can use conda/mamba:
mamba install -c conda-forge safe-mol\n
"},{"location":"index.html#datasets-and-models","title":"Datasets and Models","text":"Type Name Infos Size Comment Model datamol-io/safe-gpt 87M params 350M Default model Training Dataset datamol-io/safe-gpt 1.1B rows 250GB Training dataset Drug Benchmark Dataset datamol-io/safe-drugs 26 rows 20 kB Benchmarking dataset"},{"location":"index.html#usage","title":"Usage","text":"The tutorials in the documentation can help you get started with safe
and SAFE-GPT
.
We summarize some key functions provided by the safe
package below.
| Function | Description |
|----------|-------------|
| `safe.encode` | Translates a SMILES string into its corresponding SAFE string. |
| `safe.decode` | Translates a SAFE string into its corresponding SMILES string. The SAFE decoder simply augments RDKit's `Chem.MolFromSmiles` with an optional correction argument to take care of missing hydrogen bonds. |
| `safe.split` | Tokenizes a SAFE string to build a generative model. |

Examples

Translation between SAFE and SMILES representations

```python
import safe

ibuprofen = "CC(Cc1ccc(cc1)C(C(=O)O)C)C"

# SMILES -> SAFE -> SMILES translation
try:
    ibuprofen_sf = safe.encode(ibuprofen)  # c12ccc3cc1.C3(C)C(=O)O.CC(C)C2
    ibuprofen_smi = safe.decode(ibuprofen_sf, canonical=True)  # CC(C)Cc1ccc(C(C)C(=O)O)cc1
except safe.EncoderError:
    pass
except safe.DecoderError:
    pass

ibuprofen_tokens = list(safe.split(ibuprofen_sf))
```
"},{"location":"index.html#trainingfinetuning-a-new-model","title":"Training/Finetuning a (new) model","text":"A command line interface is available to train a new model, please run safe-train --help
. You can also provide an existing checkpoint to continue training or finetune on you own dataset.
For example:
safe-train --config <path to config> \\\n --model-path <path to model> \\\n --tokenizer <path to tokenizer> \\\n --dataset <path to dataset> \\\n --num_labels 9 \\\n --torch_compile True \\\n --optim \"adamw_torch\" \\\n --learning_rate 1e-5 \\\n --prop_loss_coeff 1e-3 \\\n --gradient_accumulation_steps 1 \\\n --output_dir \"<path to outputdir>\" \\\n --max_steps 5\n
"},{"location":"index.html#references","title":"References","text":"If you use this repository, please cite the following related paper:
@misc{noutahi2023gotta,\n title={Gotta be SAFE: A New Framework for Molecular Design},\n author={Emmanuel Noutahi and Cristian Gabellini and Michael Craig and Jonathan S. C Lim and Prudencio Tossou},\n year={2023},\n eprint={2310.10773},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n
"},{"location":"index.html#license","title":"License","text":"Note that all data and model weights of SAFE are exclusively licensed for research purposes. The accompanying dataset is licensed under CC BY 4.0, which permits solely non-commercial usage. See DATA_LICENSE for details.
This code base is licensed under the Apache-2.0 license. See LICENSE for details.
"},{"location":"index.html#development-lifecycle","title":"Development lifecycle","text":""},{"location":"index.html#setup-dev-environment","title":"Setup dev environment","text":"mamba create -n safe -f env.yml\nmamba activate safe\n\npip install --no-deps -e .\n
"},{"location":"index.html#tests","title":"Tests","text":"You can run tests locally with:
pytest\n
"},{"location":"cli.html","title":"CLI for model Training","text":"You can train a new SAFE
generative models using the provided CLI, which uses \ud83e\udd17 Transformers !
Usage:
safe-train [-h] [--model_path MODEL_PATH] [--config CONFIG] [--tokenizer TOKENIZER] [--num_labels NUM_LABELS]\n [--include_descriptors [INCLUDE_DESCRIPTORS]] [--no_include_descriptors] [--prop_loss_coeff PROP_LOSS_COEFF]\n [--wandb_project WANDB_PROJECT] [--wandb_watch {gradients,all}] [--cache_dir CACHE_DIR]\n [--torch_dtype {auto,bfloat16,float16,float32}] [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] [--model_max_length MODEL_MAX_LENGTH]\n [--dataset DATASET] [--is_tokenized [IS_TOKENIZED]] [--streaming [STREAMING]] [--text_column TEXT_COLUMN] --output_dir\n OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]\n [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}] [--prediction_loss_only [PREDICTION_LOSS_ONLY]]\n [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]\n [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]\n [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]\n [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]\n [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS]\n [--max_steps MAX_STEPS]\n [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}]\n [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}]\n [--log_level_replica {debug,info,warning,error,critical,passive}] [--log_on_each_node [LOG_ON_EACH_NODE]]\n [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]\n [--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]\n [--no_logging_nan_inf_filter] [--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT]\n [--save_safetensors [SAVE_SAFETENSORS]] [--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]]\n [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]\n [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL]\n [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}] [--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]]\n [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}] [--tpu_num_cores TPU_NUM_CORES]\n [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG [DEBUG ...]] [--dataloader_drop_last [DATALOADER_DROP_LAST]]\n [--eval_steps EVAL_STEPS] [--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME]\n [--disable_tqdm DISABLE_TQDM] [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns]\n [--label_names LABEL_NAMES [LABEL_NAMES ...]] [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]\n [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER] [--ignore_data_skip [IGNORE_DATA_SKIP]]\n [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]\n [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED]\n [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]\n [--optim 
{adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]\n [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]]\n [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]\n [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]\n [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] [--no_dataloader_pin_memory]\n [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics]\n [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] [--push_to_hub [PUSH_TO_HUB]]\n [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID]\n [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] [--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]]\n [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]\n [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]\n [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]\n [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO]\n [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT] [--torch_compile [TORCH_COMPILE]]\n [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]\n
Options:
-h, --help show this help message and exit\n--model_path MODEL_PATH\n Optional model path or model name to use as a starting point for the safe model (default: None)\n--config CONFIG Path to the default config file to use for the safe model (default: None)\n--tokenizer TOKENIZER\n--num_labels NUM_LABELS\n Optional number of labels for the descriptors (default: None)\n--include_descriptors [INCLUDE_DESCRIPTORS]\n Whether to train with descriptors if they are available or Not (default: True)\n--no_include_descriptors\n Whether to train with descriptors if they are available or Not (default: False)\n--prop_loss_coeff PROP_LOSS_COEFF\n coefficient for the propery loss (default: 0.01)\n--wandb_project WANDB_PROJECT\n Name of the wandb project to use to log the SAFE model parameter (default: safe-gpt2)\n--wandb_watch {gradients,all}\n Whether to watch the wandb models or not (default: None)\n--cache_dir CACHE_DIR\n Where do you want to store the pretrained models downloaded from s3 (default: None)\n--torch_dtype {auto,bfloat16,float16,float32}\n Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the dtype will be\n automatically derived from the model's weights. (default: None)\n--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]\n It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights\n are loaded.set True will benefit LLM loading time and RAM consumption. Only valid when loading a pretrained model\n (default: False)\n--model_max_length MODEL_MAX_LENGTH\n Maximum sequence length. Sequences will be right padded (and possibly truncated) up to that value. (default: 1024)\n--dataset DATASET Path to the preprocessed dataset to use for the safe model building (default: None)\n--is_tokenized [IS_TOKENIZED]\n whether the dataset submitted as input is already tokenized or not (default: False)\n--streaming [STREAMING]\n Whether to use a streaming dataset or not (default: False)\n--text_column TEXT_COLUMN\n Column containing text data to process. (default: inputs)\n--output_dir OUTPUT_DIR\n The output directory where the model predictions and checkpoints will be written. (default: None)\n--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]\n Overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint\n directory. (default: False)\n--do_train [DO_TRAIN]\n Whether to run training. (default: False)\n--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)\n--do_predict [DO_PREDICT]\n Whether to run predictions on the test set. (default: False)\n--evaluation_strategy {no,steps,epoch}\n The evaluation strategy to use. (default: no)\n--prediction_loss_only [PREDICTION_LOSS_ONLY]\n When performing evaluation and predictions, only returns the loss. (default: False)\n--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for training. (default: 8)\n--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for evaluation. (default: 8)\n--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE\n Deprecated, the use of `--per_device_train_batch_size` is preferred. Batch size per GPU/TPU core/CPU for training.\n (default: None)\n--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE\n Deprecated, the use of `--per_device_eval_batch_size` is preferred. 
Batch size per GPU/TPU core/CPU for evaluation.\n (default: None)\n--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS\n Number of updates steps to accumulate before performing a backward/update pass. (default: 1)\n--eval_accumulation_steps EVAL_ACCUMULATION_STEPS\n Number of predictions steps to accumulate before moving the tensors to the CPU. (default: None)\n--eval_delay EVAL_DELAY\n Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy.\n (default: 0)\n--learning_rate LEARNING_RATE\n The initial learning rate for AdamW. (default: 5e-05)\n--weight_decay WEIGHT_DECAY\n Weight decay for AdamW if we apply some. (default: 0.0)\n--adam_beta1 ADAM_BETA1\n Beta1 for AdamW optimizer (default: 0.9)\n--adam_beta2 ADAM_BETA2\n Beta2 for AdamW optimizer (default: 0.999)\n--adam_epsilon ADAM_EPSILON\n Epsilon for AdamW optimizer. (default: 1e-08)\n--max_grad_norm MAX_GRAD_NORM\n Max gradient norm. (default: 1.0)\n--num_train_epochs NUM_TRAIN_EPOCHS\n Total number of training epochs to perform. (default: 3.0)\n--max_steps MAX_STEPS\n If > 0: set total number of training steps to perform. Override num_train_epochs. (default: -1)\n--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}\n The scheduler type to use. (default: linear)\n--warmup_ratio WARMUP_RATIO\n Linear warmup over warmup_ratio fraction of total steps. (default: 0.0)\n--warmup_steps WARMUP_STEPS\n Linear warmup over warmup_steps. (default: 0)\n--log_level {debug,info,warning,error,critical,passive}\n Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning',\n 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults\n to 'passive'. (default: passive)\n--log_level_replica {debug,info,warning,error,critical,passive}\n Logger log level to use on replica nodes. Same choices and defaults as ``log_level`` (default: warning)\n--log_on_each_node [LOG_ON_EACH_NODE]\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: True)\n--no_log_on_each_node\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: False)\n--logging_dir LOGGING_DIR\n Tensorboard log dir. (default: None)\n--logging_strategy {no,steps,epoch}\n The logging strategy to use. (default: steps)\n--logging_first_step [LOGGING_FIRST_STEP]\n Log the first global_step (default: False)\n--logging_steps LOGGING_STEPS\n Log every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted as\n ratio of total training steps. (default: 500)\n--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]\n Filter nan and inf losses for logging. (default: True)\n--no_logging_nan_inf_filter\n Filter nan and inf losses for logging. (default: False)\n--save_strategy {no,steps,epoch}\n The checkpoint save strategy to use. (default: steps)\n--save_steps SAVE_STEPS\n Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be\n interpreted as ratio of total training steps. (default: 500)\n--save_total_limit SAVE_TOTAL_LIMIT\n If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. 
When\n `load_best_model_at_end` is enabled, the 'best' checkpoint according to `metric_for_best_model` will always be retained in\n addition to the most recent ones. For example, for `save_total_limit=5` and `load_best_model_at_end=True`, the four last\n checkpoints will always be retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,\n it is possible that two checkpoints are saved: the last one and the best one (if they are different). Default is unlimited\n checkpoints (default: None)\n--save_safetensors [SAVE_SAFETENSORS]\n Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: False)\n--save_on_each_node [SAVE_ON_EACH_NODE]\n When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one\n (default: False)\n--no_cuda [NO_CUDA] Do not use CUDA even when it is available (default: False)\n--use_mps_device [USE_MPS_DEVICE]\n This argument is deprecated. `mps` device will be used if available similar to `cuda` device. It will be removed in\n version 5.0 of \ud83e\udd17 Transformers (default: False)\n--seed SEED Random seed that will be set at the beginning of training. (default: 42)\n--data_seed DATA_SEED\n Random seed to be used with data samplers. (default: None)\n--jit_mode_eval [JIT_MODE_EVAL]\n Whether or not to use PyTorch jit trace for inference (default: False)\n--use_ipex [USE_IPEX]\n Use Intel extension for PyTorch when it is available, installation: 'https://github.com/intel/intel-extension-for-pytorch'\n (default: False)\n--bf16 [BF16] Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or using CPU\n (no_cuda). This is an experimental API and it may change. (default: False)\n--fp16 [FP16] Whether to use fp16 (mixed) precision instead of 32-bit (default: False)\n--fp16_opt_level FP16_OPT_LEVEL\n For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at\n https://nvidia.github.io/apex/amp.html (default: O1)\n--half_precision_backend {auto,cuda_amp,apex,cpu_amp}\n The backend to be used for half precision. (default: auto)\n--bf16_full_eval [BF16_FULL_EVAL]\n Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may change. (default: False)\n--fp16_full_eval [FP16_FULL_EVAL]\n Whether to use full float16 evaluation instead of 32-bit (default: False)\n--tf32 TF32 Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API and it may\n change. (default: None)\n--local_rank LOCAL_RANK\n For distributed training: local_rank (default: -1)\n--ddp_backend {nccl,gloo,mpi,ccl}\n The backend to be used for distributed training (default: None)\n--tpu_num_cores TPU_NUM_CORES\n TPU: Number of TPU cores (automatically passed by launcher script) (default: None)\n--tpu_metrics_debug [TPU_METRICS_DEBUG]\n Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics (default: False)\n--debug DEBUG [DEBUG ...]\n Whether or not to enable debug mode. Current options: `underflow_overflow` (Detect underflow and overflow in activations\n and weights), `tpu_metrics_debug` (print debug metrics on TPU). (default: None)\n--dataloader_drop_last [DATALOADER_DROP_LAST]\n Drop the last incomplete batch if it is not divisible by the batch size. (default: False)\n--eval_steps EVAL_STEPS\n Run an evaluation every X steps. 
Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted\n as ratio of total training steps. (default: None)\n--dataloader_num_workers DATALOADER_NUM_WORKERS\n Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.\n (default: 0)\n--past_index PAST_INDEX\n If >=0, uses the corresponding part of the output as the past state for next step. (default: -1)\n--run_name RUN_NAME An optional descriptor for the run. Notably used for wandb logging. (default: None)\n--disable_tqdm DISABLE_TQDM\n Whether or not to disable the tqdm progress bars. (default: None)\n--remove_unused_columns [REMOVE_UNUSED_COLUMNS]\n Remove columns not required by the model when using an nlp.Dataset. (default: True)\n--no_remove_unused_columns\n Remove columns not required by the model when using an nlp.Dataset. (default: False)\n--label_names LABEL_NAMES [LABEL_NAMES ...]\n The list of keys in your dictionary of inputs that correspond to the labels. (default: None)\n--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]\n Whether or not to load the best model found during training at the end of training. When this option is enabled, the best\n checkpoint will always be saved. See `save_total_limit` for more. (default: False)\n--metric_for_best_model METRIC_FOR_BEST_MODEL\n The metric to use to compare two different models. (default: None)\n--greater_is_better GREATER_IS_BETTER\n Whether the `metric_for_best_model` should be maximized or not. (default: None)\n--ignore_data_skip [IGNORE_DATA_SKIP]\n When resuming training, whether or not to skip the first epochs and batches to get to the same training data. (default:\n False)\n--sharded_ddp SHARDED_DDP\n Whether or not to use sharded DDP training (in distributed training only). The base option should be `simple`, `zero_dp_2`\n or `zero_dp_3` and you can add CPU-offload to `zero_dp_2` or `zero_dp_3` like this: zero_dp_2 offload` or `zero_dp_3\n offload`. You can add auto-wrap to `zero_dp_2` or `zero_dp_3` with the same syntax: zero_dp_2 auto_wrap` or `zero_dp_3\n auto_wrap`. (default: )\n--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training only). The base option\n should be `full_shard`, `shard_grad_op` or `no_shard` and you can add CPU-offload to `full_shard` or `shard_grad_op` like\n this: full_shard offload` or `shard_grad_op offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the\n same syntax: full_shard auto_wrap` or `shard_grad_op auto_wrap`. (default: )\n--fsdp_min_num_params FSDP_MIN_NUM_PARAMS\n This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp`\n field is passed). (default: 0)\n--fsdp_config FSDP_CONFIG\n Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either afsdp json config file (e.g.,\n `fsdp_config.json`) or an already loaded json file as `dict`. (default: None)\n--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP\n This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`,\n `T5Block` .... (useful only when `fsdp` flag is passed). (default: None)\n--deepspeed DEEPSPEED\n Enable deepspeed and pass the path to deepspeed json config file (e.g. 
ds_config.json) or an already loaded json file as a\n dict (default: None)\n--label_smoothing_factor LABEL_SMOOTHING_FACTOR\n The label smoothing epsilon to apply (zero means no label smoothing). (default: 0.0)\n--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}\n The optimizer to use. (default: adamw_hf)\n--optim_args OPTIM_ARGS\n Optional arguments to supply to optimizer. (default: None)\n--adafactor [ADAFACTOR]\n Whether or not to replace AdamW by Adafactor. (default: False)\n--group_by_length [GROUP_BY_LENGTH]\n Whether or not to group samples of roughly the same length together when batching. (default: False)\n--length_column_name LENGTH_COLUMN_NAME\n Column name with precomputed lengths to use when grouping by length. (default: length)\n--report_to REPORT_TO [REPORT_TO ...]\n The list of integrations to report the results and logs to. (default: None)\n--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS\n When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`.\n (default: None)\n--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB\n When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. (default:\n None)\n--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS\n When using distributed training, the value of the flag `broadcast_buffers` passed to `DistributedDataParallel`. (default:\n None)\n--dataloader_pin_memory [DATALOADER_PIN_MEMORY]\n Whether or not to pin memory for DataLoader. (default: True)\n--no_dataloader_pin_memory\n Whether or not to pin memory for DataLoader. (default: False)\n--skip_memory_metrics [SKIP_MEMORY_METRICS]\n Whether or not to skip adding of memory profiler reports to metrics. (default: True)\n--no_skip_memory_metrics\n Whether or not to skip adding of memory profiler reports to metrics. (default: False)\n--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]\n Whether or not to use the legacy prediction_loop in the Trainer. (default: False)\n--push_to_hub [PUSH_TO_HUB]\n Whether or not to upload the trained model to the model hub after training. (default: False)\n--resume_from_checkpoint RESUME_FROM_CHECKPOINT\n The path to a folder with a valid checkpoint for your model. (default: None)\n--hub_model_id HUB_MODEL_ID\n The name of the repository to keep in sync with the local `output_dir`. (default: None)\n--hub_strategy {end,every_save,checkpoint,all_checkpoints}\n The hub strategy to use when `--push_to_hub` is activated. (default: every_save)\n--hub_token HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--hub_private_repo [HUB_PRIVATE_REPO]\n Whether the model repository is private or not. (default: False)\n--gradient_checkpointing [GRADIENT_CHECKPOINTING]\n If True, use gradient checkpointing to save memory at the expense of slower backward pass. (default: False)\n--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]\n Whether or not the inputs will be passed to the `compute_metrics` function. (default: False)\n--fp16_backend {auto,cuda_amp,apex,cpu_amp}\n Deprecated. Use half_precision_backend instead (default: auto)\n--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID\n The name of the repository to which push the `Trainer`. 
(default: None)\n--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION\n The name of the organization in with to which push the `Trainer`. (default: None)\n--push_to_hub_token PUSH_TO_HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--mp_parameters MP_PARAMETERS\n Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer (default: )\n--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]\n Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory\n was reached (default: False)\n--full_determinism [FULL_DETERMINISM]\n Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training. Important: this\n will negatively impact the performance, so only use it for debugging. (default: False)\n--torchdynamo TORCHDYNAMO\n This argument is deprecated, use `--torch_compile_backend` instead. (default: None)\n--ray_scope RAY_SCOPE\n The scope to use when doing hyperparameter search with Ray. By default, `\"last\"` will be used. Ray will then use the last\n checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray\n documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for\n more options. (default: last)\n--ddp_timeout DDP_TIMEOUT\n Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)\n--torch_compile [TORCH_COMPILE]\n If set to `True`, the model will be wrapped in `torch.compile`. (default: False)\n--torch_compile_backend TORCH_COMPILE_BACKEND\n Which backend to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--torch_compile_mode TORCH_COMPILE_MODE\n Which mode to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--xpu_backend {mpi,ccl,gloo}\n The backend to be used for distributed training on Intel XPU. (default: None)\n
"},{"location":"data_license.html","title":"Data License","text":"# Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nThis work is licensed under the Creative Commons Attribution 4.0 International License.\n\nTo view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.\n
"},{"location":"license.html","title":"License","text":"Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n 1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n 2. Grant of Copyright License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n 3. Grant of Patent License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n 4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n 5. Submission of Contributions. 
Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n 6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n 7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n 8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n 9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\n END OF TERMS AND CONDITIONS\n\n APPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. 
We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\n Copyright 2023 Emmanuel Noutahi\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n
"},{"location":"api/safe.html","title":"SAFE","text":""},{"location":"api/safe.html#safe-encoder-decoder","title":"SAFE Encoder-Decoder","text":""},{"location":"api/safe.html#safe.converter.SAFEConverter","title":"SAFEConverter
","text":"Molecule line notation conversion from SMILES to SAFE
A SAFE representation is a string-based representation of a molecule decomposition into fragment components, separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by itself, unless it is explicitly corrected by adding the missing hydrogens.
Slicing algorithms
By default, SAFE strings are generated using BRICS; however, the following alternatives are supported:
Hussain-Rea (hr)
RECAP (recap)
RDKit's MMPA (mmpa)
Any possible attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return the pairs of atoms corresponding to the bonds to break.
Source code in safe/converter.py
class SAFEConverter:\n \"\"\"Molecule line notation conversion from SMILES to SAFE\n\n A SAFE representation is a string based representation of a molecule decomposition into fragment components,\n separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by themselves,\n unless explicitely correct to add missing hydrogens.\n\n !!! note \"Slicing algorithms\"\n\n By default SAFE strings are generated using `BRICS`, however, the following alternative are supported:\n\n * [Hussain-Rea (`hr`)](https://pubs.acs.org/doi/10.1021/ci900450m)\n * [RECAP (`recap`)](https://pubmed.ncbi.nlm.nih.gov/9611787/)\n * [RDKit's MMPA (`mmpa`)](https://www.rdkit.org/docs/source/rdkit.Chem.rdMMPA.html)\n * Any possible attachment points (`attach`)\n\n Furthermore, you can also provide your own slicing algorithm, which should return a pair of atoms\n corresponding to the bonds to break.\n\n \"\"\"\n\n SUPPORTED_SLICERS = [\"hr\", \"rotatable\", \"recap\", \"mmpa\", \"attach\", \"brics\"]\n __SLICE_SMARTS = {\n \"hr\": [\"[*]!@-[*]\"], # any non ring single bond\n \"recap\": [\n \"[$([C;!$(C([#7])[#7])](=!@[O]))]!@[$([#7;+0;!D1])]\",\n \"[$(C=!@O)]!@[$([O;+0])]\",\n \"[$([N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*]))]-!@[$([*])]\",\n \"[$(C(=!@O)([#7;+0;D2,D3])!@[#7;+0;D2,D3])]!@[$([#7;+0;D2,D3])]\",\n \"[$([O;+0](-!@[#6!$(C=O)])-!@[#6!$(C=O)])]-!@[$([#6!$(C=O)])]\",\n \"C=!@C\",\n \"[N;+1;D4]!@[#6]\",\n \"[$([n;+0])]-!@C\",\n \"[$([O]=[C]-@[N;+0])]-!@[$([C])]\",\n \"c-!@c\",\n \"[$([#7;+0;D2,D3])]-!@[$([S](=[O])=[O])]\",\n ],\n \"mmpa\": [\"[#6+0;!$(*=,#[!#6])]!@!=!#[*]\"], # classical mmpa slicing smarts\n \"attach\": [\"[*]!@[*]\"], # any potential attachment point, including hydrogens when explicit\n \"rotatable\": [\"[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]\"],\n }\n\n def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n ):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n\n @staticmethod\n def randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = 
np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n\n @classmethod\n def _find_branch_number(cls, inp: str):\n \"\"\"Find the branch number and ring closure in the SMILES representation using regexp\n\n Args:\n inp: input smiles\n \"\"\"\n inp = re.sub(\"[\\[].*?[\\]]\", \"\", inp) # noqa\n matching_groups = re.findall(r\"((?<=%)\\d{2})|((?<!%)\\d+)(?![^\\[]*\\])\", inp)\n # first match is for multiple connection as multiple digits\n # second match is for single connections requiring 2 digits\n # SMILES does not support triple digits\n branch_numbers = []\n for m in matching_groups:\n if m[0] == \"\":\n branch_numbers.extend(int(mm) for mm in m[1])\n elif m[1] == \"\":\n branch_numbers.append(int(m[0].replace(\"%\", \"\")))\n return branch_numbers\n\n def _ensure_valid(self, inp: str):\n \"\"\"Ensure that the input SAFE string is valid by fixing the missing attachment points\n\n Args:\n inp: input SAFE string\n\n \"\"\"\n missing_tokens = [inp]\n branch_numbers = self._find_branch_number(inp)\n # only use the set that have exactly 1 element\n # any branch number that is not pairwise should receive a dummy atom to complete the attachment point\n branch_numbers = Counter(branch_numbers)\n for i, (bnum, bcount) in enumerate(branch_numbers.items()):\n if bcount % 2 != 0:\n bnum_str = str(bnum) if bnum < 10 else f\"%{bnum}\"\n _tk = f\"[*:{i+1}]{bnum_str}\"\n if self.use_original_opener_for_attach:\n bnum_digit = bnum_str.strip(\"%\") # strip out the % sign\n _tk = f\"[*:{bnum_digit}]{bnum_str}\"\n missing_tokens.append(_tk)\n return \".\".join(missing_tokens)\n\n def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n ):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. 
Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n\n def _fragment(self, mol: dm.Mol, allow_empty: bool = False):\n \"\"\"\n Perform bond cutting in place for the input molecule, given the slicing algorithm\n\n Args:\n mol: input molecule to split\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n Raises:\n SAFEFragmentationError: if the slicing algorithm return empty bonds\n \"\"\"\n\n if self.slicer is None:\n matching_bonds = []\n\n elif callable(self.slicer):\n matching_bonds = self.slicer(mol)\n matching_bonds = list(matching_bonds)\n\n elif self.slicer == \"brics\":\n matching_bonds = BRICS.FindBRICSBonds(mol)\n matching_bonds = [brics_match[0] for brics_match in matching_bonds]\n\n else:\n matches = set()\n for smarts in self.slicer:\n matches |= {\n tuple(sorted(match)) for match in mol.GetSubstructMatches(smarts, uniquify=True)\n }\n matching_bonds = list(matches)\n\n if matching_bonds is None or len(matching_bonds) == 0 and not allow_empty:\n raise SAFEFragmentationError(\n \"Slicing algorithms did not return any bonds that can be cut !\"\n )\n return matching_bonds or []\n\n def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n ):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. 
Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = 
re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
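A minimal usage sketch of SAFEConverter, assuming the package is installed; the SMILES string is only an illustrative example:
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter(slicer="brics")

# Encode an example SMILES (aspirin) into its SAFE representation
safe_str = converter.encoder("CC(=O)Oc1ccccc1C(=O)O")

# Decode the SAFE string back into a canonical SMILES
smiles = converter.decoder(safe_str, canonical=True)
print(safe_str)
print(smiles)
```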
"},{"location":"api/safe.html#safe.converter.SAFEConverter.__init__","title":"__init__(slicer='brics', require_hs=None, use_original_opener_for_attach=True, ignore_stereo=False)
","text":"Constructor for the SAFE converter
Parameters:
Name Type Description Default
slicer
Optional[Union[str, List[str], Callable]]
slicer algorithm to use for encoding. Can either be one of the supported slicing algorithms (SUPPORTED_SLICERS) or a custom callable that returns the pairs of atom indices of the bonds that can be sliced.
'brics'
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added. The attach
slicer requires adding hydrogens.
None
use_original_opener_for_attach
bool
whether to use the original branch opener digit when adding back mapping number to attachment points, or use simple enumeration.
True
ignore_stereo
bool
whether to ignore stereochemistry; RDKit does not support some particular SAFE subsets when stereochemistry is defined.
False
Source code in safe/converter.py
def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n
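As a sketch of the custom slicer option, the callable is expected to return pairs of atom indices whose connecting bond should be cut; the slicer below (cutting every acyclic single bond) is purely an illustration and not part of the library:
``` py
from safe.converter import SAFEConverter

def cut_acyclic_single_bonds(mol):
    # Return (atom_i, atom_j) pairs for every single bond that is not in a ring
    return [
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        for bond in mol.GetBonds()
        if bond.GetBondTypeAsDouble() == 1.0 and not bond.IsInRing()
    ]

converter = SAFEConverter(slicer=cut_acyclic_single_bonds)
```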
"},{"location":"api/safe.html#safe.converter.SAFEConverter.decoder","title":"decoder(inp, as_mol=False, canonical=False, fix=True, remove_dummies=True, remove_added_hs=True)
","text":"Convert input SAFE representation to smiles
Parameters:
Name Type Description Default
inp
str
input SAFE representation to decode as a valid molecule or smiles
required
as_mol
bool
whether to return a molecule object or a smiles string
False
canonical
bool
whether to return a canonical SMILES
False
fix
bool
whether to fix the SAFE representation to take into account non-connected attachment points
True
remove_dummies
bool
whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with
True
remove_added_hs
bool
whether to remove all the added hydrogen atoms after applying dummy removal for recovery
True
Source code in safe/converter.py
def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n
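A short sketch of decoder in practice; the SAFE string is produced on the fly from an illustrative SMILES so the example stays self-contained:
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter()
safe_str = converter.encoder("CC(=O)Nc1ccc(O)cc1")  # paracetamol, as an example

# Return an RDKit molecule object instead of a SMILES string
mol = converter.decoder(safe_str, as_mol=True)

# Or return a canonical SMILES string
smiles = converter.decoder(safe_str, canonical=True)
```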
"},{"location":"api/safe.html#safe.converter.SAFEConverter.encoder","title":"encoder(inp, canonical=True, randomize=False, seed=None, constraints=None, allow_empty=False, rdkit_safe=True)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Default
inp
Union[str, Mol]
input smiles
required
canonical
bool
whether to return canonical smiles string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the safe string encoding. Will be ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding. Randomization happens in two steps: 1. at the original smiles representation by randomizing the atoms. 2. at the SAFE conversion by randomizing the fragment order
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction. Any bond slicing would happen outside of a substructure matching one of the patterns.
None
allow_empty
bool
whether to allow the slicing algorithm to return empty bonds
False
rdkit_safe
bool
whether to apply rdkit-safe digit standardization to the output SAFE string.
True
Source code in safe/converter.py
def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # 
needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
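The sketch below illustrates the randomization options of encoder: with canonical=False and randomize=True, both the atom order and the fragment order are shuffled, so different seeds will generally yield different SAFE strings for the same molecule (the SMILES is an arbitrary example):
``` py
from safe.converter import SAFEConverter

converter = SAFEConverter(slicer="brics")
smiles = "CC(=O)Oc1ccccc1C(=O)O"

s1 = converter.encoder(smiles, canonical=False, randomize=True, seed=1)
s2 = converter.encoder(smiles, canonical=False, randomize=True, seed=2)
# s1 and s2 are usually different strings, but both decode back to the same molecule
```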
"},{"location":"api/safe.html#safe.converter.SAFEConverter.randomize","title":"randomize(mol, rng=None)
staticmethod
","text":"Randomize the position of the atoms in a mol.
Parameters:
Name Type Description Default
mol
Mol
molecule to randomize
required
rng
Optional[int]
optional seed to use
None
Source code in safe/converter.py
@staticmethod\ndef randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n
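randomize can also be used on its own to shuffle atom numbering before encoding; a minimal sketch with an arbitrarily chosen molecule:
``` py
import datamol as dm
from safe.converter import SAFEConverter

mol = dm.to_mol("COc1ccccc1")  # anisole, illustrative
shuffled = SAFEConverter.randomize(mol, rng=42)  # an int seed builds a numpy Generator internally
print(dm.to_smiles(shuffled, canonical=False))
```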
"},{"location":"api/safe.html#safe.converter.encode","title":"encode(inp, canonical=True, randomize=False, seed=None, slicer=None, require_hs=None, constraints=None, ignore_stereo=False)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Default
inp
Union[str, Mol]
input smiles
required
canonical
bool
whether to return canonical SAFE string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the safe string encoding. Will be ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding.
None
slicer
Optional[Union[List[str], str, Callable]]
slicer algorithm to use for encoding. Defaults to \"brics\".
None
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added.
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction.
None
ignore_stereo
Optional[bool]
whether to ignore stereochemistry; RDKit does not support some particular SAFE subsets when stereochemistry is defined.
False
Source code in safe/converter.py
def encode(\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n slicer: Optional[Union[List[str], str, Callable]] = None,\n require_hs: Optional[bool] = None,\n constraints: Optional[List[dm.Mol]] = None,\n ignore_stereo: Optional[bool] = False,\n):\n \"\"\"\n Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical SAFE string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n slicer: slicer algorithm to use for encoding. Defaults to \"brics\".\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n constraints: List of molecules or pattern to preserve during the SAFE construction.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n \"\"\"\n if slicer is None:\n slicer = \"brics\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs, ignore_stereo=ignore_stereo)\n try:\n encoded = safe_obj.encoder(\n inp,\n canonical=canonical,\n randomize=randomize,\n constraints=constraints,\n seed=seed,\n )\n except SAFEFragmentationError as e:\n raise e\n except Exception as e:\n raise SAFEEncodeError(f\"Failed to encode {inp} with {slicer}\") from e\n return encoded\n
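A quick sketch of the module-level helper; the molecule and the alternative slicer are example choices:
``` py
from safe.converter import encode

smiles = "CC(=O)Oc1ccccc1C(=O)O"             # aspirin, illustrative
safe_brics = encode(smiles)                  # defaults to the "brics" slicer
safe_recap = encode(smiles, slicer="recap")  # alternative slicing algorithm
```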
"},{"location":"api/safe.html#safe.converter.decode","title":"decode(safe_str, as_mol=False, canonical=False, fix=True, remove_added_hs=True, remove_dummies=True, ignore_errors=False)
","text":"Convert input SAFE representation to smiles Args: safe_str: input SAFE representation to decode as a valid molecule or smiles as_mol: whether to return a molecule object or a smiles string canonical: whether to return a canonical smiles or a randomized smiles fix: whether to fix the SAFE representation to take into account non-connected attachment points remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string. remove_dummies: whether to remove dummy atoms from the SAFE representation ignore_errors: whether to ignore error and return None on decoding failure or raise an error
Source code in safe/converter.py
def decode(\n safe_str: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_added_hs: bool = True,\n remove_dummies: bool = True,\n ignore_errors: bool = False,\n):\n \"\"\"Convert input SAFE representation to smiles\n Args:\n safe_str: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical smiles or a randomized smiles\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string.\n remove_dummies: whether to remove dummy atoms from the SAFE representation\n ignore_errors: whether to ignore error and return None on decoding failure or raise an error\n\n \"\"\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter()\n try:\n decoded = safe_obj.decoder(\n safe_str,\n as_mol=as_mol,\n canonical=canonical,\n fix=fix,\n remove_dummies=remove_dummies,\n remove_added_hs=remove_added_hs,\n )\n\n except Exception as e:\n if ignore_errors:\n return None\n raise SAFEDecodeError(f\"Failed to decode {safe_str}\") from e\n return decoded\n
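And the matching decode helper; note how ignore_errors turns decoding failures into None instead of raising a SAFEDecodeError (sketch with an intentionally invalid input):
``` py
from safe.converter import encode, decode

safe_str = encode("CC(=O)Oc1ccccc1C(=O)O")   # illustrative molecule
smiles = decode(safe_str, canonical=True)

broken = decode("this-is-not-a-safe-string", ignore_errors=True)
# `broken` should be None rather than an exception being raised
```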
"},{"location":"api/safe.html#safe-tokenizer","title":"SAFE Tokenizer","text":""},{"location":"api/safe.html#safe.tokenizer.SAFESplitter","title":"SAFESplitter
","text":"Standard Splitter for SAFE string
Source code in safe/tokenizer.py
class SAFESplitter:\n \"\"\"Standard Splitter for SAFE string\"\"\"\n\n REGEX_PATTERN = r\"\"\"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])\"\"\"\n\n name = \"safe\"\n\n def __init__(self, pattern: Optional[str] = None):\n # do not use this as raw strings (not r before)\n if pattern is None:\n pattern = self.REGEX_PATTERN\n self.regex = re.compile(pattern)\n\n def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n\n def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n\n def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n\n def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
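A small sketch of the splitter on its own; the input string is only an example:
``` py
from safe.tokenizer import SAFESplitter

splitter = SAFESplitter()
tokens = splitter.tokenize("c1ccccc1.CC(=O)O")
print(tokens)                       # e.g. ['c', '1', 'c', ...]
print(splitter.detokenize(tokens))  # reassembles the original string
```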
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.detokenize","title":"detokenize(chars)
","text":"Detokenize SAFE notation
Source code in safe/tokenizer.py
def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.pre_tokenize","title":"pre_tokenize(pretok)
","text":"Pretokenize using an input pretokenizer object from the tokenizer library
Source code in safe/tokenizer.py
def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.split","title":"split(n, normalized)
","text":"Perform splitting for pretokenization
Source code in safe/tokenizer.py
def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.tokenize","title":"tokenize(line)
","text":"Tokenize a safe string into characters.
Source code in safe/tokenizer.py
def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer","title":"SAFETokenizer
","text":" Bases: PushToHubMixin
Class to initialize and train a tokenizer for SAFE strings. Once trained, you can convert the tokenizer into a HuggingFace PreTrainedTokenizerFast.
Source code in safe/tokenizer.py
class SAFETokenizer(PushToHubMixin):\n \"\"\"\n Class to initialize and train a tokenizer for SAFE string\n Once trained, you can use the converted version of the tokenizer to an HuggingFace PreTrainedTokenizerFast\n \"\"\"\n\n vocab_files_names: str = \"tokenizer.json\"\n\n def __init__(\n self,\n tokenizer_type: str = \"bpe\",\n splitter: Optional[str] = \"safe\",\n trainer_args=None,\n decoder_args=None,\n token_model_args=None,\n ):\n super().__init__()\n self.tokenizer_type = tokenizer_type\n self.trainer_args = trainer_args or {}\n self.decoder_args = decoder_args or {}\n self.token_model_args = token_model_args or {}\n if tokenizer_type is not None and tokenizer_type.startswith(\"bpe\"):\n self.model = BPE(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = BpeTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n else:\n self.model = WordLevel(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = WordLevelTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n self.tokenizer = Tokenizer(self.model)\n self.splitter = None\n if splitter == \"safe\":\n self.splitter = SAFESplitter()\n self.tokenizer.pre_tokenizer = PreTokenizer.custom(self.splitter)\n self.tokenizer.post_processor = TemplateProcessing(\n single=TEMPLATE_SINGLE,\n pair=TEMPLATE_PAIR,\n special_tokens=TEMPLATE_SPECIAL_TOKENS,\n )\n self.tokenizer.decoder = decoders.BPEDecoder(**self.decoder_args)\n self.tokenizer = self.set_special_tokens(self.tokenizer)\n\n @property\n def bos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.bos_token)\n\n @property\n def pad_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.pad_token)\n\n @property\n def eos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.eos_token)\n\n @classmethod\n def set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n ):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n\n def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n\n def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n\n def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if 
use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n\n def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n\n def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n\n def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n\n def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n\n def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n\n def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n\n @classmethod\n def from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n\n @classmethod\n def load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has 
a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n\n def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n ) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n\n def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n\n def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n ) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. 
Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n\n @classmethod\n def from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n ):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived 
class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
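Putting the pieces together, a minimal training sketch; the two molecules and the file name are placeholders:
``` py
from safe.converter import encode
from safe.tokenizer import SAFETokenizer

# Tiny illustrative corpus of SAFE strings
corpus = [encode(s) for s in ["CC(=O)Oc1ccccc1C(=O)O", "CC(=O)Nc1ccc(O)cc1"]]

tokenizer = SAFETokenizer(tokenizer_type="bpe", splitter="safe")
tokenizer.train_from_iterator(corpus)
tokenizer.save("tokenizer.json")

# Reload later and expose it as a HuggingFace fast tokenizer
tokenizer = SAFETokenizer.load("tokenizer.json")
hf_tokenizer = tokenizer.get_pretrained()
```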
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.bos_token_id","title":"bos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.eos_token_id","title":"eos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.pad_token_id","title":"pad_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__getstate__","title":"__getstate__()
","text":"Getting state to allow pickling
Source code in safe/tokenizer.py
def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__len__","title":"__len__()
","text":"Gets the count of tokens in vocab along with special tokens.
Source code in safe/tokenizer.py
def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__setstate__","title":"__setstate__(d)
","text":"Setting state during reloading pickling
Source code in safe/tokenizer.py
def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.decode","title":"decode(ids, skip_special_tokens=True, ignore_stops=False, stop_token_ids=None)
","text":"Decodes a list of ids to molecular representation in the format in which this tokenizer was created.
Parameters:
Name Type Description Default
ids
list
list of IDs
required
skip_special_tokens
bool
whether to skip all special tokens when encountering them
True
ignore_stops
bool
whether to ignore the stop tokens, thus decoding till the end
False
stop_token_ids
Optional[List[int]]
optional list of stop token ids to use
None
Returns:
Name Type Description
sequence
str
str representation of molecule
Source code in safe/tokenizer.py
def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.encode","title":"encode(sample_str, ids_only=True, **kwargs)
","text":"Encodes a given molecule string once training is done
Parameters:
Name Type Description Default
sample_str
str
Sample string to encode molecule
required
ids_only
bool
whether to return only the ids or the encoding object
True
Returns:
Name Type Description
object
list
Returns encoded list of IDs
Source code in safe/tokenizer.py
def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n
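A short usage sketch (the inputs are illustrative; any string accepted by the tokenizer would work):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nids = tokenizer.encode(\"c1ccccc1\")  # list of token ids\nenc = tokenizer.encode(\"c1ccccc1\", ids_only=False)  # full Encoding object instead of ids\nbatch = tokenizer.encode([\"c1ccccc1\", \"C1CCCCC1\"])  # a list input returns a list of id lists\n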
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_dict","title":"from_dict(data)
classmethod
","text":"Load tokenizer from dict
Parameters:
Name Type Description Default
data
dict
dictionary containing the tokenizer info
required
Source code in safe/tokenizer.py
@classmethod\ndef from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_pretrained","title":"from_pretrained(pretrained_model_name_or_path, cache_dir=None, force_download=False, local_files_only=False, token=None, return_fast_tokenizer=False, proxies=None, **kwargs)
classmethod
","text":"Instantiate a [~tokenization_utils_base.PreTrainedTokenizerBase
] (or a derived class) from a predefined tokenizer.
Parameters:
Name Type Description Default
pretrained_model_name_or_path
Union[str, PathLike]
Can be either:
- A string, the model id of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like bert-base-uncased, or namespaced under a user or organization name, like dbmdz/bert-base-german-cased.
- A path to a directory containing vocabulary files required by the tokenizer, for instance saved using the [~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained] method, e.g., ./my_model_directory/.
- (Deprecated, not applicable to all derived classes) A path or url to a single saved vocabulary file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g., ./my_model_directory/vocab.txt.
required
cache_dir
Optional[Union[str, PathLike]]
Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
None
force_download
bool
Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they exist.
False
proxies
Optional[Dict[str, str]]
A dictionary of proxy servers to use by protocol or endpoint, e.g., {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}
. The proxies are used on each request.
None
token
Optional[Union[str, bool]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
).
None
local_files_only
bool
Whether or not to only rely on local files and not to attempt to download any files.
False
return_fast_tokenizer
Optional[bool]
Whether to return fast tokenizer or not.
False
Examples:
# We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n
Source code in safe/tokenizer.py
@classmethod\ndef from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.get_pretrained","title":"get_pretrained(**kwargs)
","text":"Get a pretrained tokenizer from this tokenizer
Returns:
Type Description
PreTrainedTokenizerFast
Returns a pre-trained fast tokenizer for Hugging Face models.
Source code in safe/tokenizer.py
def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n
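For example, wrapping the SAFE tokenizer for use with transformers models could look like this sketch (checkpoint name as in the example above):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nfast_tk = tokenizer.get_pretrained()  # PreTrainedTokenizerFast with the special tokens set\nbatch = fast_tk([\"c1ccccc1\"], padding=True, return_tensors=\"pt\")  # ready for a transformers model\n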
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.load","title":"load(file_name)
classmethod
","text":"Load the current tokenizer from file
Source code in safe/tokenizer.py
@classmethod\ndef load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.push_to_hub","title":"push_to_hub(repo_id, use_temp_dir=None, commit_message=None, private=None, token=None, max_shard_size='10GB', create_pr=False, safe_serialization=False, **deprecated_kwargs)
","text":"Upload the tokenizer to the \ud83e\udd17 Model Hub.
Parameters:
Name Type Description Default
repo_id
str
The name of the repository you want to push your {object} to. It should contain your organization name when pushing to a given organization.
required
use_temp_dir
Optional[bool]
Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub. Will default to True
if there is no directory named like repo_id
, False
otherwise.
None
commit_message
Optional[str]
Message to commit while pushing. Will default to \"Upload {object}\"
.
None
private
Optional[bool]
Whether or not the repository created should be private.
None
token
Optional[Union[bool, str]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
). Will default to True
if repo_url
is not specified.
None
max_shard_size
Optional[Union[int, str]]
Only applicable for models. The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be smaller than this size. If expressed as a string, it needs to be digits followed by a unit (like \"5MB\"
).
'10GB'
create_pr
bool
Whether or not to create a PR with the uploaded files or directly commit.
False
safe_serialization
bool
Whether or not to convert the model weights to the safetensors format for safer serialization.
False
Source code in safe/tokenizer.py
def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. 
Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save","title":"save(file_name=None)
","text":"Saves the :class:~tokenizers.Tokenizer
to the file at the given path.
Parameters:
Name Type Description Default
file_name
str
File where to save the tokenizer
None
Source code in safe/tokenizer.py
def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n
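A save/load round trip could look like the following sketch (the file path is illustrative):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\ntokenizer.save(\"safe_tokenizer.json\")\nreloaded = SAFETokenizer.load(\"safe_tokenizer.json\")\nassert len(reloaded) == len(tokenizer)  # the vocabulary size is preserved\n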
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save_pretrained","title":"save_pretrained(*args, **kwargs)
","text":"Save pretrained tokenizer
Source code in safe/tokenizer.py
def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.set_special_tokens","title":"set_special_tokens(tokenizer, bos_token=CLS_TOKEN, eos_token=SEP_TOKEN)
classmethod
","text":"Set special tokens for a tokenizer
Parameters:
Name Type Description Default
tokenizer
Tokenizer
tokenizer for which special tokens will be set
required
bos_token
str
Optional bos token to use
CLS_TOKEN
eos_token
str
Optional eos token to use
SEP_TOKEN
Source code in safe/tokenizer.py
@classmethod\ndef set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.to_dict","title":"to_dict(**kwargs)
","text":"Convert tokenizer to dict
Source code insafe/tokenizer.py
def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train","title":"train(files, **kwargs)
","text":"This is to train a new tokenizer from either a list of file or some input data
Args files (str): file in which your molecules are separated by new line kwargs (dict): optional args for the tokenizer train
safe/tokenizer.py
def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train_from_iterator","title":"train_from_iterator(data, **kwargs)
","text":"Train the Tokenizer using the provided iterator.
You can provide anything that is a Python Iterator:
* A list of sequences :obj:List[str]
* A generator that yields :obj:str or :obj:List[str]
* A Numpy array of strings
Parameters:
Name Type Description Default
data
Iterator
data iterator
required
**kwargs
Any
additional keyword arguments for the tokenizer train_from_iterator
{}
Source code in safe/tokenizer.py
def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n
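As a sketch, training from an in-memory list of strings might look as follows (the constructor argument shown here is an assumption, not part of the documented signature):
from safe.tokenizer import SAFETokenizer\n\ntokenizer = SAFETokenizer(tokenizer_type=\"bpe\")  # assumed constructor argument\ntokenizer.train_from_iterator([\"c1ccccc1\", \"CC(=O)O\"])  # any iterator of strings works\nprint(len(tokenizer))  # vocabulary size after training\n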
"},{"location":"api/safe.html#utils","title":"Utils","text":""},{"location":"api/safe.html#safe.utils.MolSlicer","title":"MolSlicer
","text":"Slice a molecule into head-linker-tail
Source code in safe/utils.py
class MolSlicer:\n \"\"\"Slice a molecule into head-linker-tail\"\"\"\n\n BOND_SPLITTERS = [\n # two atoms connected by a non ring single bond, one of each is not in a ring and at least two heavy neighbor\n \"[R:1]-&!@[!R;!D1:2]\",\n # two atoms in different rings linked by a non-ring single bond\n \"[R:1]-&!@[R:2]\",\n ]\n _BOND_BUFFER = 1 # buffer around substructure match size.\n MAX_CUTS = 2 # maximum number of cuts. Here we need two cuts for head-linker-tail.\n\n _MERGING_RXN = dm.reactions.rxn_from_smarts(\n \"[#0][*:1].[#0][*:4].([#0][*:2].[#0][*:3])>>([*:1][*:2].[*:3][*:4])\"\n )\n\n def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n ):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n\n def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n\n def _bond_selection_from_max_cuts(self, bond_list: List[int], dist_mat: np.ndarray):\n \"\"\"Select bonds based on maximum number of cuts allowed\"\"\"\n # for now we are just implementing to 2 max cuts algorithms\n if self.MAX_CUTS != 2:\n raise ValueError(f\"Only MAX_CUTS=2 is supported, got {self.MAX_CUTS}\")\n\n bond_pdist = np.full((len(bond_list), len(bond_list)), -1)\n for i in range(len(bond_list)):\n for j in range(i, len(bond_list)):\n # we get the minimum topological distance between bond to cut\n bond_pdist[i, j] = bond_pdist[j, i] = min(\n [dist_mat[a1, a2] for a1, a2 in itertools.product(bond_list[i], bond_list[j])]\n )\n\n masked_bond_pdist = np.ma.masked_less_equal(bond_pdist, self.min_linker_size)\n\n if self.shortest_linker:\n return np.unravel_index(np.ma.argmin(masked_bond_pdist), bond_pdist.shape)\n return np.unravel_index(np.ma.argmax(masked_bond_pdist), bond_pdist.shape)\n\n def _get_bonds_to_cut(self, mol: dm.Mol):\n \"\"\"Get possible bond to cuts\n\n Args:\n mol: input molecule\n \"\"\"\n # use this if you want to enumerate yourself the possible cuts\n\n ring_systems = self.get_ring_system(mol)\n candidate_bonds = []\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n\n for query in self.bond_splitters:\n bonds = mol.GetSubstructMatches(query, uniquify=True)\n cur_unique_bonds = [set(cbond) for cbond in candidate_bonds]\n # do not accept bonds part of the same ring system or already known\n for b in bonds:\n bond_id = mol.GetBondBetweenAtoms(*b).GetIdx()\n bond_cut = Chem.GetMolFrags(\n Chem.FragmentOnBonds(mol, [bond_id], addDummies=False), 
asMols=True\n )\n can_add = not self.require_ring_system or all(\n len(frag.GetAtomsMatchingQuery(ring_query)) > 0 for frag in bond_cut\n )\n if can_add and not (\n set(b) in cur_unique_bonds or any(x.issuperset(set(b)) for x in ring_systems)\n ):\n candidate_bonds.append(b)\n return candidate_bonds\n\n def _fragment_mol(self, mol: dm.Mol, bonds: List[dm.Bond]):\n \"\"\"Fragment molecules on bonds and return head, linker, tail combination\n\n Args:\n mol: input molecule\n bonds: list of bonds to cut\n \"\"\"\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in bonds])\n _frags = list(Chem.GetMolFrags(tmp, asMols=True))\n # linker is the one with 2 dummy atoms\n linker_pos = 0\n for pos, _frag in enumerate(_frags):\n if sum([at.GetSymbol() == \"*\" for at in _frag.GetAtoms()]) == 2:\n linker_pos = pos\n break\n linker = _frags.pop(linker_pos)\n head, tail = _frags\n return (head, linker, tail)\n\n def _compute_linker_score(self, linker: dm.Mol):\n \"\"\"Compute the score of a linker to help select between linkers\"\"\"\n\n # we need to take into account\n # case where we require the linker to have a ring system\n # case where we want the linker to be longuest or shortest\n\n # find shortest path\n attach1, attach2, *_ = [at.GetIdx() for at in linker.GetAtoms() if at.GetSymbol() == \"*\"]\n score = len(Chem.rdmolops.GetShortestPath(linker, attach1, attach2))\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n linker_ring_count = len(linker.GetAtomsMatchingQuery(ring_query))\n if self.require_ring_system:\n score *= int(linker_ring_count > 0)\n if score == 0:\n return float(\"inf\")\n if not self.shortest_linker:\n score = 1 / score\n return score\n\n def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. 
It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n\n @classmethod\n def link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n ):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
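A minimal usage sketch (the SMILES is illustrative; linker and tail can be None when fewer than two cuts are found):
from safe.utils import MolSlicer\n\nslicer = MolSlicer(require_ring_system=True)\nhead, linker, tail = slicer(\"CC(=O)Nc1ccc(OCCN2CCOCC2)cc1\")  # returns the (head, linker, tail) fragments\n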
"},{"location":"api/safe.html#safe.utils.MolSlicer.__call__","title":"__call__(mol, expected_head=None)
","text":"Perform slicing of the input molecule
Parameters:
Name Type Description Default
mol
Union[Mol, str]
input molecule
required
expected_head
Union[Mol, str]
substructure that should be part of the head. The small fragment containing this substructure would be kept as head
None
Source code in safe/utils.py
def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__init__","title":"__init__(shortest_linker=False, min_linker_size=0, require_ring_system=True, verbose=False)
","text":"Constructor of bond slicer.
Parameters:
Name Type Description Default
shortest_linker
bool
whether to consider the longest or shortest linker. Does not have any effect when an expected_head group is provided during splitting
False
min_linker_size
int
minimum linker size
0
require_ring_system
bool
whether all fragments need to have a ring system
True
verbose
bool
whether to allow verbosity in logging
False
Source code in safe/utils.py
def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.get_ring_system","title":"get_ring_system(mol)
","text":"Get the list of ring system from a molecule
Parameters:
Name Type Description Default
mol
Mol
input molecule for which we are computing the ring system
required
Source code in safe/utils.py
def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.link_fragments","title":"link_fragments(linker, head, tail)
classmethod
","text":"Link fragments together using the provided linker
Parameters:
Name Type Description Default
linker
Union[Mol, str]
linker to use
required
head
Union[Mol, str]
head fragment
required
tail
Union[Mol, str]
tail fragment
required
Source code in safe/utils.py
@classmethod\ndef link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.attr_as","title":"attr_as(obj, field, value)
","text":"Temporary replace the value of an object
Parameters:
Name Type Description Default
obj
Any
object to temporarily patch
required
field
str
name of the key to change
required
value
Any
value the key is temporarily changed to
required
Source code in safe/utils.py
@contextmanager\ndef attr_as(obj: Any, field: str, value: Any):\n \"\"\"Temporary replace the value of an object\n\n Args:\n obj: object to temporary patch\n field: name of the key to change\n value: value of key to be temporary changed\n \"\"\"\n old_value = getattr(obj, field, None)\n setattr(obj, field, value)\n yield\n with suppress(TypeError):\n setattr(obj, field, old_value)\n
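A minimal sketch of how this context manager behaves (the patched object is just a stand-in):
from types import SimpleNamespace\nfrom safe.utils import attr_as\n\nobj = SimpleNamespace(value=1)\nwith attr_as(obj, \"value\", 2):\n    assert obj.value == 2  # patched inside the context\nassert obj.value == 1  # restored on exit\n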
"},{"location":"api/safe.html#safe.utils.compute_side_chains","title":"compute_side_chains(mol, core, label_by_index=False)
","text":"Compute the side chain of a molecule given a core
Finding the side chains
The algorithm to find the side chains from core assumes that the core we get as input has attachment points. Those attachment points are never considered as part of the query, rather they are used to define the attachment points on the side chains. Removing the attachment points from the core is exactly the same as keeping them.
mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\ncore0 = \"CC1(C)CN2C(CC2=O)S1\"\ncore1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\ncore2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\nside_chain = compute_side_chain(core=core0, mol=mol)\ndm.to_image([side_chain, core0, mol])\n
Therefore, in the example above, core0 and core1 are equivalent for the molecule mol, but core2 is not.
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
core
Mol
core to use for deriving the side chains
required
Source code in safe/utils.py
def compute_side_chains(mol: dm.Mol, core: dm.Mol, label_by_index: bool = False):\n \"\"\"Compute the side chain of a molecule given a core\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points\n on the side chains. Removing the attachment points from the core is exactly the same as keeping them.\n\n ```python\n mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\n core0 = \"CC1(C)CN2C(CC2=O)S1\"\n core1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\n core2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\n side_chain = compute_side_chain(core=core0, mol=mol)\n dm.to_image([side_chain, core0, mol])\n ```\n Therefore on the above, core0 and core1 are equivalent for the molecule `mol`, but core2 is not.\n\n Args:\n mol: molecule to split\n core: core to use for deriving the side chains\n \"\"\"\n\n if isinstance(mol, str):\n mol = dm.to_mol(mol)\n if isinstance(core, str):\n core = dm.to_mol(core)\n core_query_param = AdjustQueryParameters()\n core_query_param.makeDummiesQueries = True\n core_query_param.adjustDegree = False\n core_query_param.aromatizeIfPossible = True\n core_query_param.makeBondsGeneric = False\n core_query = AdjustQueryProperties(core, core_query_param)\n return ReplaceCore(\n mol, core_query, labelByIndex=label_by_index, replaceDummies=False, requireDummyMatch=False\n )\n
"},{"location":"api/safe.html#safe.utils.convert_to_safe","title":"convert_to_safe(mol, canonical=False, randomize=False, seed=1, slicer='brics', split_fragment=True, fraction_hs=None, resolution=0.5)
","text":"Convert a molecule to a safe representation
Parameters:
Name Type Description Default
mol
Mol
molecule to convert
required
canonical
bool
whether to use canonical encoding
False
randomize
bool
whether to randomize the encoding
False
seed
Optional[int]
random seed
1
slicer
str
the slicer to use for fragmentation
'brics'
split_fragment
bool
whether to split fragments
True
fraction_hs
bool
proportion of random atoms to which we will add explicit hydrogens
None
resolution
Optional[float]
resolution for the partitioning algorithm
0.5
Source code in safe/utils.py
def convert_to_safe(\n mol: dm.Mol,\n canonical: bool = False,\n randomize: bool = False,\n seed: Optional[int] = 1,\n slicer: str = \"brics\",\n split_fragment: bool = True,\n fraction_hs: bool = None,\n resolution: Optional[float] = 0.5,\n):\n \"\"\"Convert a molecule to a safe representation\n\n Args:\n mol: molecule to convert\n canonical: whether to use canonical encoding\n randomize: whether to randomize the encoding\n seed: random seed\n slicer: the slicer to use for fragmentation\n split_fragment: whether to split fragments\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n resolution: resolution for the partitioning algorithm\n seed: random seed\n \"\"\"\n x = None\n try:\n x = sf.encode(mol, canonical=canonical, randomize=randomize, slicer=slicer, seed=seed)\n except sf.SAFEFragmentationError:\n if split_fragment:\n if \".\" in mol:\n return None\n try:\n x = sf.encode(\n mol,\n canonical=False,\n randomize=randomize,\n seed=seed,\n slicer=partial(\n fragment_aware_spliting,\n fraction_hs=fraction_hs,\n resolution=resolution,\n seed=seed,\n ),\n )\n except (sf.SAFEEncodeError, sf.SAFEFragmentationError):\n # logger.exception(e)\n return x\n # we need to resplit using attachment point but here we are only adding\n except sf.SAFEEncodeError:\n return x\n return x\n
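A hedged usage sketch, assuming a SMILES string input (which the underlying encoder accepts); the molecule is illustrative and the function returns None when both the requested slicer and the fallback fragment-aware splitting fail:
from safe.utils import convert_to_safe\n\nsafe_str = convert_to_safe(\"CC(=O)Nc1ccc(O)cc1\", slicer=\"brics\")\nprint(safe_str)  # SAFE string, or None if the encoding failed\n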
"},{"location":"api/safe.html#safe.utils.filter_by_substructure_constraints","title":"filter_by_substructure_constraints(sequences, substruct, n_jobs=-1)
","text":"Check whether the input substructures are present in each of the molecule in the sequences
Parameters:
Name Type Description Default
sequences
List[Union[str, Mol]]
list of molecules to validate
required
substruct
Union[str, Mol]
substructure to use as query
required
n_jobs
int
number of jobs to use for parallelization
-1
Source code in safe/utils.py
def filter_by_substructure_constraints(\n sequences: List[Union[str, dm.Mol]], substruct: Union[str, dm.Mol], n_jobs: int = -1\n):\n \"\"\"Check whether the input substructures are present in each of the molecule in the sequences\n\n Args:\n sequences: list of molecules to validate\n substruct: substructure to use as query\n n_jobs: number of jobs to use for parallelization\n\n \"\"\"\n\n if isinstance(substruct, str):\n substruct = standardize_attach(substruct)\n substruct = dm.from_smarts(substruct)\n\n def _check_match(mol):\n with suppress(Exception):\n mol = dm.to_mol(mol)\n return mol.HasSubstructMatch(substruct)\n return False\n\n matches = dm.parallelized(_check_match, sequences, n_jobs=n_jobs)\n return list(compress(sequences, matches))\n
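A small sketch of typical use (the candidate molecules and the query are illustrative):
from safe.utils import filter_by_substructure_constraints\n\ncandidates = [\"CCO\", \"c1ccccc1O\", \"CCN\"]\nkept = filter_by_substructure_constraints(candidates, \"c1ccccc1\", n_jobs=1)  # keeps only molecules matching the query\n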
"},{"location":"api/safe.html#safe.utils.find_partition_edges","title":"find_partition_edges(G, partition)
","text":"Find the edges connecting the subgraphs in a given partition of a graph.
Parameters:
Name Type Description Default
G
Graph
The original graph.
required
partition
list of list of nodes
The partition of the graph where each element is a list of nodes representing a subgraph.
required
Returns:
Name Type Description
list
List[Tuple]
A list of edges connecting the subgraphs in the partition.
Source code in safe/utils.py
def find_partition_edges(G: nx.Graph, partition: List[List]) -> List[Tuple]:\n \"\"\"\n Find the edges connecting the subgraphs in a given partition of a graph.\n\n Args:\n G (networkx.Graph): The original graph.\n partition (list of list of nodes): The partition of the graph where each element is a list of nodes representing a subgraph.\n\n Returns:\n list: A list of edges connecting the subgraphs in the partition.\n \"\"\"\n partition_edges = []\n for subgraph1, subgraph2 in combinations(partition, 2):\n edges = nx.edge_boundary(G, subgraph1, subgraph2)\n partition_edges.extend(edges)\n return partition_edges\n
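For intuition, a tiny sketch on a path graph (illustrative only):
import networkx as nx\nfrom safe.utils import find_partition_edges\n\nG = nx.path_graph(4)  # edges: (0, 1), (1, 2), (2, 3)\nprint(find_partition_edges(G, [[0, 1], [2, 3]]))  # [(1, 2)] -- the only edge crossing the partition\n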
"},{"location":"api/safe.html#safe.utils.fragment_aware_spliting","title":"fragment_aware_spliting(mol, fraction_hs=None, **kwargs)
","text":"Custom splitting algorithm for dataset building.
This slicing strategy will cut any bond, including bonds with hydrogens. However, only one cut per atom is allowed.
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
fraction_hs
Optional[bool]
proportion of random atoms to which we will add explicit hydrogens
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
def fragment_aware_spliting(mol: dm.Mol, fraction_hs: Optional[bool] = None, **kwargs: Any):\n \"\"\"Custom splitting algorithm for dataset building.\n\n This slicing strategy will cut any bond including bonding with hydrogens\n However, only one cut per atom is allowed\n\n Args:\n mol: molecule to split\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n kwargs: additional arguments to pass to the partitioning algorithm\n \"\"\"\n random.seed(kwargs.get(\"seed\", 1))\n mol = dm.to_mol(mol, remove_hs=False)\n mol = _selective_add_hs(mol, fraction_hs=fraction_hs)\n graph = dm.graph.to_graph(mol)\n d = mol_partition(mol, **kwargs)\n q = deque(d)\n partition = q.pop()\n return find_partition_edges(graph, partition)\n
"},{"location":"api/safe.html#safe.utils.list_individual_attach_points","title":"list_individual_attach_points(mol, depth=None)
","text":"List all individual attachement points.
We do not allow multiple attachment points per substitution position.
Parameters:
Name Type Description Default
mol
Mol
molecule for which we need to open the attachment points
required
Source code in safe/utils.py
def list_individual_attach_points(mol: dm.Mol, depth: Optional[int] = None):\n \"\"\"List all individual attachement points.\n\n We do not allow multiple attachment points per substitution position.\n\n Args:\n mol: molecule for which we need to open the attachment points\n\n \"\"\"\n ATTACHING_RXN = ReactionFromSmarts(\"[*;h;!$([*][#0]):1]>>[*:1][*]\")\n mols = [mol]\n curated_prods = set()\n num_attachs = len(mol.GetSubstructMatches(dm.from_smarts(\"[*;h:1]\"), uniquify=True))\n depth = depth or 1\n depth = min(max(depth, 1), num_attachs)\n while depth > 0:\n prods = set()\n for mol in mols:\n mol = dm.to_mol(mol)\n for p in ATTACHING_RXN.RunReactants((mol,)):\n try:\n m = dm.sanitize_mol(p[0])\n sm = dm.to_smiles(m, canonical=True)\n sm = dm.reactions.add_brackets_to_attachment_points(sm)\n prods.add(dm.reactions.convert_attach_to_isotope(sm, as_smiles=True))\n except Exception as e:\n logger.error(e)\n curated_prods.update(prods)\n mols = prods\n depth -= 1\n return list(curated_prods)\n
"},{"location":"api/safe.html#safe.utils.mol_partition","title":"mol_partition(mol, query=None, seed=None, **kwargs)
","text":"Partition a molecule into fragments using a bond query
Parameters:
Name Type Description Default
mol
Mol
molecule to split
required
query
Optional[Mol]
bond query to use for splitting
None
seed
Optional[int]
random seed
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
@py_random_state(\"seed\")\ndef mol_partition(\n mol: dm.Mol, query: Optional[dm.Mol] = None, seed: Optional[int] = None, **kwargs: Any\n):\n \"\"\"Partition a molecule into fragments using a bond query\n\n Args:\n mol: molecule to split\n query: bond query to use for splitting\n seed: random seed\n kwargs: additional arguments to pass to the partitioning algorithm\n\n \"\"\"\n resolution = kwargs.get(\"resolution\", 1.0)\n threshold = kwargs.get(\"threshold\", 1e-7)\n weight = kwargs.get(\"weight\", \"weight\")\n\n if query is None:\n query = __mmpa_query\n\n G = dm.graph.to_graph(mol)\n bond_partition = [\n tuple(sorted(match)) for match in mol.GetSubstructMatches(query, uniquify=True)\n ]\n\n def get_relevant_edges(e1, e2):\n return tuple(sorted([e1, e2])) not in bond_partition\n\n subgraphs = nx.subgraph_view(G, filter_edge=get_relevant_edges)\n\n partition = [{u} for u in G.nodes()]\n inner_partition = sorted(nx.connected_components(subgraphs), key=lambda x: min(x))\n mod = nx.algorithms.community.modularity(\n G, inner_partition, resolution=resolution, weight=weight\n )\n is_directed = G.is_directed()\n graph = G.__class__()\n graph.add_nodes_from(G)\n graph.add_weighted_edges_from(G.edges(data=weight, default=1))\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n m = graph.size(weight=\"weight\")\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, inner_partition, resolution, is_directed, seed\n )\n improvement = True\n while improvement:\n # gh-5901 protect the sets in the yielded list from further manipulation here\n yield [s.copy() for s in partition]\n new_mod = nx.algorithms.community.modularity(\n graph, inner_partition, resolution=resolution, weight=\"weight\"\n )\n if new_mod - mod <= threshold:\n return\n mod = new_mod\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, partition, resolution, is_directed, seed\n )\n
"},{"location":"api/safe.html#safe.utils.standardize_attach","title":"standardize_attach(inputs, standard_attach='[*]')
","text":"Standardize the attachment points of a molecule
Parameters:
Name Type Description Default
inputs
str
input molecule
required
standard_attach
str
standard attachment point to use
'[*]'
Source code in safe/utils.py
def standardize_attach(inputs: str, standard_attach: str = \"[*]\"):\n \"\"\"Standardize the attachment points of a molecule\n\n Args:\n inputs: input molecule\n standard_attach: standard attachment point to use\n \"\"\"\n\n for attach_regex in _SMILES_ATTACHMENT_POINTS:\n inputs = re.sub(attach_regex, standard_attach, inputs)\n return inputs\n
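A hedged usage sketch; the exact notations recognized depend on the _SMILES_ATTACHMENT_POINTS patterns, so the input below is an assumption:
from safe.utils import standardize_attach\n\nprint(standardize_attach(\"[1*]CCO\"))  # expected to print \"[*]CCO\" if isotope-labeled dummies are matched\n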
"},{"location":"api/safe.models.html","title":"Model training","text":""},{"location":"api/safe.models.html#config-file","title":"Config File","text":"The input config file for training a SAFE
model is very similar to the GPT2 config file, with the addition of an optional num_labels
attribute for training with descriptors regularization.
{\n \"activation_function\": \"gelu_new\",\n \"attn_pdrop\": 0.1,\n \"bos_token_id\": 10000,\n \"embd_pdrop\": 0.1,\n \"eos_token_id\": 1,\n \"initializer_range\": 0.02,\n \"layer_norm_epsilon\": 1e-05,\n \"model_type\": \"gpt2\",\n \"n_embd\": 768,\n \"n_head\": 12,\n \"n_inner\": null,\n \"n_layer\": 12,\n \"n_positions\": 1024,\n \"reorder_and_upcast_attn\": false,\n \"resid_pdrop\": 0.1,\n \"scale_attn_by_inverse_layer_idx\": false,\n \"scale_attn_weights\": true,\n \"summary_activation\": \"tanh\",\n \"summary_first_dropout\": 0.1,\n \"summary_proj_to_labels\": true,\n \"summary_type\": \"cls_index\",\n \"summary_hidden_size\": 128,\n \"summary_use_proj\": true,\n \"transformers_version\": \"4.31.0\",\n \"use_cache\": true,\n \"vocab_size\": 10000,\n \"num_labels\": 9\n}\n
"},{"location":"api/safe.models.html#safe-model","title":"SAFE Model","text":""},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead","title":"PropertyHead
","text":" Bases: Module
Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Default
config
[`PretrainedConfig`]
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual config class of your model for the default values it uses):
summary_type (str) -- The method to use to make this summary. Accepted values are:
- \"last\" -- Take the last token hidden state (like XLNet)
- \"first\" -- Take the first token hidden state (like Bert)
- \"mean\" -- Take the mean of all tokens hidden states
- \"cls_index\" -- Supply a Tensor of classification token position (GPT/GPT-2)
summary_activation (Optional[str]) -- Set to \"tanh\" to add a tanh activation to the output, another string, or None to add no activation.
Source code in safe/trainer/model.py
class PropertyHead(torch.nn.Module):\n r\"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n config ([`PretrainedConfig`]):\n The config used by the model. Relevant arguments in the config class of the model are (refer to the actual\n config class of your model for the default values it uses):\n\n - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:\n\n - `\"last\"` -- Take the last token hidden state (like XLNet)\n - `\"first\"` -- Take the first token hidden state (like Bert)\n - `\"mean\"` -- Take the mean of all tokens hidden states\n - `\"cls_index\"` -- Supply a Tensor of classification token position (GPT/GPT-2)\n\n - **summary_activation** (`Optional[str]`) -- Set to `\"tanh\"` to add a tanh activation to the output,\n another string, or `None` to add no activation.\n \"\"\"\n\n def __init__(self, config: PretrainedConfig):\n super().__init__()\n\n self.summary_type = getattr(config, \"summary_type\", \"cls_index\")\n self.summary = torch.nn.Identity()\n last_hidden_size = config.hidden_size\n\n if getattr(config, \"summary_hidden_size\", None) and config.summary_hidden_size > 0:\n self.summary = nn.Linear(config.hidden_size, config.summary_hidden_size)\n last_hidden_size = config.summary_hidden_size\n\n activation_string = getattr(config, \"summary_activation\", None)\n self.activation: Callable = (\n get_activation(activation_string) if activation_string else nn.Identity()\n )\n\n self.out = torch.nn.Identity()\n if getattr(config, \"num_labels\", None) and config.num_labels > 0:\n num_labels = config.num_labels\n self.out = nn.Linear(last_hidden_size, num_labels)\n\n def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n ) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead.forward","title":"forward(hidden_states, cls_index=None)
","text":"Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Default
hidden_states
FloatTensor
torch.FloatTensor
of shape [batch_size, seq_len, hidden_size]
) The hidden states of the last layer.
cls_index
Optional[LongTensor]
torch.LongTensor
of shape [batch_size]
or [batch_size, ...]
where ... are optional leading dimensions of hidden_states
, optional Used if summary_type == \"cls_index\"
and takes the last token of the sequence as classification token.
None
Returns:
Type DescriptionFloatTensor
torch.FloatTensor
: The summary of the sequence hidden states.
safe/trainer/model.py
def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel","title":"SAFEDoubleHeadsModel
","text":" Bases: GPT2DoubleHeadsModel
The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head
Source code in safe/trainer/model.py
class SAFEDoubleHeadsModel(GPT2DoubleHeadsModel):\n \"\"\"The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head\"\"\"\n\n def __init__(self, config):\n self.num_labels = getattr(config, \"num_labels\", None)\n super().__init__(config)\n self.config.num_labels = self.num_labels\n del self.multiple_choice_head\n self.multiple_choice_head = PropertyHead(config)\n\n @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\n def forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
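As a hedged usage sketch (not an official snippet), the model can be loaded from the public datamol-io/safe-gpt checkpoint used later in the tutorials and run on a tokenized SAFE string; passing labels=input_ids yields the causal language-modeling loss.

```python
# Sketch under the assumption that the "datamol-io/safe-gpt" checkpoint
# (used later in this documentation) is available locally or on the hub.
import torch
from safe.trainer.model import SAFEDoubleHeadsModel
from safe.tokenizer import SAFETokenizer

model = SAFEDoubleHeadsModel.from_pretrained("datamol-io/safe-gpt")
tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt").get_pretrained()

safe_str = "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"  # SAFE string from the tutorial below
batch = tokenizer([safe_str], return_tensors="pt")

with torch.no_grad():
    out = model(**batch, labels=batch["input_ids"])

print(out.loss)          # language-modeling loss
print(out.logits.shape)  # (batch_size, seq_len, vocab_size)
```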
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel.forward","title":"forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, labels=None, mc_labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, inputs=None, encoder_hidden_states=None, **kwargs)
","text":"Parameters:
Name | Type | Description | Default
mc_token_ids | `torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, defaults to the index of the last token of the input | Index of the classification token in each input sequence. Selected in the range [0, input_ids.size(-1) - 1]. | None
labels | `torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional* | Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids. Indices are selected in [-100, 0, ..., config.vocab_size - 1]. All labels set to -100 are ignored (masked); the loss is only computed for labels in [0, ..., config.vocab_size - 1]. | None
mc_labels | `torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional* | Labels for computing the supervised loss for regularization. | None
inputs | Optional[Any] | List of inputs, kept here because the trainer removes information not in the signature. | None
Returns: output (GPT2DoubleHeadsModelOutput): output of the model
Source code in safe/trainer/model.py
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\ndef forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 
1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
"},{"location":"api/safe.models.html#trainer","title":"Trainer","text":""},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer","title":"SAFETrainer
","text":" Bases: Trainer
Custom trainer for training the SAFE model.
This custom trainer changes the loss function to support the property head.
Source code in safe/trainer/trainer_utils.py
class SAFETrainer(Trainer):\n \"\"\"\n Custom trainer for training SAFE model.\n\n This custom trainer changes the loss function to support the property head\n\n \"\"\"\n\n def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):\n super().__init__(*args, **kwargs)\n self.prop_loss_coeff = prop_loss_coeff\n\n def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
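The only behavioural change is the weighted sum loss = lm_loss + prop_loss_coeff * mc_loss. The sketch below shows how the trainer might be wired up; model, train_dataset and collator are placeholders standing in for a SAFEDoubleHeadsModel, a tokenized dataset and a SAFECollator, and the TrainingArguments values are arbitrary.

```python
# Hedged sketch: `model`, `train_dataset` and `collator` are placeholders,
# not objects defined in this documentation.
from transformers import TrainingArguments
from safe.trainer.trainer_utils import SAFETrainer

args = TrainingArguments(output_dir="safe-run", per_device_train_batch_size=32)

trainer = SAFETrainer(
    model=model,                  # placeholder: a SAFEDoubleHeadsModel
    args=args,
    train_dataset=train_dataset,  # placeholder: a tokenized dataset (see get_dataset below)
    data_collator=collator,       # placeholder: e.g. a SAFECollator
    prop_loss_coeff=1e-3,         # total loss = lm_loss + 1e-3 * mc_loss
)
# trainer.train()
```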
"},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer.compute_loss","title":"compute_loss(model, inputs, return_outputs=False)
","text":"How the loss is computed by Trainer. By default, all models return the loss in the first element.
Source code in safe/trainer/trainer_utils.py
def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
"},{"location":"api/safe.models.html#data-collator","title":"Data Collator","text":""},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator","title":"SAFECollator
","text":"Collate function for language modelling tasks
Note
The collate function is based on the default DataCollatorForLanguageModeling in HuggingFace; see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py
Source code in safe/trainer/collator.py
class SAFECollator:\n \"\"\"Collate function for language modelling tasks\n\n\n !!! note\n The collate function is based on the default DataCollatorForLanguageModeling in huggingface\n see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py\n \"\"\"\n\n def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n ):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n\n @functools.lru_cache()\n def get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n\n def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
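A hedged sketch of the collator in isolation: it tokenizes and pads the raw SAFE strings and builds labels with padding positions masked to -100. The two SAFE strings are borrowed from elsewhere in this documentation, and the checkpoint name is the public one used in the tutorials.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.collator import SAFECollator

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
collator = SAFECollator(tokenizer, input_key="inputs")

samples = [
    {"inputs": "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"},
    {"inputs": "c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1"},
]
batch = collator(samples)
print(batch["input_ids"].shape, batch["labels"].shape)  # both padded to the same length
```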
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__call__","title":"__call__(samples)
","text":"Call collate function
Parameters:
Name | Type | Description | Default
samples | List[Union[List[int], Any, Dict[str, Any]]] | list of examples | required
Source code in safe/trainer/collator.py
def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__init__","title":"__init__(tokenizer, pad_to_multiple_of=None, input_key='inputs', label_key='labels', property_key='descriptors', include_descriptors=False, max_length=None)
","text":"Default collator for huggingface transformers in izanagi.
Parameters:
Name | Type | Description | Default
tokenizer | Tokenizer | Huggingface tokenizer | required
input_key | str | key to use for input ids | 'inputs'
label_key | str | key to use for labels | 'labels'
property_key | str | key to use for properties | 'descriptors'
include_descriptors | bool | whether to include training on descriptors or not | False
pad_to_multiple_of | Optional[int] | pad to multiple of this value | None
Source code in safe/trainer/collator.py
def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.get_tokenizer","title":"get_tokenizer()
cached
","text":"Get underlying tokenizer
Source code in safe/trainer/collator.py
@functools.lru_cache()\ndef get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n
"},{"location":"api/safe.models.html#data-utils","title":"Data Utils","text":""},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset","title":"get_dataset(data_path, name=None, tokenizer=None, cache_dir=None, streaming=True, use_auth_token=False, tokenize_column='inputs', property_column='descriptors', max_length=None, num_shards=1024)
","text":"Get the datasets from the config file
Source code in safe/trainer/data_utils.py
def get_dataset(\n data_path,\n name: Optional[str] = None,\n tokenizer: Optional[Callable] = None,\n cache_dir: Optional[str] = None,\n streaming: bool = True,\n use_auth_token: bool = False,\n tokenize_column: Optional[str] = \"inputs\",\n property_column: Optional[str] = \"descriptors\",\n max_length: Optional[int] = None,\n num_shards=1024,\n):\n \"\"\"Get the datasets from the config file\"\"\"\n raw_datasets = {}\n if data_path is not None:\n data_path = upath.UPath(str(data_path))\n\n if data_path.exists():\n # then we need to load from disk\n data_path = str(data_path)\n # for some reason, the datasets package is not able to load the dataset\n # because the split where not originally proposed\n raw_datasets = datasets.load_from_disk(data_path)\n\n if streaming:\n if isinstance(raw_datasets, datasets.DatasetDict):\n previous_num_examples = {k: len(dt) for k, dt in raw_datasets.items()}\n raw_datasets = datasets.IterableDatasetDict(\n {\n k: dt.to_iterable_dataset(num_shards=num_shards)\n for k, dt in raw_datasets.items()\n }\n )\n for k, dt in raw_datasets.items():\n if previous_num_examples[k] is not None:\n setattr(dt, \"num_examples\", previous_num_examples[k])\n else:\n num_examples = len(raw_datasets)\n raw_datasets = raw_datasets.to_iterable_dataset(num_shards=num_shards)\n setattr(raw_datasets, \"num_examples\", num_examples)\n\n else:\n data_path = str(data_path)\n raw_datasets = datasets.load_dataset(\n data_path,\n name=name,\n cache_dir=cache_dir,\n use_auth_token=True if use_auth_token else None,\n streaming=streaming,\n )\n # that means we need to return a tokenized version of the dataset\n\n if property_column not in [\"mc_labels\", None]:\n raw_datasets = raw_datasets.rename_column(property_column, \"mc_labels\")\n\n columns_to_remove = None\n if tokenize_column is not None:\n columns_to_remove = [\n x\n for x in (get_dataset_column_names(raw_datasets) or [])\n if x not in [tokenize_column, \"mc_labels\"] and \"label\" not in x\n ] or None\n\n if tokenizer is None:\n if columns_to_remove is not None:\n raw_datasets = raw_datasets.remove_columns(columns_to_remove)\n return raw_datasets\n\n return raw_datasets.map(\n partial(\n tokenize_fn,\n tokenizer=tokenizer,\n tokenize_column=tokenize_column,\n max_length=max_length,\n ),\n batched=True,\n remove_columns=columns_to_remove,\n )\n
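A hedged call sketch: "path/to/dataset" is a placeholder for either a dataset saved with datasets.save_to_disk or a hub dataset name, and the column names simply restate the defaults.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import get_dataset

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")

dataset = get_dataset(
    "path/to/dataset",              # placeholder path or hub dataset name
    tokenizer=tokenizer,
    streaming=True,
    tokenize_column="inputs",       # column holding the SAFE strings
    property_column="descriptors",  # renamed to "mc_labels" internally
    max_length=512,
)
```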
"},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset_column_names","title":"get_dataset_column_names(dataset)
","text":"Get the column names in a dataset
Parameters:
Name | Type | Description | Default
dataset | Union[Dataset, IterableDataset, Mapping] | dataset to get the column names from | required
Source code in safe/trainer/data_utils.py
def get_dataset_column_names(dataset: Union[datasets.Dataset, datasets.IterableDataset, Mapping]):\n \"\"\"Get the column names in a dataset\n\n Args:\n dataset: dataset to get the column names from\n\n \"\"\"\n if isinstance(dataset, (datasets.IterableDatasetDict, Mapping)):\n column_names = {split: dataset[split].column_names for split in dataset}\n else:\n column_names = dataset.column_names\n if isinstance(column_names, dict):\n column_names = list(column_names.values())[0]\n return column_names\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.take","title":"take(n, iterable)
","text":"Return first n items of the iterable as a list
Source code insafe/trainer/data_utils.py
def take(n, iterable):\n \"Return first n items of the iterable as a list\"\n return list(itertools.islice(iterable, n))\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.tokenize_fn","title":"tokenize_fn(row, tokenizer, tokenize_column='inputs', max_length=None, padding=False)
","text":"Perform the tokenization of a row Args: row: row to tokenize tokenizer: tokenizer to use tokenize_column: column to tokenize max_length: maximum size of the tokenized sequence padding: whether to pad the sequence
Source code in safe/trainer/data_utils.py
def tokenize_fn(\n row: Dict[str, Any],\n tokenizer: Callable,\n tokenize_column: str = \"inputs\",\n max_length: Optional[int] = None,\n padding: bool = False,\n):\n \"\"\"Perform the tokenization of a row\n Args:\n row: row to tokenize\n tokenizer: tokenizer to use\n tokenize_column: column to tokenize\n max_length: maximum size of the tokenized sequence\n padding: whether to pad the sequence\n \"\"\"\n # there's probably a way to do this with the tokenizer settings\n # but again, gotta move fast\n\n fast_tokenizer = (\n tokenizer.get_pretrained() if isinstance(tokenizer, SAFETokenizer) else tokenizer\n )\n\n return fast_tokenizer(\n row[tokenize_column],\n truncation=(max_length is not None),\n max_length=max_length,\n padding=padding,\n return_tensors=None,\n )\n
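For completeness, a small sketch of calling tokenize_fn directly on a toy row; the SAFE string is reused from the tutorial below and the max_length value is arbitrary.

```python
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import tokenize_fn

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
row = {"inputs": ["c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"]}

encoded = tokenize_fn(row, tokenizer=tokenizer, tokenize_column="inputs", max_length=128)
print(list(encoded.keys()))  # e.g. ["input_ids", "attention_mask", ...]
```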
"},{"location":"api/safe.viz.html","title":"Visualization","text":""},{"location":"api/safe.viz.html#safe.viz.to_image","title":"to_image(safe_str, fragments=None, legend=None, mol_size=(300, 300), use_svg=True, highlight_mode='lasso', highlight_bond_width_multiplier=12, **kwargs)
","text":"Display a safe string by highlighting the fragments that make it.
Parameters:
Name | Type | Description | Default
safe_str | str | the safe string to display | required
fragments | Optional[Union[str, Mol]] | list of fragments to highlight on the molecule. If None, the SAFE decomposition of the molecule will be used. | None
legend | Union[str, None] | A string to use as the legend under the molecule. | None
mol_size | Union[Tuple[int, int], int] | The size of the image to be returned | (300, 300)
use_svg | Optional[bool] | Whether to return an svg or png image | True
highlight_mode | Optional[str] | the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown | 'lasso'
highlight_bond_width_multiplier | int | the multiplier to use for the bond width when using the 'fill' mode | 12
**kwargs | Any | Additional arguments to pass to the drawing function. See the RDKit documentation related to MolDrawOptions for more details at https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html. | {}
Source code in safe/viz.py
def to_image(\n safe_str: str,\n fragments: Optional[Union[str, dm.Mol]] = None,\n legend: Union[str, None] = None,\n mol_size: Union[Tuple[int, int], int] = (300, 300),\n use_svg: Optional[bool] = True,\n highlight_mode: Optional[str] = \"lasso\",\n highlight_bond_width_multiplier: int = 12,\n **kwargs: Any,\n):\n \"\"\"Display a safe string by highlighting the fragments that make it.\n\n Args:\n safe_str: the safe string to display\n fragments: list of fragment to highlight on the molecules. If None, will use safe decomposition of the molecule.\n legend: A string to use as the legend under the molecule.\n mol_size: The size of the image to be returned\n use_svg: Whether to return an svg or png image\n highlight_mode: the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown\n highlight_bond_width_multiplier: the multiplier to use for the bond width when using the 'fill' mode\n **kwargs: Additional arguments to pass to the drawing function. See RDKit\n documentation related to `MolDrawOptions` for more details at\n https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.\n\n \"\"\"\n\n kwargs[\"legends\"] = legend\n kwargs[\"mol_size\"] = mol_size\n kwargs[\"use_svg\"] = use_svg\n if highlight_bond_width_multiplier is not None:\n kwargs[\"highlightBondWidthMultiplier\"] = highlight_bond_width_multiplier\n\n if highlight_mode == \"color\":\n kwargs[\"continuousHighlight\"] = False\n kwargs[\"circleAtoms\"] = kwargs.get(\"circleAtoms\", False) or False\n\n if isinstance(fragments, (str, dm.Mol)):\n fragments = [fragments]\n\n if fragments is None and highlight_mode is not None:\n fragments = [\n sf.decode(x, as_mol=False, remove_dummies=False, ignore_errors=False)\n for x in safe_str.split(\".\")\n ]\n elif fragments and len(fragments) > 0:\n parsed_fragments = []\n for fg in fragments:\n if isinstance(fg, str) and dm.to_mol(fg) is None:\n fg = sf.decode(fg, as_mol=False, remove_dummies=False, ignore_errors=False)\n parsed_fragments.append(fg)\n fragments = parsed_fragments\n else:\n fragments = []\n mol = dm.to_mol(safe_str, remove_hs=False)\n cm = plt.get_cmap(\"gist_rainbow\")\n current_colors = [cm(1.0 * i / len(fragments)) for i in range(len(fragments))]\n\n if highlight_mode == \"lasso\":\n return dm.viz.lasso_highlight_image(mol, fragments, **kwargs)\n\n atom_indices = []\n bond_indices = []\n atom_colors = {}\n bond_colors = {}\n\n for i, frag in enumerate(fragments):\n frag = dm.from_smarts(frag)\n atom_matches, bond_matches = dm.substructure_matching_bonds(mol, frag)\n atom_matches = list(itertools.chain(*atom_matches))\n bond_matches = list(itertools.chain(*bond_matches))\n atom_indices.extend(atom_matches)\n bond_indices.extend(bond_matches)\n atom_colors.update({x: current_colors[i] for x in atom_matches})\n bond_colors.update({x: current_colors[i] for x in bond_matches})\n\n return dm.viz.to_image(\n mol,\n highlight_atom=[atom_indices],\n highlight_bond=[bond_indices],\n highlightAtomColors=[atom_colors],\n highlightBondColors=[bond_colors],\n **kwargs,\n )\n
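A short usage sketch mirroring the tutorial cells further down: encode a molecule to SAFE and render it with one of the highlight modes.

```python
import datamol as dm
import safe as sf

celecoxib = dm.to_mol("Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1")
safe_str = sf.encode(celecoxib)

# returns an SVG image by default; highlight_mode can be "lasso", "fill" or "color"
img = sf.to_image(safe_str, highlight_mode="fill", legend="fill mode", mol_size=(300, 300))
```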
"},{"location":"tutorials/design-with-safe.html","title":"Molecular design","text":"In\u00a0[2]: Copied! import os\n\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n\n\nimport safe as sf\nimport datamol as dm\nimport os os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\" import safe as sf import datamol as dm
Load the default pretrained SAFE model.
We will use this single model for all the downstream molecular design tasks.
In\u00a0[3]: Copied!designer = sf.SAFEDesign.load_default(verbose=True)\n\ndesigner.model\ndesigner = sf.SAFEDesign.load_default(verbose=True) designer.model Out[3]:
SAFEDoubleHeadsModel(\n (transformer): GPT2Model(\n (wte): Embedding(1880, 768)\n (wpe): Embedding(1024, 768)\n (drop): Dropout(p=0.1, inplace=False)\n (h): ModuleList(\n (0-11): 12 x GPT2Block(\n (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (attn): GPT2Attention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n )\n (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (mlp): GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n )\n )\n )\n (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n )\n (lm_head): Linear(in_features=768, out_features=1880, bias=False)\n (multiple_choice_head): PropertyHead(\n (summary): Linear(in_features=768, out_features=64, bias=True)\n (activation): ReLU()\n (out): Linear(in_features=64, out_features=1, bias=True)\n )\n)
Let's start with the molecule below.
In\u00a0[4]: Copied!candidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\"\ncandidate_mol = dm.to_mol(candidate_smiles)\n\ndm.to_image(candidate_mol)\ncandidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\" candidate_mol = dm.to_mol(candidate_smiles) dm.to_image(candidate_mol) Out[4]: In\u00a0[6]: Copied!
generated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12)\n\ngenerated_smiles[:5]\ngenerated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12) generated_smiles[:5]
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:25.393 | INFO | safe.sample:de_novo_generation:581 - After sanitization, 82 / 100 (82.00 %) generated molecules are valid !\nOut[6]:
['CCCCOc1c(Br)cc(C)cc1-c1nc(C2(CC)CCN(C(C)C)CC2)cn2nc(C)nc12',\n 'CC(C)(C)OC(=O)Nc1ccc(C[NH+]2CC[C@@H]3OCCC[C@H]3C2)cn1',\n 'Cc1ccc(Br)c(NCCC(C)C(C)C)c1',\n 'CCOC(=O)C1=C(C)N=c2s/c(=C/c3c(C)[nH]c4ccccc34)c(=O)n2[C@@H]1c1ccc(OC)cc1',\n 'CCc1ccccc1-n1cc(O)c(C(=O)Nc2ccc(Cl)c(F)c2)n1']In\u00a0[7]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[7]: In\u00a0[8]: Copied!
scaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\"\n\ndm.to_image(scaffold)\nscaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\" dm.to_image(scaffold) Out[8]: In\u00a0[9]: Copied!
generated_smiles = designer.scaffold_decoration(\n scaffold=scaffold,\n n_samples_per_trial=12,\n n_trials=2,\n sanitize=True,\n do_not_fragment_further=True,\n)\n\ngenerated_mols = [dm.to_mol(x) for x in generated_smiles]\ngenerated_smiles = designer.scaffold_decoration( scaffold=scaffold, n_samples_per_trial=12, n_trials=2, sanitize=True, do_not_fragment_further=True, ) generated_mols = [dm.to_mol(x) for x in generated_smiles]
0%| | 0/2 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:48.620 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 21 / 24 (87.50 %) generated molecules are valid !\nIn\u00a0[10]: Copied!
dm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1)\ndm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1) Out[10]: In\u00a0[11]: Copied!
superstructure = \"c1ccc2ncncc2c1\"\n\ndm.to_image(superstructure)\nsuperstructure = \"c1ccc2ncncc2c1\" dm.to_image(superstructure) Out[11]: In\u00a0[12]: Copied!
generated_smiles = designer.super_structure(\n core=superstructure,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n attachment_point_depth=3,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.super_structure( core=superstructure, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, attachment_point_depth=3, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:38:24.884 | INFO | safe.sample:super_structure:496 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[12]:
['c1ncc2c(N3CCOCC3)ccc(N3CCNCC3)c2n1',\n 'N[C@H](CNc1ccc(C(F)(F)F)c2ncncc12)C(F)(F)F',\n 'C=CCCCNC(=S)Nc1ccc(C(F)(F)F)c2cncnc12',\n 'O=C(N[C@@H](CO)CCF)c1ccc(C(=O)[O-])c2ncncc12',\n 'O=C(CC=Nc1ccc(OC(F)(F)F)c2ncncc12)C(F)(F)F',\n 'NC(=Nc1ccc([N+](=O)[O-])c2cncnc12)C(F)(F)F',\n 'O=C(CCC(F)=C(F)F)Nc1ccc(C(F)(F)F)c2ncncc12',\n 'O=S(=O)(CCC(F)(F)F)Nc1cccc2cncnc12',\n 'O=S(=O)(Cl)c1ccc(C(F)(F)F)c2ncncc12',\n 'c1ncc2c(N3CCCCCC3)ccc(-c3cn[nH]c3)c2n1',\n 'NC(=O)CSCC(=O)Nc1ccc(C(=O)[O-])c2ncncc12',\n 'c1ncc2c(-n3cncn3)ccc(C3CCCCN3)c2n1']In\u00a0[14]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[14]: In\u00a0[15]: Copied!
motif = \"[*]-N1CCCCC1\"\n\ndm.to_image(motif)\nmotif = \"[*]-N1CCCCC1\" dm.to_image(motif) Out[15]: In\u00a0[26]: Copied!
# let's make some long sequence\ngenerated_smiles = designer.motif_extension(\n motif=motif,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n min_length=25,\n max_length=80,\n)\n\ngenerated_smiles\n# let's make some long sequence generated_smiles = designer.motif_extension( motif=motif, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, min_length=25, max_length=80, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:41:52.959 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 10 / 12 (83.33 %) generated molecules are valid !\nOut[26]:
['C1CCN([C@@H]2CCCC[C@@H]2[NH+]2CCOCC2)CC1',\n 'FC(F)(F)C(F)(F)CN1CCCCC1',\n 'O=NN(/C(=C/N1CCCCC1)N1CCCCC1)c1ccccc1',\n 'C1CCC(CC2(CC3CCCC3)CCCCC2C2CCCCCC2N2CCCCC2)CC1',\n '[Na+].[Na+].[O-]S(=S)(=S)N1CCCCC1',\n 'NC(CS)C(O)=NC(O)C(=O)N1CCCCC1',\n 'O=P(O)(O)CCOCCOP(=O)(O)SCCN1CCCCC1',\n 'C1CCN(N=c2nn[nH][nH]2)CC1.O.O',\n 'N.N#CC1C=CCN1N1CCCCC1',\n 'O=C1CCCCC1.O=C1COCCCN1N1CCCCC1']In\u00a0[27]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[27]: In\u00a0[28]: Copied!
side_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\"\n\ndm.to_image(side_chains)\nside_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\" dm.to_image(side_chains) Out[28]: In\u00a0[29]: Copied!
generated_smiles = designer.scaffold_morphing(\n side_chains=side_chains,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ndm.to_image(generated_smiles[:12], mol_size=(350, 200))\ngenerated_smiles = designer.scaffold_morphing( side_chains=side_chains, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) dm.to_image(generated_smiles[:12], mol_size=(350, 200))
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:05.888 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[29]: In\u00a0[30]: Copied!
linker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"]\n\ndm.to_image(linker_generation)\nlinker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"] dm.to_image(linker_generation) Out[30]: In\u00a0[31]: Copied!
generated_smiles = designer.linker_generation(\n *linker_generation,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.linker_generation( *linker_generation, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:14.034 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[31]:
['O=C(Oc1cccc(-c2nc(N3CCCCC3)nc3c2CCN3)c1)c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'O=C(Oc1cccc(-c2nc(-c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)nc3c2CCN3)c1)N1CCCCC1',\n 'N=C(N)NCCCN1C(=O)N(CN2CCCCC2)C(=O)C2CC(c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)CC21',\n 'N=C(N)NCCCN1C(=O)N(Cc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)C(=O)C2CC(N3CCCCC3)CC21',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc5c4oc4c6ccccc6c(Nc6cccc(N7CCCCC7)c6)cc54)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc(Nc5cc6c7cccc(N8CCCCC8)c7oc6c6ccccc56)c4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cc(-c5nc6n(n5)CC=C[C@H]6N5CCCCC5)ncn4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc([C@@H]4C=CCn5nc(-c6cc(N7CCCCC7)ncn6)nc54)cc23)c1',\n 'O=C1C[C@@H]2C[C@H]3[C@H](N4CCCCC4)CC4COCCC42O[C@@H]3CC(CCc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)O1',\n 'O=C1C[C@@H]2C[C@@H]3[C@@H](CC(CCN4CCCCC4)O1)OC21CCOCC1C[C@H]3c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'Brc1cccc(Nc2ncnc3ccc(NNc4ccc(SCCCCCCc5ccc(N6CCCCC6)cc5)cc4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4ccc(CCCCCCSc5ccc(NNN6CCCCC6)cc5)cc4)cc23)c1']In\u00a0[32]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[32]:
The End!
"},{"location":"tutorials/design-with-safe.html#de-novo-generation","title":"De novo generation\u00b6","text":"Generation of novel molecules without any constraints.
"},{"location":"tutorials/design-with-safe.html#scaffold-decoration","title":"Scaffold Decoration\u00b6","text":"For scaffold decoration, we wish to generate new molecules that would contain a given scaffold as core. Usually, the attachment point on the scaffold should dictate where the new vectors will be added.
"},{"location":"tutorials/design-with-safe.html#super-structure-generation","title":"Super structure generation\u00b6","text":"In super structure generation, we just want to generate superstructure of a molecular subgraph
"},{"location":"tutorials/design-with-safe.html#motif-extension","title":"Motif Extension\u00b6","text":"In motif extension, we are interested in generating a molecule containing a given motif as starting point.
"},{"location":"tutorials/design-with-safe.html#scaffold-morphing","title":"Scaffold Morphing\u00b6","text":"In scaffold morphing, we wish to replace a scaffold by another one in a molecule. The process requires as input that the user provides either the side chains or the input molecules and the core
"},{"location":"tutorials/design-with-safe.html#linker-generation","title":"Linker generation\u00b6","text":"Linker generation is mostly the same thing as scaffold morphing ...
"},{"location":"tutorials/extracting-representation-molfeat.html","title":"so really we just need our custom converter","text":"In\u00a0[1]: Copied!%load_ext autoreload\n%autoreload 2\n%load_ext autoreload %autoreload 2 In\u00a0[2]: Copied!
import safe\nimport torch\nimport datamol as dm\nimport types\nfrom molfeat.trans.pretrained import PretrainedMolTransformer\nfrom molfeat.trans.pretrained import PretrainedHFTransformer\n\nfrom molfeat.trans.pretrained.hf_transformers import HFModel\nfrom safe.trainer.model import SAFEDoubleHeadsModel\nfrom safe.tokenizer import SAFETokenizer\nimport safe import torch import datamol as dm import types from molfeat.trans.pretrained import PretrainedMolTransformer from molfeat.trans.pretrained import PretrainedHFTransformer from molfeat.trans.pretrained.hf_transformers import HFModel from safe.trainer.model import SAFEDoubleHeadsModel from safe.tokenizer import SAFETokenizer In\u00a0[3]: Copied!
safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")
We now need to build the molfeat
's HFModel
instance by wrapping our model.
safe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\nsafe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())
You can put the above process in the __init__
of the SAFEMolTransformer
if you wish as we will be doing below.
class SAFEMolTransformer(PretrainedHFTransformer):\n \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is \n how we convert the input molecules into the safe format\"\"\"\n def __init__(self, kind=None, notation=\"safe\", **kwargs):\n if kind is None:\n # we load the default SAFE model if the exact SAFE GPT model \n # to use is not provided\n safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\n safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\n super().__init__(kind, notation=None, **kwargs)\n # now we change the internal converter\n # overriding the internal converter of SmilesConverter leverages the exception handling\n # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS\n # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds.\n self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe)\n # you could also do any of the following:\n # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode)\n # self.converter = safe # the safe module\nclass SAFEMolTransformer(PretrainedHFTransformer): \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is how we convert the input molecules into the safe format\"\"\" def __init__(self, kind=None, notation=\"safe\", **kwargs): if kind is None: # we load the default SAFE model if the exact SAFE GPT model # to use is not provided safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\") kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained()) super().__init__(kind, notation=None, **kwargs) # now we change the internal converter # overriding the internal converter of SmilesConverter leverages the exception handling # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds. self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe) # you could also do any of the following: # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode) # self.converter = safe # the safe module
2023-12-20 22:57:39.310 | WARNING | molfeat.trans.base:__init__:51 - The 'SAFEMolTransformer' interaction has been superseded by a new class with id 0x2ad77d6a0\n
Let's use the GPT pooler, which uses the last non-padding token (often eos) since the model is GPT2-like. For other options, see: https://molfeat-docs.datamol.io/stable/api/molfeat.utils.html#pooling
# Let's use the GPT pooling method and only take the last hidden layer\nsafe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1])\nsafe_transformers\n# Let's use the GPT pooling method and only take the last hidden layer safe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1]) safe_transformers Out[116]:
SAFEMolTransformer(dtype=np.float32)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.SAFEMolTransformer
SAFEMolTransformer(dtype=np.float32)In\u00a0[117]: Copied!
mols = dm.data.freesolv().iloc[:10].smiles.values\nmols = dm.data.freesolv().iloc[:10].smiles.values In\u00a0[118]: Copied!
safe_transformers(mols)\nsafe_transformers(mols) Out[118]:
array([[ 0.05216356, 0.10754181, 0.07509107, ..., 0.04756968,\n -0.08228929, -0.11568106],\n [ 0.02449008, 0.04048932, 0.14489463, ..., 0.11410899,\n -0.02203353, 0.08706839],\n [-0.07425696, 0.11859665, 0.19010407, ..., 0.10526019,\n 0.08878426, -0.06609854],\n ...,\n [ 0.07867863, 0.19300285, 0.23054805, ..., -0.00737952,\n 0.07542405, 0.00289541],\n [ 0.12092628, -0.01785688, 0.19791883, ..., 0.13796932,\n 0.11520796, -0.15333697],\n [-0.02005584, 0.13946685, 0.18568742, ..., 0.07080407,\n 0.06991849, -0.07151204]], dtype=float32)In\u00a0[119]: Copied!
from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\n\ndf = dm.data.freesolv()\ndf[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode)\ndf = df.dropna(subset=\"safe\")\n# we have to remove the molecules that cannot be converted \n# (no breakable bonds with our default methodology)\nfrom sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline df = dm.data.freesolv() df[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode) df = df.dropna(subset=\"safe\") # we have to remove the molecules that cannot be converted # (no breakable bonds with our default methodology) In\u00a0[120]: Copied!
X, y = df[\"smiles\"].values, df[\"expt\"].values\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2)\n\n# The Molfeat transformer seamlessly integrates with Scikit-learn Pipeline!\npipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())])\nX, y = df[\"smiles\"].values, df[\"expt\"].values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2) # The Molfeat transformer seamlessly integrates with Scikit-learn Pipeline! pipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())]) In\u00a0[121]: Copied!
with dm.without_rdkit_log():\n pipe.fit(X_train, y_train)\n score = pipe.score(X_test, y_test)\n y_pred = pipe.predict(X_test)\nwith dm.without_rdkit_log(): pipe.fit(X_train, y_train) score = pipe.score(X_test, y_test) y_pred = pipe.predict(X_test) In\u00a0[122]: Copied!
print(\"R2 score:\", score)\nprint(\"R2 score:\", score)
R2 score: 0.4971483821661925\nIn\u00a0[123]: Copied!
import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots()\nax.scatter(y_test, y_pred)\nax.set_xlabel(\"Target\")\nax.set_ylabel(\"Preds\")\nimport matplotlib.pyplot as plt fig, ax = plt.subplots() ax.scatter(y_test, y_pred) ax.set_xlabel(\"Target\") ax.set_ylabel(\"Preds\") Out[123]:
Text(0, 0.5, 'Preds')
Not really a great result. Any other model in molfeat
would do better.
Because the SAFE model is not a standard HuggingFace transformers
model, we need to wrap it.
Why are we doing this? Because we want to leverage the structure of molfeat and not have to write our own pooling for the model. This can be done by using the huggingface molecule transformer PretrainedHFTransformer rather than the general-purpose pretrained model class PretrainedMolTransformer, where we would have to define our own _embed and _convert functions.
We have multiple options here: we can override the _convert method or even the _embed method, but the best thing about molfeat is how flexible it is and all the shortcuts it provides. In this case, we just need to change the custom converter.
"},{"location":"tutorials/extracting-representation-molfeat.html#so-really-we-just-need-our-custom-converter","title":"so really we just need our custom converter\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#basic-test","title":"Basic Test\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#tips","title":"Tips\u00b6","text":"None
molecules at some steps in the conversion to SAFE. This can happen if your slicing algorithm of choice is not working. In that case, please filter your datasets to remove molecules that fail the encoding steps first. You can always use the very robust safe.utils.convert_to_safe
, which augments the default BRICS slicing with some graph partitioning algorithm.import safe as sf\nimport datamol as dm\n\ncelecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\"\ncelecoxib_mol = dm.to_mol(celecoxib)\n\ndisplay(dm.to_image(celecoxib_mol))\nimport safe as sf import datamol as dm celecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\" celecoxib_mol = dm.to_mol(celecoxib) display(dm.to_image(celecoxib_mol)) In\u00a0[3]: Copied!
safe_str = sf.encode(celecoxib_mol)\n\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\nRepresentation using 4 fragments\n
SAFE strings are SMILES
Any SAFE string is a valid SMILES and can be read by RDKit without any decoding trick.
In\u00a0[4]: Copied!reconstructed = dm.to_mol(safe_str)\n\ndisplay(dm.to_image(reconstructed))\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nreconstructed = dm.to_mol(safe_str) display(dm.to_image(reconstructed)) assert dm.same_mol(celecoxib_mol, reconstructed)
SAFE supports randomization
You can generate randomized SAFE strings.
In\u00a0[5]: Copied!random_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True)\n\nprint(random_safe_str)\n\nreconstructed = dm.to_mol(safe_str)\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nrandom_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True) print(random_safe_str) reconstructed = dm.to_mol(safe_str) assert dm.same_mol(celecoxib_mol, reconstructed)
c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1\n
Fragment order in SAFE does not matter
Any permutation of the fragment order in a SAFE string preserves the molecule's identity
In\u00a0[6]: Copied!import numpy as np\n\nfragments = safe_str.split(\".\")\nrandomized_fragment_safe_str = np.random.permutation(fragments).tolist()\nrandomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str)\n\nprint(randomized_fragment_safe_str, safe_str)\nassert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)\nimport numpy as np fragments = safe_str.split(\".\") randomized_fragment_safe_str = np.random.permutation(fragments).tolist() randomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str) print(randomized_fragment_safe_str, safe_str) assert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)
c14ccc(S(N)(=O)=O)cc1.c15cc3nn14.Cc1ccc5cc1.C3(F)(F)F c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\n
Use your own slicing logic
By default, SAFE strings are generated using BRICS; however, the following slicing algorithms are also supported:
Hussain-Rea (hr)
RECAP (recap)
MMPA (mmpa)
attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atoms corresponding to the bonds to break.
In\u00a0[7]: Copied!def my_slicer(mol):\n \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\"\n for bond in mol.GetBonds():\n if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()):\n yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())\ndef my_slicer(mol): \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\" for bond in mol.GetBonds(): if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()): yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) In\u00a0[9]: Copied!
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer)
print(safe_str)
print(f"Representation using {len(safe_str.split('.'))} fragments")

c14cc(C(F)(F)F)nn13.c13ccc(S(N)(=O)=O)cc1.Cc1ccc4cc1
Representation using 3 fragments
Or simply use a SMARTS or a list of SMARTS.
# The above is equivalent to using the following SMARTS:
smart_slicer = ["[r]-;!@[r]"]
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer)
print(safe_str)
print(f"Representation using {len(safe_str.split('.'))} fragments")

c13cc(C(F)(F)F)nn14.c14ccc(S(N)(=O)=O)cc1.Cc1ccc3cc1
Representation using 3 fragments
safe_fragment = safe_str.split(".")
safe_fragment

['c13cc(C(F)(F)F)nn14', 'c14ccc(S(N)(=O)=O)cc1', 'Cc1ccc3cc1']
# the following will fail
dm.to_mol(safe_fragment[0])

[11:20:14] SMILES Parse Error: unclosed ring for input: 'c13cc(C(F)(F)F)nn14'
# while this works
sf.decode(safe_fragment[0], as_mol=True)

# if you want to keep the attachment points, then use remove_dummies=False
sf.decode(safe_fragment[0], as_mol=True, remove_dummies=False)

sf.to_image(safe_str)
There are 3 display modes for highlighting the fragments in a SAFE string. The difference between those modes is illustrated below using two different slicing algorithms.
Overlapping fragments
Note that because some fragments might match overlapping substructures of the molecule (for example, the same fragment appearing multiple times in the molecule), the highlighting might assign the same color to these fragments.
from IPython.display import display
from ipywidgets import widgets, HBox

def display_image(safe_str):
    image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode="lasso", legend="lasso mode").data.encode(), format='svg+xml')
    image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode="fill", legend="fill mode").data.encode(), format='svg+xml')
    image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode="color", legend="color mode").data.encode(), format='svg+xml')
    hbox = HBox([image_lasso, image_fill, image_color])
    display(hbox)
# display for brics
safe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer="brics")
display_image(safe_str_brics)

HBox(children=(Image(value=b'<svg xmlns="http://www.w3.org/2000/svg" ...', format='svg+xml'), Image(value=b'<s…
# display with MMPA
safe_str_mmpa = sf.encode(celecoxib_mol, canonical=True, slicer="mmpa")
display_image(safe_str_mmpa)

HBox(children=(Image(value=b'<svg xmlns="http://www.w3.org/2000/svg" ...', format='svg+xml'), Image(value=b'<s…
The End !
"},{"location":"tutorials/getting-started.html#getting-started-with-safe","title":"Getting Started with SAFE\u00b6","text":"The SAFE encoding format is a rewriting of SMILES to ensure that any molecule can be written as a sequence of fragments where atoms or tokens corresponding to given fragments form a substring (ontiguous sequence) in the line notation representation.
SAFE addresses some of the limitations of SMILES strings when it comes to generative design:
SAFE: native support for (sub)structure-constrained design. Others: different generative models for different generative tasks; extensive substructure matching for filtering after generation; multi-step generative processes (e.g. Liao et al. 2023); graph-based approaches with their limitations.
SAFE: any molecule generation as a simple NLP task (sequence completion or mask filling); a single autoregressive sequence model for both linker generation and scaffold decoration. Others: complex training and decoding schemes for scaffold-constrained generation (e.g. Ar\u00fas-Pous et al. 2020); complex sampling algorithms for scaffold-constrained generation (e.g. Langevin et al. 2020).
SAFE: SAFE strings are SMILES strings. Others: requires a different chemical language (e.g. Krenn et al. 2022)."},{"location":"tutorials/getting-started.html#using-safe","title":"Using SAFE\u00b6","text":"In the following, we highlight how to use SAFE and some of the properties of SAFE strings.
"},{"location":"tutorials/getting-started.html#encoding","title":"Encoding\u00b6","text":"SAFE represents fragments
SAFE represents molecules as a set of N fragments: [Fragment_1].[Fragment_i].[Fragment_N]
"},{"location":"tutorials/getting-started.html#decoding","title":"Decoding\u00b6","text":"Fragment order in SAFE does not matter
Each SAFE fragment
is a valid molecule itself; however, you need to use the decoder to recover molecules whose attachment points are not all fulfilled.
We provide a visualization module to display a SAFE string, with highlighting of all the fragments that compose it.
"},{"location":"tutorials/how-it-works.html","title":"How SAFE encoding works?","text":"In\u00a0[1]: Copied!import datamol as dm\n\nfrom rdkit import Chem\nfrom rdkit.Chem.Draw import rdDepictor\nfrom rdkit.Chem import rdChemReactions as rdr\nrdDepictor.SetPreferCoordGen(True)\nimport datamol as dm from rdkit import Chem from rdkit.Chem.Draw import rdDepictor from rdkit.Chem import rdChemReactions as rdr rdDepictor.SetPreferCoordGen(True) In\u00a0[2]: Copied!
smiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"]\nlegends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"]\ndm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True)\nsmiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"] legends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"] dm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True) Out[2]:
In the example above, we can see that anisole (methoxybenzene)
can be represented as two fragments that can be connected given proper attachment points.
To achieve this we are interested in attaching 2 fragments together (the methoxy
and the phenyl
groups). In RDKit, this can usually be achieved using chemical reactions. For convenience, we will prefer a standardized representation of attachment points that includes an atom mapping.
smiles = ['c1cc(*)ccc1.O(*)C', 'c1cc([*:1])ccc1.O([*:1])C']
dm.viz.to_image([dm.to_mol(x) for x in smiles], n_cols=len(smiles), use_svg=True)
To attach the two fragments, I can write a simple chemical transformation. Since SMARTS and SMILES syntax do not mix very well when it comes to *
I will assume an isotopic representation [1*]
instead of [*:1]
rxn = rdr.ReactionFromSmarts("[1*][*:1].[1*][*:2]>>[*:1][*:2]")
rxn

# replace atom map by isotopes
phenyl = "c1cc([*:1])ccc1".replace("[*:1]", "[1*]")
methoxy = "O([*:1])C".replace("[*:1]", "[1*]")

# run the reaction
prod = rxn.RunReactants((dm.to_mol(phenyl), dm.to_mol(methoxy)))
prod[0][0]
We can achieve the same result by using the RDKit API in a slightly more tedious way.
replacement_sub = Chem.MolFromSmarts("[1*]")
prod = Chem.ReplaceSubstructs(dm.to_mol(phenyl), replacement_sub, dm.to_mol(methoxy), replacementConnectionPoint=0)
prod = dm.remove_dummies(prod[0], dummy="[1*]")
prod

[11:14:08] WARNING: not removing hydrogen atom without neighbors
But wait, could we attach the fragments using only string operations on the SMILES?
It's not possible with a naive substring replacement, but recall that we just said numbers in SMILES represent connectivity points?
phenyl = \"c1cc([*:1])ccc1\"\nmethoxy = \"O([*:1])C\"\ncomposite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C\ncompo = dm.to_mol(composite)\nphenyl = \"c1cc([*:1])ccc1\" methoxy = \"O([*:1])C\" composite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C compo = dm.to_mol(composite)
Since the connectivity point 1 is already present in the phenyl group, we need to start by opening a new connectivity point: 2
attached_composite = composite.replace("[*:1]", "2")
dm.to_mol(attached_composite)

[11:14:10] SMILES Parse Error: syntax error while parsing: c1cc(2)ccc1.O(2)C
[11:14:10] SMILES Parse Error: Failed parsing SMILES 'c1cc(2)ccc1.O(2)C' for input: 'c1cc(2)ccc1.O(2)C'
The previous line does not work because it violates the SMILES syntax: we are not taking into account the branching brackets surrounding the attachment point.
We could try to regenerate the SMILES, or scan the sequence and remove the brackets where possible, but we want to limit the operations to str.replace
. So let's try again.
attached_composite = composite.replace("([*:1])", "2").replace("[*:1]", "2")
dm.to_image(attached_composite, legends=[attached_composite])
You can see that the anisole molecule is represented as two "fragments" [Fragment1].[Fragment2]
. That is what SAFE is about.
In summary, to build a SAFE string, we just need to follow the steps below:
The End !
"},{"location":"tutorials/how-it-works.html#how-safe-encoding-works","title":"How SAFE encoding works?\u00b6","text":"The intuition behind safe is quite simple: we want to represent any molecule as a set of connected fragments
.
Let's start first by revisiting some information about the SMILES syntax:
An asterisk *
in a SMILES is usually employed to indicate any atom OR an attachment point for any group. It's particularly useful for SMARTS matching.
Numbers in SMILES syntax indicate connectivity points between two atoms. Two-digit numbers need to be preceded by %
.
This is partially explained on the wikipedia ring section of SMILES.
.
in SMILES indicates the presence of additional fragments and is used to separate them. A good resource on the subject is the DAYLIGHT page.
We illustrate this information below!
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"Overview","text":"\ud83e\uddba SAFE Sequential Attachment-based Fragment Embedding (SAFE) is a novel molecular line notation that represents molecules as an unordered sequence of fragment blocks to improve molecule design using generative models.Paper | Docs | \ud83e\udd17 Model | \ud83e\udd17 Training Dataset
"},{"location":"index.html#overview-of-safe","title":"Overview of SAFE","text":"
SAFE is a deep-learning-friendly molecular representation. It's an encoding that leverages a peculiarity in the decoding schemes of SMILES to allow the representation of molecules as a contiguous sequence of connected fragments. SAFE strings are valid SMILES strings, and thus preserve the same amount of information. The intuitive representation of molecules as an ordered sequence of connected fragments greatly simplifies the following tasks often encountered in molecular design:
The construction of a SAFE string requires defining a molecular fragmentation algorithm. By default, we use BRICS, but any other fragmentation algorithm can be used. The image below illustrates the process of building a SAFE string. The resulting string is a valid SMILES that can be read by datamol or RDKit.
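As a small illustration of this flexibility, here is a minimal sketch (reusing the celecoxib example and the slicer argument shown in the tutorials) that encodes the same molecule with the default BRICS slicer and with a custom SMARTS pattern:

import safe as sf

celecoxib = "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1"

brics_safe = sf.encode(celecoxib)                          # default BRICS slicing
custom_safe = sf.encode(celecoxib, slicer=["[r]-;!@[r]"])  # acyclic single bonds between two ring atoms

print(brics_safe)
print(custom_safe)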
"},{"location":"index.html#news","title":"News \ud83d\ude80","text":""},{"location":"index.html#20240115","title":"\ud83d\udca5 2024/01/15 \ud83d\udca5","text":"You can install safe
using pip:
pip install safe-mol\n
You can use conda/mamba:
mamba install -c conda-forge safe-mol\n
"},{"location":"index.html#datasets-and-models","title":"Datasets and Models","text":"Type Name Infos Size Comment Model datamol-io/safe-gpt 87M params 350M Default model Training Dataset datamol-io/safe-gpt 1.1B rows 250GB Training dataset Drug Benchmark Dataset datamol-io/safe-drugs 26 rows 20 kB Benchmarking dataset"},{"location":"index.html#usage","title":"Usage","text":"The tutorials in the documentation can help you get started with safe
and SAFE-GPT
.
We summarize some key functions provided by the safe
package below.
safe.encode
Translates a SMILES string into its corresponding SAFE string. safe.decode
Translates a SAFE string into its corresponding SMILES string. The SAFE decoder just augments RDKit's Chem.MolFromSmiles
with an optional correction argument to take care of missing hydrogen bonds. safe.split
Tokenizes a SAFE string to build a generative model."},{"location":"index.html#examples","title":"Examples","text":""},{"location":"index.html#translation-between-safe-and-smiles-representations","title":"Translation between SAFE and SMILES representations","text":"import safe\n\nibuprofen = \"CC(Cc1ccc(cc1)C(C(=O)O)C)C\"\n\n# SMILES -> SAFE -> SMILES translation\ntry:\n ibuprofen_sf = safe.encode(ibuprofen) # c12ccc3cc1.C3(C)C(=O)O.CC(C)C2\n ibuprofen_smi = safe.decode(ibuprofen_sf, canonical=True) # CC(C)Cc1ccc(C(C)C(=O)O)cc1\nexcept safe.EncoderError:\n pass\nexcept safe.DecoderError:\n pass\n\nibuprofen_tokens = list(safe.split(ibuprofen_sf))\n
"},{"location":"index.html#trainingfinetuning-a-new-model","title":"Training/Finetuning a (new) model","text":"A command line interface is available to train a new model, please run safe-train --help
. You can also provide an existing checkpoint to continue training or finetune on your own dataset.
For example:
safe-train --config <path to config> \\\n --model-path <path to model> \\\n --tokenizer <path to tokenizer> \\\n --dataset <path to dataset> \\\n --num_labels 9 \\\n --torch_compile True \\\n --optim \"adamw_torch\" \\\n --learning_rate 1e-5 \\\n --prop_loss_coeff 1e-3 \\\n --gradient_accumulation_steps 1 \\\n --output_dir \"<path to outputdir>\" \\\n --max_steps 5\n
"},{"location":"index.html#references","title":"References","text":"If you use this repository, please cite the following related paper:
@misc{noutahi2023gotta,\n title={Gotta be SAFE: A New Framework for Molecular Design},\n author={Emmanuel Noutahi and Cristian Gabellini and Michael Craig and Jonathan S. C Lim and Prudencio Tossou},\n year={2023},\n eprint={2310.10773},\n archivePrefix={arXiv},\n primaryClass={cs.LG}\n}\n
"},{"location":"index.html#license","title":"License","text":"Note that all data and model weights of SAFE are exclusively licensed for research purposes. The accompanying dataset is licensed under CC BY 4.0, which permits solely non-commercial usage. See DATA_LICENSE for details.
This code base is licensed under the Apache-2.0 license. See LICENSE for details.
"},{"location":"index.html#development-lifecycle","title":"Development lifecycle","text":""},{"location":"index.html#setup-dev-environment","title":"Setup dev environment","text":"mamba create -n safe -f env.yml\nmamba activate safe\n\npip install --no-deps -e .\n
"},{"location":"index.html#tests","title":"Tests","text":"You can run tests locally with:
pytest\n
"},{"location":"cli.html","title":"CLI for model Training","text":"You can train a new SAFE
generative models using the provided CLI, which uses \ud83e\udd17 Transformers !
Usage:
safe-train [-h] [--model_path MODEL_PATH] [--config CONFIG] [--tokenizer TOKENIZER] [--num_labels NUM_LABELS]\n [--include_descriptors [INCLUDE_DESCRIPTORS]] [--no_include_descriptors] [--prop_loss_coeff PROP_LOSS_COEFF]\n [--wandb_project WANDB_PROJECT] [--wandb_watch {gradients,all}] [--cache_dir CACHE_DIR]\n [--torch_dtype {auto,bfloat16,float16,float32}] [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] [--model_max_length MODEL_MAX_LENGTH]\n [--dataset DATASET] [--is_tokenized [IS_TOKENIZED]] [--streaming [STREAMING]] [--text_column TEXT_COLUMN] --output_dir\n OUTPUT_DIR [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]\n [--do_predict [DO_PREDICT]] [--evaluation_strategy {no,steps,epoch}] [--prediction_loss_only [PREDICTION_LOSS_ONLY]]\n [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]\n [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]\n [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]\n [--eval_delay EVAL_DELAY] [--learning_rate LEARNING_RATE] [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]\n [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--num_train_epochs NUM_TRAIN_EPOCHS]\n [--max_steps MAX_STEPS]\n [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}]\n [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] [--log_level {debug,info,warning,error,critical,passive}]\n [--log_level_replica {debug,info,warning,error,critical,passive}] [--log_on_each_node [LOG_ON_EACH_NODE]]\n [--no_log_on_each_node] [--logging_dir LOGGING_DIR] [--logging_strategy {no,steps,epoch}]\n [--logging_first_step [LOGGING_FIRST_STEP]] [--logging_steps LOGGING_STEPS] [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]\n [--no_logging_nan_inf_filter] [--save_strategy {no,steps,epoch}] [--save_steps SAVE_STEPS] [--save_total_limit SAVE_TOTAL_LIMIT]\n [--save_safetensors [SAVE_SAFETENSORS]] [--save_on_each_node [SAVE_ON_EACH_NODE]] [--no_cuda [NO_CUDA]]\n [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]\n [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] [--fp16_opt_level FP16_OPT_LEVEL]\n [--half_precision_backend {auto,cuda_amp,apex,cpu_amp}] [--bf16_full_eval [BF16_FULL_EVAL]] [--fp16_full_eval [FP16_FULL_EVAL]]\n [--tf32 TF32] [--local_rank LOCAL_RANK] [--ddp_backend {nccl,gloo,mpi,ccl}] [--tpu_num_cores TPU_NUM_CORES]\n [--tpu_metrics_debug [TPU_METRICS_DEBUG]] [--debug DEBUG [DEBUG ...]] [--dataloader_drop_last [DATALOADER_DROP_LAST]]\n [--eval_steps EVAL_STEPS] [--dataloader_num_workers DATALOADER_NUM_WORKERS] [--past_index PAST_INDEX] [--run_name RUN_NAME]\n [--disable_tqdm DISABLE_TQDM] [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] [--no_remove_unused_columns]\n [--label_names LABEL_NAMES [LABEL_NAMES ...]] [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]\n [--metric_for_best_model METRIC_FOR_BEST_MODEL] [--greater_is_better GREATER_IS_BETTER] [--ignore_data_skip [IGNORE_DATA_SKIP]]\n [--sharded_ddp SHARDED_DDP] [--fsdp FSDP] [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] [--fsdp_config FSDP_CONFIG]\n [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] [--deepspeed DEEPSPEED]\n [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]\n [--optim 
{adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}]\n [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] [--group_by_length [GROUP_BY_LENGTH]]\n [--length_column_name LENGTH_COLUMN_NAME] [--report_to REPORT_TO [REPORT_TO ...]]\n [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]\n [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] [--no_dataloader_pin_memory]\n [--skip_memory_metrics [SKIP_MEMORY_METRICS]] [--no_skip_memory_metrics]\n [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] [--push_to_hub [PUSH_TO_HUB]]\n [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--hub_model_id HUB_MODEL_ID]\n [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] [--hub_token HUB_TOKEN] [--hub_private_repo [HUB_PRIVATE_REPO]]\n [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]\n [--fp16_backend {auto,cuda_amp,apex,cpu_amp}] [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]\n [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] [--push_to_hub_token PUSH_TO_HUB_TOKEN] [--mp_parameters MP_PARAMETERS]\n [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] [--full_determinism [FULL_DETERMINISM]] [--torchdynamo TORCHDYNAMO]\n [--ray_scope RAY_SCOPE] [--ddp_timeout DDP_TIMEOUT] [--torch_compile [TORCH_COMPILE]]\n [--torch_compile_backend TORCH_COMPILE_BACKEND] [--torch_compile_mode TORCH_COMPILE_MODE] [--xpu_backend {mpi,ccl,gloo}]\n
Options:
-h, --help show this help message and exit\n--model_path MODEL_PATH\n Optional model path or model name to use as a starting point for the safe model (default: None)\n--config CONFIG Path to the default config file to use for the safe model (default: None)\n--tokenizer TOKENIZER\n--num_labels NUM_LABELS\n Optional number of labels for the descriptors (default: None)\n--include_descriptors [INCLUDE_DESCRIPTORS]\n Whether to train with descriptors if they are available or Not (default: True)\n--no_include_descriptors\n Whether to train with descriptors if they are available or Not (default: False)\n--prop_loss_coeff PROP_LOSS_COEFF\n coefficient for the propery loss (default: 0.01)\n--wandb_project WANDB_PROJECT\n Name of the wandb project to use to log the SAFE model parameter (default: safe-gpt2)\n--wandb_watch {gradients,all}\n Whether to watch the wandb models or not (default: None)\n--cache_dir CACHE_DIR\n Where do you want to store the pretrained models downloaded from s3 (default: None)\n--torch_dtype {auto,bfloat16,float16,float32}\n Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the dtype will be\n automatically derived from the model's weights. (default: None)\n--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]\n It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights\n are loaded.set True will benefit LLM loading time and RAM consumption. Only valid when loading a pretrained model\n (default: False)\n--model_max_length MODEL_MAX_LENGTH\n Maximum sequence length. Sequences will be right padded (and possibly truncated) up to that value. (default: 1024)\n--dataset DATASET Path to the preprocessed dataset to use for the safe model building (default: None)\n--is_tokenized [IS_TOKENIZED]\n whether the dataset submitted as input is already tokenized or not (default: False)\n--streaming [STREAMING]\n Whether to use a streaming dataset or not (default: False)\n--text_column TEXT_COLUMN\n Column containing text data to process. (default: inputs)\n--output_dir OUTPUT_DIR\n The output directory where the model predictions and checkpoints will be written. (default: None)\n--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]\n Overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint\n directory. (default: False)\n--do_train [DO_TRAIN]\n Whether to run training. (default: False)\n--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)\n--do_predict [DO_PREDICT]\n Whether to run predictions on the test set. (default: False)\n--evaluation_strategy {no,steps,epoch}\n The evaluation strategy to use. (default: no)\n--prediction_loss_only [PREDICTION_LOSS_ONLY]\n When performing evaluation and predictions, only returns the loss. (default: False)\n--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for training. (default: 8)\n--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE\n Batch size per GPU/TPU core/CPU for evaluation. (default: 8)\n--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE\n Deprecated, the use of `--per_device_train_batch_size` is preferred. Batch size per GPU/TPU core/CPU for training.\n (default: None)\n--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE\n Deprecated, the use of `--per_device_eval_batch_size` is preferred. 
Batch size per GPU/TPU core/CPU for evaluation.\n (default: None)\n--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS\n Number of updates steps to accumulate before performing a backward/update pass. (default: 1)\n--eval_accumulation_steps EVAL_ACCUMULATION_STEPS\n Number of predictions steps to accumulate before moving the tensors to the CPU. (default: None)\n--eval_delay EVAL_DELAY\n Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy.\n (default: 0)\n--learning_rate LEARNING_RATE\n The initial learning rate for AdamW. (default: 5e-05)\n--weight_decay WEIGHT_DECAY\n Weight decay for AdamW if we apply some. (default: 0.0)\n--adam_beta1 ADAM_BETA1\n Beta1 for AdamW optimizer (default: 0.9)\n--adam_beta2 ADAM_BETA2\n Beta2 for AdamW optimizer (default: 0.999)\n--adam_epsilon ADAM_EPSILON\n Epsilon for AdamW optimizer. (default: 1e-08)\n--max_grad_norm MAX_GRAD_NORM\n Max gradient norm. (default: 1.0)\n--num_train_epochs NUM_TRAIN_EPOCHS\n Total number of training epochs to perform. (default: 3.0)\n--max_steps MAX_STEPS\n If > 0: set total number of training steps to perform. Override num_train_epochs. (default: -1)\n--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau}\n The scheduler type to use. (default: linear)\n--warmup_ratio WARMUP_RATIO\n Linear warmup over warmup_ratio fraction of total steps. (default: 0.0)\n--warmup_steps WARMUP_STEPS\n Linear warmup over warmup_steps. (default: 0)\n--log_level {debug,info,warning,error,critical,passive}\n Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning',\n 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults\n to 'passive'. (default: passive)\n--log_level_replica {debug,info,warning,error,critical,passive}\n Logger log level to use on replica nodes. Same choices and defaults as ``log_level`` (default: warning)\n--log_on_each_node [LOG_ON_EACH_NODE]\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: True)\n--no_log_on_each_node\n When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: False)\n--logging_dir LOGGING_DIR\n Tensorboard log dir. (default: None)\n--logging_strategy {no,steps,epoch}\n The logging strategy to use. (default: steps)\n--logging_first_step [LOGGING_FIRST_STEP]\n Log the first global_step (default: False)\n--logging_steps LOGGING_STEPS\n Log every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted as\n ratio of total training steps. (default: 500)\n--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]\n Filter nan and inf losses for logging. (default: True)\n--no_logging_nan_inf_filter\n Filter nan and inf losses for logging. (default: False)\n--save_strategy {no,steps,epoch}\n The checkpoint save strategy to use. (default: steps)\n--save_steps SAVE_STEPS\n Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`.If smaller than 1, will be\n interpreted as ratio of total training steps. (default: 500)\n--save_total_limit SAVE_TOTAL_LIMIT\n If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. 
When\n `load_best_model_at_end` is enabled, the 'best' checkpoint according to `metric_for_best_model` will always be retained in\n addition to the most recent ones. For example, for `save_total_limit=5` and `load_best_model_at_end=True`, the four last\n checkpoints will always be retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,\n it is possible that two checkpoints are saved: the last one and the best one (if they are different). Default is unlimited\n checkpoints (default: None)\n--save_safetensors [SAVE_SAFETENSORS]\n Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: False)\n--save_on_each_node [SAVE_ON_EACH_NODE]\n When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one\n (default: False)\n--no_cuda [NO_CUDA] Do not use CUDA even when it is available (default: False)\n--use_mps_device [USE_MPS_DEVICE]\n This argument is deprecated. `mps` device will be used if available similar to `cuda` device. It will be removed in\n version 5.0 of \ud83e\udd17 Transformers (default: False)\n--seed SEED Random seed that will be set at the beginning of training. (default: 42)\n--data_seed DATA_SEED\n Random seed to be used with data samplers. (default: None)\n--jit_mode_eval [JIT_MODE_EVAL]\n Whether or not to use PyTorch jit trace for inference (default: False)\n--use_ipex [USE_IPEX]\n Use Intel extension for PyTorch when it is available, installation: 'https://github.com/intel/intel-extension-for-pytorch'\n (default: False)\n--bf16 [BF16] Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or using CPU\n (no_cuda). This is an experimental API and it may change. (default: False)\n--fp16 [FP16] Whether to use fp16 (mixed) precision instead of 32-bit (default: False)\n--fp16_opt_level FP16_OPT_LEVEL\n For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at\n https://nvidia.github.io/apex/amp.html (default: O1)\n--half_precision_backend {auto,cuda_amp,apex,cpu_amp}\n The backend to be used for half precision. (default: auto)\n--bf16_full_eval [BF16_FULL_EVAL]\n Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may change. (default: False)\n--fp16_full_eval [FP16_FULL_EVAL]\n Whether to use full float16 evaluation instead of 32-bit (default: False)\n--tf32 TF32 Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API and it may\n change. (default: None)\n--local_rank LOCAL_RANK\n For distributed training: local_rank (default: -1)\n--ddp_backend {nccl,gloo,mpi,ccl}\n The backend to be used for distributed training (default: None)\n--tpu_num_cores TPU_NUM_CORES\n TPU: Number of TPU cores (automatically passed by launcher script) (default: None)\n--tpu_metrics_debug [TPU_METRICS_DEBUG]\n Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics (default: False)\n--debug DEBUG [DEBUG ...]\n Whether or not to enable debug mode. Current options: `underflow_overflow` (Detect underflow and overflow in activations\n and weights), `tpu_metrics_debug` (print debug metrics on TPU). (default: None)\n--dataloader_drop_last [DATALOADER_DROP_LAST]\n Drop the last incomplete batch if it is not divisible by the batch size. (default: False)\n--eval_steps EVAL_STEPS\n Run an evaluation every X steps. 
Should be an integer or a float in range `[0,1)`.If smaller than 1, will be interpreted\n as ratio of total training steps. (default: None)\n--dataloader_num_workers DATALOADER_NUM_WORKERS\n Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process.\n (default: 0)\n--past_index PAST_INDEX\n If >=0, uses the corresponding part of the output as the past state for next step. (default: -1)\n--run_name RUN_NAME An optional descriptor for the run. Notably used for wandb logging. (default: None)\n--disable_tqdm DISABLE_TQDM\n Whether or not to disable the tqdm progress bars. (default: None)\n--remove_unused_columns [REMOVE_UNUSED_COLUMNS]\n Remove columns not required by the model when using an nlp.Dataset. (default: True)\n--no_remove_unused_columns\n Remove columns not required by the model when using an nlp.Dataset. (default: False)\n--label_names LABEL_NAMES [LABEL_NAMES ...]\n The list of keys in your dictionary of inputs that correspond to the labels. (default: None)\n--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]\n Whether or not to load the best model found during training at the end of training. When this option is enabled, the best\n checkpoint will always be saved. See `save_total_limit` for more. (default: False)\n--metric_for_best_model METRIC_FOR_BEST_MODEL\n The metric to use to compare two different models. (default: None)\n--greater_is_better GREATER_IS_BETTER\n Whether the `metric_for_best_model` should be maximized or not. (default: None)\n--ignore_data_skip [IGNORE_DATA_SKIP]\n When resuming training, whether or not to skip the first epochs and batches to get to the same training data. (default:\n False)\n--sharded_ddp SHARDED_DDP\n Whether or not to use sharded DDP training (in distributed training only). The base option should be `simple`, `zero_dp_2`\n or `zero_dp_3` and you can add CPU-offload to `zero_dp_2` or `zero_dp_3` like this: zero_dp_2 offload` or `zero_dp_3\n offload`. You can add auto-wrap to `zero_dp_2` or `zero_dp_3` with the same syntax: zero_dp_2 auto_wrap` or `zero_dp_3\n auto_wrap`. (default: )\n--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training only). The base option\n should be `full_shard`, `shard_grad_op` or `no_shard` and you can add CPU-offload to `full_shard` or `shard_grad_op` like\n this: full_shard offload` or `shard_grad_op offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the\n same syntax: full_shard auto_wrap` or `shard_grad_op auto_wrap`. (default: )\n--fsdp_min_num_params FSDP_MIN_NUM_PARAMS\n This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp`\n field is passed). (default: 0)\n--fsdp_config FSDP_CONFIG\n Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either afsdp json config file (e.g.,\n `fsdp_config.json`) or an already loaded json file as `dict`. (default: None)\n--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP\n This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`,\n `T5Block` .... (useful only when `fsdp` flag is passed). (default: None)\n--deepspeed DEEPSPEED\n Enable deepspeed and pass the path to deepspeed json config file (e.g. 
ds_config.json) or an already loaded json file as a\n dict (default: None)\n--label_smoothing_factor LABEL_SMOOTHING_FACTOR\n The label smoothing epsilon to apply (zero means no label smoothing). (default: 0.0)\n--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit}\n The optimizer to use. (default: adamw_hf)\n--optim_args OPTIM_ARGS\n Optional arguments to supply to optimizer. (default: None)\n--adafactor [ADAFACTOR]\n Whether or not to replace AdamW by Adafactor. (default: False)\n--group_by_length [GROUP_BY_LENGTH]\n Whether or not to group samples of roughly the same length together when batching. (default: False)\n--length_column_name LENGTH_COLUMN_NAME\n Column name with precomputed lengths to use when grouping by length. (default: length)\n--report_to REPORT_TO [REPORT_TO ...]\n The list of integrations to report the results and logs to. (default: None)\n--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS\n When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`.\n (default: None)\n--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB\n When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. (default:\n None)\n--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS\n When using distributed training, the value of the flag `broadcast_buffers` passed to `DistributedDataParallel`. (default:\n None)\n--dataloader_pin_memory [DATALOADER_PIN_MEMORY]\n Whether or not to pin memory for DataLoader. (default: True)\n--no_dataloader_pin_memory\n Whether or not to pin memory for DataLoader. (default: False)\n--skip_memory_metrics [SKIP_MEMORY_METRICS]\n Whether or not to skip adding of memory profiler reports to metrics. (default: True)\n--no_skip_memory_metrics\n Whether or not to skip adding of memory profiler reports to metrics. (default: False)\n--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]\n Whether or not to use the legacy prediction_loop in the Trainer. (default: False)\n--push_to_hub [PUSH_TO_HUB]\n Whether or not to upload the trained model to the model hub after training. (default: False)\n--resume_from_checkpoint RESUME_FROM_CHECKPOINT\n The path to a folder with a valid checkpoint for your model. (default: None)\n--hub_model_id HUB_MODEL_ID\n The name of the repository to keep in sync with the local `output_dir`. (default: None)\n--hub_strategy {end,every_save,checkpoint,all_checkpoints}\n The hub strategy to use when `--push_to_hub` is activated. (default: every_save)\n--hub_token HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--hub_private_repo [HUB_PRIVATE_REPO]\n Whether the model repository is private or not. (default: False)\n--gradient_checkpointing [GRADIENT_CHECKPOINTING]\n If True, use gradient checkpointing to save memory at the expense of slower backward pass. (default: False)\n--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]\n Whether or not the inputs will be passed to the `compute_metrics` function. (default: False)\n--fp16_backend {auto,cuda_amp,apex,cpu_amp}\n Deprecated. Use half_precision_backend instead (default: auto)\n--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID\n The name of the repository to which push the `Trainer`. 
(default: None)\n--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION\n The name of the organization in with to which push the `Trainer`. (default: None)\n--push_to_hub_token PUSH_TO_HUB_TOKEN\n The token to use to push to the Model Hub. (default: None)\n--mp_parameters MP_PARAMETERS\n Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer (default: )\n--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]\n Whether to automatically decrease the batch size in half and rerun the training loop again each time a CUDA Out-of-Memory\n was reached (default: False)\n--full_determinism [FULL_DETERMINISM]\n Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training. Important: this\n will negatively impact the performance, so only use it for debugging. (default: False)\n--torchdynamo TORCHDYNAMO\n This argument is deprecated, use `--torch_compile_backend` instead. (default: None)\n--ray_scope RAY_SCOPE\n The scope to use when doing hyperparameter search with Ray. By default, `\"last\"` will be used. Ray will then use the last\n checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray\n documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for\n more options. (default: last)\n--ddp_timeout DDP_TIMEOUT\n Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)\n--torch_compile [TORCH_COMPILE]\n If set to `True`, the model will be wrapped in `torch.compile`. (default: False)\n--torch_compile_backend TORCH_COMPILE_BACKEND\n Which backend to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--torch_compile_mode TORCH_COMPILE_MODE\n Which mode to use with `torch.compile`, passing one will trigger a model compilation. (default: None)\n--xpu_backend {mpi,ccl,gloo}\n The backend to be used for distributed training on Intel XPU. (default: None)\n
"},{"location":"data_license.html","title":"Data License","text":"# Creative Commons Attribution 4.0 International License (CC BY 4.0)\n\nThis work is licensed under the Creative Commons Attribution 4.0 International License.\n\nTo view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.\n
"},{"location":"license.html","title":"License","text":"Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n 1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n 2. Grant of Copyright License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n 3. Grant of Patent License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n 4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n 5. Submission of Contributions. 
Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n 6. Trademarks. This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n 7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n 8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n 9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\n END OF TERMS AND CONDITIONS\n\n APPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. 
We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\n Copyright 2023 Emmanuel Noutahi\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n
"},{"location":"api/safe.html","title":"SAFE","text":""},{"location":"api/safe.html#safe-encoder-decoder","title":"SAFE Encoder-Decoder","text":""},{"location":"api/safe.html#safe.converter.SAFEConverter","title":"SAFEConverter
","text":"Molecule line notation conversion from SMILES to SAFE
A SAFE representation is a string-based representation of a molecule decomposition into fragment components, separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by itself, unless explicitly corrected to add missing hydrogens.
Slicing algorithms
By default SAFE strings are generated using BRICS
, however, the following alternatives are supported:
Hussain-Rea (hr)
RECAP (recap)
RDKit's MMPA (mmpa)
Any possible attachment points (attach)
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atoms corresponding to the bonds to break.
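As a minimal usage sketch (based on the constructor and the encoder/decoder methods shown in the source listing below), a SAFEConverter can be instantiated with a named slicer, a list of SMARTS, or a custom callable:

from safe.converter import SAFEConverter

import datamol as dm

converter = SAFEConverter(slicer="brics")  # or a list of SMARTS / a custom callable returning atom pairs

celecoxib = "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1"
safe_str = converter.encoder(celecoxib)    # SMILES -> SAFE
smiles_back = converter.decoder(safe_str)  # SAFE -> SMILES

assert dm.same_mol(dm.to_mol(celecoxib), dm.to_mol(smiles_back))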
Source code in safe/converter.py
class SAFEConverter:\n \"\"\"Molecule line notation conversion from SMILES to SAFE\n\n A SAFE representation is a string based representation of a molecule decomposition into fragment components,\n separated by a dot ('.'). Note that each component (fragment) might not be a valid molecule by themselves,\n unless explicitely correct to add missing hydrogens.\n\n !!! note \"Slicing algorithms\"\n\n By default SAFE strings are generated using `BRICS`, however, the following alternative are supported:\n\n * [Hussain-Rea (`hr`)](https://pubs.acs.org/doi/10.1021/ci900450m)\n * [RECAP (`recap`)](https://pubmed.ncbi.nlm.nih.gov/9611787/)\n * [RDKit's MMPA (`mmpa`)](https://www.rdkit.org/docs/source/rdkit.Chem.rdMMPA.html)\n * Any possible attachment points (`attach`)\n\n Furthermore, you can also provide your own slicing algorithm, which should return a pair of atoms\n corresponding to the bonds to break.\n\n \"\"\"\n\n SUPPORTED_SLICERS = [\"hr\", \"rotatable\", \"recap\", \"mmpa\", \"attach\", \"brics\"]\n __SLICE_SMARTS = {\n \"hr\": [\"[*]!@-[*]\"], # any non ring single bond\n \"recap\": [\n \"[$([C;!$(C([#7])[#7])](=!@[O]))]!@[$([#7;+0;!D1])]\",\n \"[$(C=!@O)]!@[$([O;+0])]\",\n \"[$([N;!D1;+0;!$(N-C=[#7,#8,#15,#16])](-!@[*]))]-!@[$([*])]\",\n \"[$(C(=!@O)([#7;+0;D2,D3])!@[#7;+0;D2,D3])]!@[$([#7;+0;D2,D3])]\",\n \"[$([O;+0](-!@[#6!$(C=O)])-!@[#6!$(C=O)])]-!@[$([#6!$(C=O)])]\",\n \"C=!@C\",\n \"[N;+1;D4]!@[#6]\",\n \"[$([n;+0])]-!@C\",\n \"[$([O]=[C]-@[N;+0])]-!@[$([C])]\",\n \"c-!@c\",\n \"[$([#7;+0;D2,D3])]-!@[$([S](=[O])=[O])]\",\n ],\n \"mmpa\": [\"[#6+0;!$(*=,#[!#6])]!@!=!#[*]\"], # classical mmpa slicing smarts\n \"attach\": [\"[*]!@[*]\"], # any potential attachment point, including hydrogens when explicit\n \"rotatable\": [\"[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]\"],\n }\n\n def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n ):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n\n @staticmethod\n def randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = 
np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n\n @classmethod\n def _find_branch_number(cls, inp: str):\n \"\"\"Find the branch number and ring closure in the SMILES representation using regexp\n\n Args:\n inp: input smiles\n \"\"\"\n inp = re.sub(r\"\\[.*?\\]\", \"\", inp) # noqa\n matching_groups = re.findall(r\"((?<=%)\\d{2})|((?<!%)\\d+)(?![^\\[]*\\])\", inp)\n # first match is for multiple connection as multiple digits\n # second match is for single connections requiring 2 digits\n # SMILES does not support triple digits\n branch_numbers = []\n for m in matching_groups:\n if m[0] == \"\":\n branch_numbers.extend(int(mm) for mm in m[1])\n elif m[1] == \"\":\n branch_numbers.append(int(m[0].replace(\"%\", \"\")))\n return branch_numbers\n\n def _ensure_valid(self, inp: str):\n \"\"\"Ensure that the input SAFE string is valid by fixing the missing attachment points\n\n Args:\n inp: input SAFE string\n\n \"\"\"\n missing_tokens = [inp]\n branch_numbers = self._find_branch_number(inp)\n # only use the set that have exactly 1 element\n # any branch number that is not pairwise should receive a dummy atom to complete the attachment point\n branch_numbers = Counter(branch_numbers)\n for i, (bnum, bcount) in enumerate(branch_numbers.items()):\n if bcount % 2 != 0:\n bnum_str = str(bnum) if bnum < 10 else f\"%{bnum}\"\n _tk = f\"[*:{i+1}]{bnum_str}\"\n if self.use_original_opener_for_attach:\n bnum_digit = bnum_str.strip(\"%\") # strip out the % sign\n _tk = f\"[*:{bnum_digit}]{bnum_str}\"\n missing_tokens.append(_tk)\n return \".\".join(missing_tokens)\n\n def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n ):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. 
Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n\n def _fragment(self, mol: dm.Mol, allow_empty: bool = False):\n \"\"\"\n Perform bond cutting in place for the input molecule, given the slicing algorithm\n\n Args:\n mol: input molecule to split\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n Raises:\n SAFEFragmentationError: if the slicing algorithm return empty bonds\n \"\"\"\n\n if self.slicer is None:\n matching_bonds = []\n\n elif callable(self.slicer):\n matching_bonds = self.slicer(mol)\n matching_bonds = list(matching_bonds)\n\n elif self.slicer == \"brics\":\n matching_bonds = BRICS.FindBRICSBonds(mol)\n matching_bonds = [brics_match[0] for brics_match in matching_bonds]\n\n else:\n matches = set()\n for smarts in self.slicer:\n matches |= {\n tuple(sorted(match)) for match in mol.GetSubstructMatches(smarts, uniquify=True)\n }\n matching_bonds = list(matches)\n\n if matching_bonds is None or len(matching_bonds) == 0 and not allow_empty:\n raise SAFEFragmentationError(\n \"Slicing algorithms did not return any bonds that can be cut !\"\n )\n return matching_bonds or []\n\n def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n ):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. 
Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = 
re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.__init__","title":"__init__(slicer='brics', require_hs=None, use_original_opener_for_attach=True, ignore_stereo=False)
","text":"Constructor for the SAFE converter
Parameters:
Name Type Description Defaultslicer
Optional[Union[str, List[str], Callable]]
slicer algorithm to use for encoding. Can either be one of the supported slicing algorithms (SUPPORTED_SLICERS) or a custom callable that returns the ids of the bonds that can be sliced.
'brics'
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added. The attach
slicer requires adding hydrogens.
None
use_original_opener_for_attach
bool
whether to use the original branch opener digit when adding back mapping number to attachment points, or use simple enumeration.
True
ignore_stereo
bool
whether to ignore stereochemistry. RDKit does not support some SAFE strings when stereochemistry is defined.
False
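As an illustration of the constructor options above, this is a small sketch (not part of the original documentation) showing the kinds of `slicer` values that are accepted:

```python
import safe as sf

# Default converter: BRICS slicing.
conv = sf.SAFEConverter()

# Built-in alternative slicer; `attach` implicitly sets require_hs=True.
conv_attach = sf.SAFEConverter(slicer="attach")

# Custom SMARTS patterns: here, any non-ring single bond (same pattern as `hr`).
conv_smarts = sf.SAFEConverter(slicer=["[*]!@-[*]"])

# A callable returning pairs of atom indices (the bonds to break) is also accepted.
```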
Source code in safe/converter.py
def __init__(\n self,\n slicer: Optional[Union[str, List[str], Callable]] = \"brics\",\n require_hs: Optional[bool] = None,\n use_original_opener_for_attach: bool = True,\n ignore_stereo: bool = False,\n):\n \"\"\"Constructor for the SAFE converter\n\n Args:\n slicer: slicer algorithm to use for encoding.\n Can either be one of the supported slicing algorithm (SUPPORTED_SLICERS)\n or a custom callable that returns the bond ids that can be sliced.\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n `attach` slicer requires adding hydrogens.\n use_original_opener_for_attach: whether to use the original branch opener digit when adding back\n mapping number to attachment points, or use simple enumeration.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n\n \"\"\"\n self.slicer = slicer\n if isinstance(slicer, str) and slicer.lower() in self.SUPPORTED_SLICERS:\n self.slicer = self.__SLICE_SMARTS.get(slicer.lower(), slicer)\n if self.slicer != \"brics\" and isinstance(self.slicer, str):\n self.slicer = [self.slicer]\n if isinstance(self.slicer, (list, tuple)):\n self.slicer = [dm.from_smarts(x) for x in self.slicer]\n if any(x is None for x in self.slicer):\n raise ValueError(f\"Slicer: {slicer} cannot be valid\")\n self.require_hs = require_hs or (slicer == \"attach\")\n self.use_original_opener_for_attach = use_original_opener_for_attach\n self.ignore_stereo = ignore_stereo\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.decoder","title":"decoder(inp, as_mol=False, canonical=False, fix=True, remove_dummies=True, remove_added_hs=True)
","text":"Convert input SAFE representation to smiles
Parameters:
Name Type Description Defaultinp
str
input SAFE representation to decode as a valid molecule or smiles
requiredas_mol
bool
whether to return a molecule object or a smiles string
False
canonical
bool
whether to return a canonical SMILES string
False
fix
bool
whether to fix the SAFE representation to take into account non-connected attachment points
True
remove_dummies
bool
whether to remove dummy atoms from the SAFE representation. Note that remove_dummies is incompatible with
True
remove_added_hs
bool
whether to remove all the added hydrogen atoms after applying dummy removal for recovery
True
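A short usage sketch for `decoder` (the aspirin SMILES and the intermediate `encoder` call are only illustrative assumptions):

```python
import safe as sf

conv = sf.SAFEConverter()

# Build a SAFE string first; aspirin is an arbitrary, BRICS-sliceable example.
safe_str = conv.encoder("O=C(C)Oc1ccccc1C(=O)O")

# Decode back either as a SMILES string or as an RDKit molecule object.
smiles = conv.decoder(safe_str, canonical=True)
mol = conv.decoder(safe_str, as_mol=True)
```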
Source code in safe/converter.py
def decoder(\n self,\n inp: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_dummies: bool = True,\n remove_added_hs: bool = True,\n):\n \"\"\"Convert input SAFE representation to smiles\n\n Args:\n inp: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_dummies: whether to remove dummy atoms from the SAFE representation. Note that removing_dummies is incompatible with\n remove_added_hs: whether to remove all the added hydrogen atoms after applying dummy removal for recovery\n \"\"\"\n\n if fix:\n inp = self._ensure_valid(inp)\n mol = dm.to_mol(inp)\n if remove_dummies:\n with suppress(Exception):\n du = dm.from_smarts(\"[$([#0]!-!:*);$([#0;D1])]\")\n out = Chem.ReplaceSubstructs(mol, du, dm.to_mol(\"C\"), True)[0]\n mol = dm.remove_dummies(out)\n if as_mol:\n if remove_added_hs:\n mol = dm.remove_hs(mol, update_explicit_count=True)\n if canonical:\n mol = dm.standardize_mol(mol)\n mol = dm.canonical_tautomer(mol)\n return mol\n out = dm.to_smiles(mol, canonical=canonical, explicit_hs=(not remove_added_hs))\n if canonical:\n out = dm.standardize_smiles(out)\n return out\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.encoder","title":"encoder(inp, canonical=True, randomize=False, seed=None, constraints=None, allow_empty=False, rdkit_safe=True)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Defaultinp
Union[str, Mol]
input smiles
requiredcanonical
bool
whether to return canonical smiles string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the SAFE string encoding. This is ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when randomization of the SAFE encoding is enabled. Randomization happens in two steps: 1. at the original SMILES representation, by randomizing the atom order; 2. at the SAFE conversion, by randomizing the fragment order
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction. Bond slicing will only happen outside of substructures matching one of these patterns.
None
allow_empty
bool
whether to allow the slicing algorithm to return empty bonds
False
rdkit_safe
bool
whether to apply RDKit-compatible digit standardization to the output SAFE string.
True
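The following sketch shows canonical versus randomized encoding, plus a substructure constraint (the aspirin input and the benzene SMARTS are illustrative assumptions, not part of the original docs):

```python
import datamol as dm
import safe as sf

conv = sf.SAFEConverter(slicer="brics")
smiles = "O=C(C)Oc1ccccc1C(=O)O"  # arbitrary example molecule (aspirin)

# Canonical SAFE string.
safe_canonical = conv.encoder(smiles, canonical=True)

# Randomized SAFE string; bonds whose two atoms both fall inside the benzene
# substructure are preserved (never sliced).
safe_random = conv.encoder(
    smiles,
    canonical=False,
    randomize=True,
    seed=42,
    constraints=[dm.from_smarts("c1ccccc1")],
)
```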
Source code in safe/converter.py
def encoder(\n self,\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n constraints: Optional[List[dm.Mol]] = None,\n allow_empty: bool = False,\n rdkit_safe: bool = True,\n):\n \"\"\"Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical smiles string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n Randomization happens at two steps:\n 1. at the original smiles representation by randomization the atoms.\n 2. at the SAFE conversion by randomizing fragment orders\n constraints: List of molecules or pattern to preserve during the SAFE construction. Any bond slicing would\n happen outside of a substructure matching one of the patterns.\n allow_empty: whether to allow the slicing algorithm to return empty bonds\n rdkit_safe: whether to apply rdkit-safe digit standardization to the output SAFE string.\n \"\"\"\n rng = None\n if randomize:\n rng = np.random.default_rng(seed)\n if not canonical:\n inp = dm.to_mol(inp, remove_hs=False)\n inp = self.randomize(inp, rng)\n\n if isinstance(inp, dm.Mol):\n inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)\n\n # EN: we first normalize the attachment if the molecule is a query:\n # inp = dm.reactions.convert_attach_to_isotope(inp, as_smiles=True)\n\n # TODO(maclandrol): RDKit supports some extended form of ring closure, up to 5 digits\n # https://www.rdkit.org/docs/RDKit_Book.html#ring-closures and I should try to include them\n branch_numbers = self._find_branch_number(inp)\n\n mol = dm.to_mol(inp, remove_hs=False)\n potential_stereos = Chem.FindPotentialStereo(mol)\n has_stereo_bonds = any(x.type == Chem.StereoType.Bond_Double for x in potential_stereos)\n if self.ignore_stereo:\n mol = dm.remove_stereochemistry(mol)\n\n bond_map_id = 1\n for atom in mol.GetAtoms():\n if atom.GetAtomicNum() == 0:\n atom.SetAtomMapNum(0)\n atom.SetIsotope(bond_map_id)\n bond_map_id += 1\n\n if self.require_hs:\n mol = dm.add_hs(mol)\n matching_bonds = self._fragment(mol, allow_empty=allow_empty)\n substructed_ignored = []\n if constraints is not None:\n substructed_ignored = list(\n itertools.chain(\n *[\n mol.GetSubstructMatches(constraint, uniquify=True)\n for constraint in constraints\n ]\n )\n )\n\n bonds = []\n for i_a, i_b in matching_bonds:\n # if both atoms of the bond are found in a disallowed substructure, we cannot consider them\n # on the other end, a bond between two substructure to preserved independently is perfectly fine\n if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):\n continue\n obond = mol.GetBondBetweenAtoms(i_a, i_b)\n bonds.append(obond.GetIdx())\n\n if len(bonds) > 0:\n mol = Chem.FragmentOnBonds(\n mol,\n bonds,\n dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],\n )\n # here we need to be clever and disable rooted atom as the atom with mapping\n\n frags = list(Chem.GetMolFrags(mol, asMols=True))\n if randomize:\n frags = rng.permutation(frags).tolist()\n elif canonical:\n frags = sorted(\n frags,\n key=lambda x: x.GetNumAtoms(),\n reverse=True,\n )\n\n frags_str = []\n for frag in frags:\n non_map_atom_idxs = [\n atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0\n ]\n frags_str.append(\n Chem.MolToSmiles(\n frag,\n isomericSmiles=True,\n canonical=True, # 
needs to always be true\n rootedAtAtom=non_map_atom_idxs[0],\n )\n )\n\n scaffold_str = \".\".join(frags_str)\n # EN: fix for https://github.com/datamol-io/safe/issues/37\n # we were using the wrong branch number count which did not take into account\n # possible change in digit utilization after bond slicing\n scf_branch_num = self._find_branch_number(scaffold_str) + branch_numbers\n\n # don't capture atom mapping in the scaffold\n attach_pos = set(re.findall(r\"(\\[\\d+\\*\\]|!\\[[^:]*:\\d+\\])\", scaffold_str))\n if canonical:\n attach_pos = sorted(attach_pos)\n starting_num = 1 if len(scf_branch_num) == 0 else max(scf_branch_num) + 1\n for attach in attach_pos:\n val = str(starting_num) if starting_num < 10 else f\"%{starting_num}\"\n # we cannot have anything of the form \"\\([@=-#-$/\\]*\\d+\\)\"\n attach_regexp = re.compile(r\"(\" + re.escape(attach) + r\")\")\n scaffold_str = attach_regexp.sub(val, scaffold_str)\n starting_num += 1\n\n # now we need to remove all the parenthesis around digit only number\n wrong_attach = re.compile(r\"\\(([\\%\\d]*)\\)\")\n scaffold_str = wrong_attach.sub(r\"\\g<1>\", scaffold_str)\n # furthermore, we autoapply rdkit-compatible digit standardization.\n if rdkit_safe:\n pattern = r\"\\(([=-@#\\/\\\\]{0,2})(%?\\d{1,2})\\)\"\n replacement = r\"\\g<1>\\g<2>\"\n scaffold_str = re.sub(pattern, replacement, scaffold_str)\n if not self.ignore_stereo and has_stereo_bonds and not dm.same_mol(scaffold_str, inp):\n logger.warning(\n \"Ignoring stereo is disabled, but molecule has stereochemistry interferring with SAFE representation\"\n )\n return scaffold_str\n
"},{"location":"api/safe.html#safe.converter.SAFEConverter.randomize","title":"randomize(mol, rng=None)
staticmethod
","text":"Randomize the position of the atoms in a mol.
Parameters:
Name Type Description Defaultmol
Mol
molecule to randomize
requiredrng
Optional[int]
optional seed to use
None
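A minimal sketch of `randomize` usage (the input SMILES is an arbitrary example). An integer seed is converted internally to a NumPy `Generator`, and an existing `Generator` can be passed directly:

```python
import datamol as dm
import safe as sf

mol = dm.to_mol("CCOc1ccccc1")  # arbitrary example molecule

# Shuffle the atom ordering reproducibly with an integer seed.
shuffled = sf.SAFEConverter.randomize(mol, rng=42)
```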
Source code in safe/converter.py
@staticmethod\ndef randomize(mol: dm.Mol, rng: Optional[int] = None):\n \"\"\"Randomize the position of the atoms in a mol.\n\n Args:\n mol: molecules to randomize\n rng: optional seed to use\n \"\"\"\n if isinstance(rng, int):\n rng = np.random.default_rng(rng)\n if mol.GetNumAtoms() == 0:\n return mol\n atom_indices = list(range(mol.GetNumAtoms()))\n atom_indices = rng.permutation(atom_indices).tolist()\n return Chem.RenumberAtoms(mol, atom_indices)\n
"},{"location":"api/safe.html#safe.converter.encode","title":"encode(inp, canonical=True, randomize=False, seed=None, slicer=None, require_hs=None, constraints=None, ignore_stereo=False)
","text":"Convert input smiles to SAFE representation
Parameters:
Name Type Description Defaultinp
Union[str, Mol]
input smiles
requiredcanonical
bool
whether to return a canonical SAFE string. Defaults to True
True
randomize
Optional[bool]
whether to randomize the SAFE string encoding. This is ignored if canonical is provided
False
seed
Optional[int]
optional seed to use when allowing randomization of the SAFE encoding.
None
slicer
Optional[Union[List[str], str, Callable]]
slicer algorithm to use for encoding. Defaults to \"brics\".
None
require_hs
Optional[bool]
whether the slicing algorithm requires the molecule to have hydrogens explicitly added.
None
constraints
Optional[List[Mol]]
List of molecules or patterns to preserve during the SAFE construction.
None
ignore_stereo
Optional[bool]
whether to ignore stereochemistry. RDKit does not support some SAFE strings when stereochemistry is defined.
False
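A hedged usage sketch for the module-level `encode` helper (the example molecule and chosen slicer are assumptions for illustration):

```python
import safe as sf

smiles = "O=C(C)Oc1ccccc1C(=O)O"  # arbitrary example molecule

# Default: canonical SAFE string with BRICS slicing.
safe_brics = sf.encode(smiles)

# Randomized SAFE string using the rotatable-bond slicer and a fixed seed.
safe_rot = sf.encode(smiles, canonical=False, randomize=True, seed=0, slicer="rotatable")
```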
Source code in safe/converter.py
def encode(\n inp: Union[str, dm.Mol],\n canonical: bool = True,\n randomize: Optional[bool] = False,\n seed: Optional[int] = None,\n slicer: Optional[Union[List[str], str, Callable]] = None,\n require_hs: Optional[bool] = None,\n constraints: Optional[List[dm.Mol]] = None,\n ignore_stereo: Optional[bool] = False,\n):\n \"\"\"\n Convert input smiles to SAFE representation\n\n Args:\n inp: input smiles\n canonical: whether to return canonical SAFE string. Defaults to True\n randomize: whether to randomize the safe string encoding. Will be ignored if canonical is provided\n seed: optional seed to use when allowing randomization of the SAFE encoding.\n slicer: slicer algorithm to use for encoding. Defaults to \"brics\".\n require_hs: whether the slicing algorithm require the molecule to have hydrogen explictly added.\n constraints: List of molecules or pattern to preserve during the SAFE construction.\n ignore_stereo: RDKIT does not support some particular SAFE subset when stereochemistry is defined.\n \"\"\"\n if slicer is None:\n slicer = \"brics\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter(slicer=slicer, require_hs=require_hs, ignore_stereo=ignore_stereo)\n try:\n encoded = safe_obj.encoder(\n inp,\n canonical=canonical,\n randomize=randomize,\n constraints=constraints,\n seed=seed,\n )\n except SAFEFragmentationError as e:\n raise e\n except Exception as e:\n raise SAFEEncodeError(f\"Failed to encode {inp} with {slicer}\") from e\n return encoded\n
"},{"location":"api/safe.html#safe.converter.decode","title":"decode(safe_str, as_mol=False, canonical=False, fix=True, remove_added_hs=True, remove_dummies=True, ignore_errors=False)
","text":"Convert input SAFE representation to smiles Args: safe_str: input SAFE representation to decode as a valid molecule or smiles as_mol: whether to return a molecule object or a smiles string canonical: whether to return a canonical smiles or a randomized smiles fix: whether to fix the SAFE representation to take into account non-connected attachment points remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string. remove_dummies: whether to remove dummy atoms from the SAFE representation ignore_errors: whether to ignore error and return None on decoding failure or raise an error
Source code in safe/converter.py
def decode(\n safe_str: str,\n as_mol: bool = False,\n canonical: bool = False,\n fix: bool = True,\n remove_added_hs: bool = True,\n remove_dummies: bool = True,\n ignore_errors: bool = False,\n):\n \"\"\"Convert input SAFE representation to smiles\n Args:\n safe_str: input SAFE representation to decode as a valid molecule or smiles\n as_mol: whether to return a molecule object or a smiles string\n canonical: whether to return a canonical smiles or a randomized smiles\n fix: whether to fix the SAFE representation to take into account non-connected attachment points\n remove_added_hs: whether to remove the hydrogen atoms that have been added to fix the string.\n remove_dummies: whether to remove dummy atoms from the SAFE representation\n ignore_errors: whether to ignore error and return None on decoding failure or raise an error\n\n \"\"\"\n with dm.without_rdkit_log():\n safe_obj = SAFEConverter()\n try:\n decoded = safe_obj.decoder(\n safe_str,\n as_mol=as_mol,\n canonical=canonical,\n fix=fix,\n remove_dummies=remove_dummies,\n remove_added_hs=remove_added_hs,\n )\n\n except Exception as e:\n if ignore_errors:\n return None\n raise SAFEDecodeError(f\"Failed to decode {safe_str}\") from e\n return decoded\n
"},{"location":"api/safe.html#safe-design","title":"SAFE Design","text":""},{"location":"api/safe.html#safe.sample.SAFEDesign","title":"SAFEDesign
","text":"Molecular generation using SAFE pretrained model
Source code in safe/sample.py
class SAFEDesign:\n \"\"\"Molecular generation using SAFE pretrained model\"\"\"\n\n _DEFAULT_MAX_LENGTH = 1024 # default max length used during training\n _DEFAULT_MODEL_PATH = \"datamol-io/safe-gpt\"\n\n def __init__(\n self,\n model: Union[SAFEDoubleHeadsModel, str],\n tokenizer: Union[str, SAFETokenizer],\n generation_config: Optional[Union[str, GenerationConfig]] = None,\n safe_encoder: Optional[sf.SAFEConverter] = None,\n verbose: bool = True,\n ):\n \"\"\"SAFEDesign constructor\n\n !!! info\n Design methods in SAFE are not deterministic when it comes to the token sampling step.\n If a method accepts a `random_seed`, it's for the SAFE-related algorithms and not the\n sampling from the autoregressive model. To ensure you get a deterministic sampling,\n please set the seed at the `transformers` package level.\n\n ```python\n import safe as sf\n import transformers\n my_seed = 100\n designer = sf.SAFEDesign(...)\n\n transformers.set_seed(100) # use this before calling a design function\n designer.linker_generation(...)\n ```\n\n\n Args:\n model: input SAFEDoubleHeadsModel to use for generation\n tokenizer: input SAFETokenizer to use for generation\n generation_config: input GenerationConfig to use for generation\n safe_encoder: custom safe encoder to use\n verbose: whether to print out logging information during generation\n \"\"\"\n if isinstance(model, (str, os.PathLike)):\n model = SAFEDoubleHeadsModel.from_pretrained(model)\n\n if isinstance(tokenizer, (str, os.PathLike)):\n tokenizer = SAFETokenizer.load(tokenizer)\n\n model.eval()\n self.model = model\n self.tokenizer = tokenizer\n if isinstance(generation_config, os.PathLike):\n generation_config = GenerationConfig.from_pretrained(generation_config)\n if generation_config is None:\n generation_config = GenerationConfig.from_model_config(model.config)\n self.generation_config = generation_config\n for special_token_id in [\"bos_token_id\", \"eos_token_id\", \"pad_token_id\"]:\n if getattr(self.generation_config, special_token_id) is None:\n setattr(\n self.generation_config, special_token_id, getattr(tokenizer, special_token_id)\n )\n\n self.verbose = verbose\n self.safe_encoder = safe_encoder or sf.SAFEConverter()\n\n @classmethod\n def load_default(\n cls, verbose: bool = False, model_dir: Optional[str] = None, device: str = None\n ) -> \"SAFEDesign\":\n \"\"\"Load default SAFEGenerator model\n\n Args:\n verbose: whether to print out logging information during generation\n model_dir: Optional path to model folder to use instead of the default one.\n If provided the tokenizer should be in the model_dir named as `tokenizer.json`\n device: optional device where to move the model\n \"\"\"\n if model_dir is None or not model_dir:\n model_dir = cls._DEFAULT_MODEL_PATH\n model = SAFEDoubleHeadsModel.from_pretrained(model_dir)\n tokenizer = SAFETokenizer.from_pretrained(model_dir)\n gen_config = GenerationConfig.from_pretrained(model_dir)\n if device is not None:\n model = model.to(device)\n return cls(model=model, tokenizer=tokenizer, generation_config=gen_config, verbose=verbose)\n\n def linker_generation(\n self,\n *groups: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform linker generation using the pretrained SAFE model.\n Linker generation is really just scaffold morphing underlying.\n\n 
Args:\n groups: list of fragments to link together, they are joined in the order provided\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n model_only: whether to use the model only ability and nothing more.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n side_chains = list(groups)\n\n if len(side_chains) != 2:\n raise ValueError(\n \"Linker generation only works when providing two groups as side chains\"\n )\n\n return self._fragment_linking(\n side_chains=side_chains,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=True,\n model_only=model_only,\n **kwargs,\n )\n\n def scaffold_morphing(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n return self._fragment_linking(\n side_chains=side_chains,\n mol=mol,\n core=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=False,\n **kwargs,\n )\n\n def _fragment_linking(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = False,\n random_seed: Optional[int] = None,\n is_linking: Optional[bool] = False,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold 
morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n is_linking: whether it's a linking task or not.\n For linking tasks, we use a different custom strategy of completing up to the attachment signal\n model_only: whether to use the model only ability and nothing more. Only relevant when doing linker generation\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n if side_chains is None:\n if mol is None and core is None:\n raise ValueError(\n \"Either side_chains OR mol+core should be provided for scaffold morphing\"\n )\n side_chains = sf.trainer.utils.compute_side_chains(mol, core)\n side_chains = (\n [dm.to_mol(x) for x in side_chains]\n if isinstance(side_chains, list)\n else [dm.to_mol(side_chains)]\n )\n\n side_chains = \".\".join([dm.to_smiles(x) for x in side_chains])\n\n if \"*\" not in side_chains and self.verbose:\n logger.warning(\n f\"Side chain {side_chains} does not contain any dummy atoms, this might not be what you want\"\n )\n\n rng = random.Random(random_seed)\n new_seed = rng.randint(1, 1000)\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n with dm.without_rdkit_log():\n context_mng = (\n sf.utils.attr_as(self.safe_encoder, \"slicer\", None)\n if do_not_fragment_further\n else suppress()\n )\n old_slicer = getattr(self.safe_encoder, \"slicer\", None)\n with context_mng:\n try:\n encoded_fragment = self.safe_encoder.encoder(\n side_chains,\n canonical=False,\n randomize=False,\n constraints=None,\n allow_empty=True,\n seed=new_seed,\n )\n\n except Exception as e:\n if self.verbose:\n logger.error(e)\n raise sf.SAFEEncodeError(f\"Failed to encode {side_chains}\") from e\n finally:\n if old_slicer is not None:\n self.safe_encoder.slicer = old_slicer\n\n fragments = encoded_fragment.split(\".\")\n missing_closure = Counter(self.safe_encoder._find_branch_number(encoded_fragment))\n missing_closure = [f\"{str(x)}\" for x in missing_closure if missing_closure[x] % 2 == 1]\n\n closure_pos = [\n m.start() for x in missing_closure for m in re.finditer(x, encoded_fragment)\n ]\n fragment_pos = [m.start() for m in re.finditer(r\"\\.\", encoded_fragment)]\n min_pos = 0\n while fragment_pos[min_pos] < closure_pos[0] and min_pos < len(fragment_pos):\n min_pos += 1\n min_pos += 1\n max_pos = len(fragment_pos)\n while fragment_pos[max_pos - 1] > closure_pos[-1] and max_pos > 0:\n max_pos -= 1\n\n split_index = rng.randint(min_pos, max_pos)\n prefix, suffixes 
= \".\".join(fragments[:split_index]), \".\".join(fragments[split_index:])\n\n missing_prefix_closure = Counter(self.safe_encoder._find_branch_number(prefix))\n missing_suffix_closure = Counter(self.safe_encoder._find_branch_number(suffixes))\n\n missing_prefix_closure = (\n [\".\"] + [x for x in missing_closure if int(x) not in missing_prefix_closure] + [\".\"]\n )\n missing_suffix_closure = (\n [\".\"] + [x for x in missing_closure if int(x) not in missing_suffix_closure] + [\".\"]\n )\n\n constraints_ids = []\n for permutation in itertools.permutations(missing_closure + [\".\"]):\n constraints_ids.append(\n self.tokenizer.encode(list(permutation), add_special_tokens=False)\n )\n\n # prefix_constraints_ids = self.tokenizer.encode(missing_prefix_closure, add_special_tokens=False)\n # suffix_constraints_ids = self.tokenizer.encode(missing_suffix_closure, add_special_tokens=False)\n\n # suffix_ids = self.tokenizer.encode([suffixes+self.tokenizer.tokenizer.eos_token], add_special_tokens=False)\n # prefix_ids = self.tokenizer.encode([prefix], add_special_tokens=False)\n\n prefix_kwargs = kwargs.copy()\n suffix_kwargs = prefix_kwargs.copy()\n\n if is_linking and model_only:\n for _kwargs in [prefix_kwargs, suffix_kwargs]:\n _kwargs.setdefault(\"how\", \"beam\")\n _kwargs.setdefault(\"num_beams\", n_samples_per_trial)\n _kwargs.setdefault(\"do_sample\", False)\n\n prefix_kwargs[\"constraints\"] = []\n suffix_kwargs[\"constraints\"] = []\n # prefix_kwargs[\"constraints\"] = [PhrasalConstraint(tkl) for tkl in suffix_constraints_ids]\n # suffix_kwargs[\"constraints\"] = [PhrasalConstraint(tkl) for tkl in prefix_constraints_ids]\n\n # we first generate a part of the fragment with for unique constraint that it should contain\n # the closure required to join something to the suffix.\n prefix_kwargs[\"constraints\"] += [\n DisjunctiveConstraint(tkl) for tkl in constraints_ids\n ]\n suffix_kwargs[\"constraints\"] += [\n DisjunctiveConstraint(tkl) for tkl in constraints_ids\n ]\n\n prefix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=prefix, **prefix_kwargs\n )\n suffix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=suffixes, **suffix_kwargs\n )\n\n prefix_sequences = [\n self._find_fragment_cut(x, prefix, missing_prefix_closure[1])\n for x in prefix_sequences\n ]\n suffix_sequences = [\n self._find_fragment_cut(x, suffixes, missing_suffix_closure[1])\n for x in suffix_sequences\n ]\n\n linkers = [x for x in set(prefix_sequences + suffix_sequences) if x]\n sequences = [f\"{prefix}.{linker}.{suffixes}\" for linker in linkers]\n sequences += self._decode_safe(sequences, canonical=True, remove_invalid=sanitize)\n\n else:\n mol_linker_slicer = sf.utils.MolSlicer(\n shortest_linker=(not is_linking), require_ring_system=(not is_linking)\n )\n prefix_smiles = sf.decode(prefix, remove_dummies=False, as_mol=False)\n suffix_smiles = sf.decode(suffixes, remove_dummies=False, as_mol=False)\n\n prefix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=prefix + \".\", **prefix_kwargs\n )\n suffix_sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=suffixes + \".\", **suffix_kwargs\n )\n\n prefix_sequences = self._decode_safe(\n prefix_sequences, canonical=True, remove_invalid=True\n )\n suffix_sequences = self._decode_safe(\n suffix_sequences, canonical=True, remove_invalid=True\n )\n sequences = self.__mix_sequences(\n prefix_sequences,\n suffix_sequences,\n prefix_smiles,\n suffix_smiles,\n n_samples_per_trial,\n 
mol_linker_slicer,\n )\n\n total_sequences.extend(sequences)\n\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(\n total_sequences, side_chains\n )\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def motif_extension(\n self,\n motif: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform motif extension using the pretrained SAFE model.\n Motif extension is really just scaffold decoration underlying.\n\n Args:\n motif: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n return self.scaffold_decoration(\n motif,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n add_dot=True,\n **kwargs,\n )\n\n def super_structure(\n self,\n core: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n attachment_point_depth: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform super structure generation using the pretrained SAFE model.\n\n To generate super-structure, we basically just create various attachment points to the input core,\n then perform scaffold decoration.\n\n Args:\n core: input substructure to use. 
We aim to generate super structures of this molecule\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of different attachment points to consider\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n attachment_point_depth: depth of opening the attachment points.\n Increasing this, means you increase the number of substitution point to consider.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n core = dm.to_mol(core)\n cores = sf.utils.list_individual_attach_points(core, depth=attachment_point_depth)\n # get the fully open mol, everytime too.\n cores.append(dm.to_smiles(dm.reactions.open_attach_points(core)))\n cores = list(set(cores))\n rng = random.Random(random_seed)\n rng.shuffle(cores)\n # now also get the single openining of an attachment point\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n core = cores[_ % len(cores)]\n old_verbose = self.verbose\n try:\n with sf.utils.attr_as(self, \"verbose\", False):\n out = self._completion(\n fragment=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=1,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n **kwargs,\n )\n total_sequences.extend(out)\n except Exception as e:\n if old_verbose:\n logger.error(e)\n\n finally:\n self.verbose = old_verbose\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def scaffold_decoration(\n self,\n scaffold: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = True,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = True,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform scaffold decoration using the pretrained SAFE model\n\n For scaffold decoration, we basically starts with a prefix with the attachment point.\n We first convert the prefix into valid safe string.\n\n Args:\n scaffold: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check if the scaffold is still present\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n total_sequences = self._completion(\n fragment=scaffold,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n add_dot=add_dot,\n **kwargs,\n )\n # if we require sanitization\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(total_sequences, scaffold)\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def 
de_novo_generation(\n self,\n n_samples_per_trial: int = 10,\n sanitize: bool = False,\n n_trials: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n ):\n \"\"\"Perform de novo generation using the pretrained SAFE model.\n\n De novo generation is equivalent to not having any prefix.\n\n Args:\n n_samples_per_trial: number of new molecules to generate\n sanitize: whether to perform sanitization, aka, perform control to ensure what is asked is what is returned\n n_trials: number of randomization to perform\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n sequences = self._generate(n_samples=n_samples_per_trial, **kwargs)\n total_sequences.extend(sequences)\n total_sequences = self._decode_safe(\n total_sequences, canonical=True, remove_invalid=sanitize\n )\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n\n def _find_fragment_cut(self, fragment: str, prefix_constraint: str, branching_id: str):\n \"\"\"\n Perform a cut on the input fragment in such a way that it could be joined with another fragments sharing the same\n branching id.\n\n Args:\n fragment: fragment to cut\n prefix_constraint: prefix constraint to use\n branching_id: branching id to use\n \"\"\"\n prefix_constraint = prefix_constraint.rstrip(\".\") + \".\"\n fragment = (\n fragment.replace(prefix_constraint, \"\", 1)\n if fragment.startswith(prefix_constraint)\n else fragment\n )\n fragments = fragment.split(\".\")\n i = 0\n for x in fragments:\n if branching_id in x:\n i += 1\n break\n return \".\".join(fragments[:i])\n\n def __mix_sequences(\n self,\n prefix_sequences: List[str],\n suffix_sequences: List[str],\n prefix: str,\n suffix: str,\n n_samples: int,\n mol_linker_slicer,\n ):\n \"\"\"Use generated prefix and suffix sequences to form new molecules\n that will be the merging of both. 
This is the two step scaffold morphing and linker generation scheme\n Args:\n prefix_sequences: list of prefix sequences\n suffix_sequences: list of suffix sequences\n prefix: decoded smiles of the prefix\n suffix: decoded smiles of the suffix\n n_samples: number of samples to generate\n \"\"\"\n prefix_linkers = []\n suffix_linkers = []\n prefix_query = dm.from_smarts(prefix)\n suffix_query = dm.from_smarts(suffix)\n\n for x in prefix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, prefix_query)\n prefix_linkers.append(out[1])\n for x in suffix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, suffix_query)\n suffix_linkers.append(out[1])\n n_linked = 0\n linked = []\n linkers = prefix_linkers + suffix_linkers\n linkers = [x for x in linkers if x is not None]\n for n_linked, linker in enumerate(linkers):\n linked.extend(mol_linker_slicer.link_fragments(linker, prefix, suffix))\n if n_linked > n_samples:\n break\n linked = [x for x in linked if x]\n return linked[:n_samples]\n\n def _decode_safe(\n self, sequences: List[str], canonical: bool = True, remove_invalid: bool = False\n ):\n \"\"\"Decode a safe sequence into a molecule\n\n Args:\n sequence: safe sequence to decode\n canonical: whether to return canonical sequence\n remove_invalid: whether to remove invalid safe strings or keep them\n \"\"\"\n\n def _decode_fn(x):\n return sf.decode(\n x,\n as_mol=False,\n fix=True,\n remove_added_hs=True,\n canonical=canonical,\n ignore_errors=True,\n remove_dummies=True,\n )\n\n if len(sequences) > 100:\n safe_strings = dm.parallelized(_decode_fn, sequences, n_jobs=-1)\n else:\n safe_strings = [_decode_fn(x) for x in sequences]\n if remove_invalid:\n safe_strings = [x for x in safe_strings if x is not None]\n\n return safe_strings\n\n def _completion(\n self,\n fragment: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = False,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = False,\n is_safe: Optional[bool] = False,\n **kwargs,\n ):\n \"\"\"Perform sentence completion using a prefix fragment\n\n Args:\n fragment: fragment (with attachment points)\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n is_safe: whether the smiles is already encoded as a safe string\n add_dot: whether to add a dot at the end of the fragments to signal to the model that we want to generate a distinct fragment.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... 
for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n # Step 1: we conver the fragment into the relevant safe string format\n # we use the provided safe encoder with the slicer that was expected\n\n rng = random.Random(random_seed)\n new_seed = rng.randint(1, 1000)\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n if is_safe:\n encoded_fragment = fragment\n else:\n with dm.without_rdkit_log():\n context_mng = (\n sf.utils.attr_as(self.safe_encoder, \"slicer\", None)\n if do_not_fragment_further\n else suppress()\n )\n old_slicer = getattr(self.safe_encoder, \"slicer\", None)\n with context_mng:\n try:\n encoded_fragment = self.safe_encoder.encoder(\n fragment,\n canonical=False,\n randomize=True,\n constraints=None,\n allow_empty=True,\n seed=new_seed,\n )\n\n except Exception as e:\n if self.verbose:\n logger.error(e)\n raise sf.SAFEEncodeError(f\"Failed to encode {fragment}\") from e\n finally:\n if old_slicer is not None:\n self.safe_encoder.slicer = old_slicer\n\n if add_dot and encoded_fragment.count(\"(\") == encoded_fragment.count(\")\"):\n encoded_fragment = encoded_fragment.rstrip(\".\") + \".\"\n\n sequences = self._generate(\n n_samples=n_samples_per_trial, safe_prefix=encoded_fragment, **kwargs\n )\n\n sequences = self._decode_safe(sequences, canonical=True, remove_invalid=sanitize)\n total_sequences.extend(sequences)\n\n return total_sequences\n\n def _generate(\n self,\n n_samples: int = 1,\n safe_prefix: Optional[str] = None,\n max_length: Optional[int] = 100,\n how: Optional[str] = \"random\",\n num_beams: Optional[int] = None,\n num_beam_groups: Optional[int] = None,\n do_sample: Optional[bool] = None,\n **kwargs,\n ):\n \"\"\"Sample a new sequence using the underlying hugging face model.\n This emulates the izanagi sampling models, if you wish to retain the hugging face generation\n behaviour, either call the hugging face functions directly or overwrite this function\n\n ??? note \"Generation Parameters\"\n From the hugging face documentation:\n\n * `greedy decoding` if how=\"greedy\" and num_beams=1 and do_sample=False.\n * `multinomial sampling` if num_beams=1 and do_sample=True.\n * `beam-search decoding` if how=\"beam\" and num_beams>1 and do_sample=False.\n * `beam-search multinomial` sampling by calling if beam=True, num_beams>1 and do_sample=True or how=\"random\" and num_beams>1\n * `diverse beam-search decoding` if num_beams>1 and num_beam_groups>1\n\n It's also possible to ignore the 'how' shortcut and directly call the underlying generation methods using the proper arguments.\n Learn more here: https://huggingface.co/docs/transformers/v4.32.0/en/main_classes/text_generation#transformers.GenerationConfig\n Under the hood, the following will be applied depending on the arguments:\n\n * greedy decoding by calling greedy_search() if num_beams=1 and do_sample=False\n * contrastive search by calling contrastive_search() if penalty_alpha>0. 
and top_k>1\n * multinomial sampling by calling sample() if num_beams=1 and do_sample=True\n * beam-search decoding by calling beam_search() if num_beams>1 and do_sample=False\n * beam-search multinomial sampling by calling beam_sample() if num_beams>1 and do_sample=True\n * diverse beam-search decoding by calling group_beam_search(), if num_beams>1 and num_beam_groups>1\n * constrained beam-search decoding by calling constrained_beam_search(), if constraints!=None or force_words_ids!=None\n * assisted decoding by calling assisted_decoding(), if assistant_model is passed to .generate()\n\n Args:\n n_samples: number of sequences to return\n safe_prefix: Prefix to use in sampling, should correspond to a safe fragment\n max_length : maximum length of sampled sequence\n how: which sampling method to use: \"beam\", \"greedy\" or \"random\". Can be used to control other parameters by setting defaults\n num_beams: number of beams for beam search. 1 means no beam search, unless beam is specified then max(n_samples, num_beams) is used\n num_beam_groups: number of beam groups for diverse beam search\n do_sample: whether to perform random sampling or not, equivalent to setting random to True\n kwargs: any additional keyword argument to pass to the underlying sampling `generate` from hugging face transformer\n\n Returns:\n samples: list of sampled molecules, including failed validation\n\n \"\"\"\n pretrained_tk = self.tokenizer.get_pretrained()\n if getattr(pretrained_tk, \"model_max_length\") is None:\n setattr(\n pretrained_tk,\n \"model_max_length\",\n self._DEFAULT_MAX_LENGTH, # this was the defaul\n )\n\n input_ids = safe_prefix\n if isinstance(safe_prefix, str):\n # EN: should we address the special token issues\n input_ids = pretrained_tk(\n safe_prefix,\n return_tensors=\"pt\",\n )\n\n num_beams = num_beams or None\n do_sample = do_sample or False\n\n if how == \"random\":\n do_sample = True\n\n elif how is not None and \"beam\" in how:\n num_beams = max((num_beams or 0), n_samples)\n\n is_greedy = how == \"greedy\" or (num_beams in [0, 1, None]) and do_sample is False\n\n kwargs[\"do_sample\"] = do_sample\n if num_beams is not None:\n kwargs[\"num_beams\"] = num_beams\n if num_beam_groups is not None:\n kwargs[\"num_beam_groups\"] = num_beam_groups\n kwargs[\"output_scores\"] = True\n kwargs[\"return_dict_in_generate\"] = True\n kwargs[\"num_return_sequences\"] = n_samples\n kwargs[\"max_length\"] = max_length\n kwargs.setdefault(\"early_stopping\", True)\n # EN we don't do anything with the score that the model might return on generate ...\n if not isinstance(input_ids, Mapping):\n input_ids = {\"inputs\": None}\n else:\n # EN: we remove the EOS token added before running the prediction\n # because the model output nonsense when we keep it.\n for k in input_ids:\n input_ids[k] = input_ids[k][:, :-1]\n\n for k, v in input_ids.items():\n if torch.is_tensor(v):\n input_ids[k] = v.to(self.model.device)\n\n # we remove the token_type_ids to support more model type than just GPT2\n input_ids.pop(\"token_type_ids\", None)\n\n if is_greedy:\n kwargs[\"num_return_sequences\"] = 1\n if num_beams is not None and num_beams > 1:\n raise ValueError(\"Cannot set num_beams|num_beam_groups > 1 for greedy\")\n # under greedy decoding there can only be a single solution\n # we just duplicate the solution several time for efficiency\n outputs = self.model.generate(\n **input_ids,\n generation_config=self.generation_config,\n **kwargs,\n )\n sequences = [\n pretrained_tk.decode(outputs.sequences.squeeze(), 
skip_special_tokens=True)\n ] * n_samples\n\n else:\n outputs = self.model.generate(\n **input_ids,\n generation_config=self.generation_config,\n **kwargs,\n )\n sequences = pretrained_tk.batch_decode(outputs.sequences, skip_special_tokens=True)\n return sequences\n
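As a usage-level illustration of the `how` shortcut described in `_generate` above, the sketch below shows how the same design call switches between multinomial sampling and beam search. It is a minimal sketch: it assumes the default pretrained model (via `load_default`, documented later on this page) and simply forwards `how`, `do_sample` and `num_beams` to the underlying Hugging Face `generate` call.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

# multinomial sampling: num_beams=1, do_sample=True
mols_random = designer.de_novo_generation(n_samples_per_trial=5, how="random")

# beam-search multinomial sampling: num_beams is raised to at least n_samples_per_trial
mols_beam = designer.de_novo_generation(
    n_samples_per_trial=5, how="beam", do_sample=True, num_beams=10
)
```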
"},{"location":"api/safe.html#safe.sample.SAFEDesign.__init__","title":"__init__(model, tokenizer, generation_config=None, safe_encoder=None, verbose=True)
","text":"SAFEDesign constructor
Info
Design methods in SAFE are not deterministic when it comes to the token sampling step. If a method accepts a random_seed, it applies to the SAFE-related algorithms and not to the sampling from the autoregressive model. To ensure you get deterministic sampling, set the seed at the transformers package level.
import safe as sf\nimport transformers\nmy_seed = 100\ndesigner = sf.SAFEDesign(...)\n\ntransformers.set_seed(100) # use this before calling a design function\ndesigner.linker_generation(...)\n
Parameters:
- model (Union[SAFEDoubleHeadsModel, str]): input SAFEDoubleHeadsModel to use for generation. Required.
- tokenizer (Union[str, SAFETokenizer]): input SAFETokenizer to use for generation. Required.
- generation_config (Optional[Union[str, GenerationConfig]]): input GenerationConfig to use for generation. Default: None.
- safe_encoder (Optional[SAFEConverter]): custom SAFE encoder to use. Default: None.
- verbose (bool): whether to print out logging information during generation. Default: True.
Source code in safe/sample.py
def __init__(\n self,\n model: Union[SAFEDoubleHeadsModel, str],\n tokenizer: Union[str, SAFETokenizer],\n generation_config: Optional[Union[str, GenerationConfig]] = None,\n safe_encoder: Optional[sf.SAFEConverter] = None,\n verbose: bool = True,\n):\n \"\"\"SAFEDesign constructor\n\n !!! info\n Design methods in SAFE are not deterministic when it comes to the token sampling step.\n If a method accepts a `random_seed`, it's for the SAFE-related algorithms and not the\n sampling from the autoregressive model. To ensure you get a deterministic sampling,\n please set the seed at the `transformers` package level.\n\n ```python\n import safe as sf\n import transformers\n my_seed = 100\n designer = sf.SAFEDesign(...)\n\n transformers.set_seed(100) # use this before calling a design function\n designer.linker_generation(...)\n ```\n\n\n Args:\n model: input SAFEDoubleHeadsModel to use for generation\n tokenizer: input SAFETokenizer to use for generation\n generation_config: input GenerationConfig to use for generation\n safe_encoder: custom safe encoder to use\n verbose: whether to print out logging information during generation\n \"\"\"\n if isinstance(model, (str, os.PathLike)):\n model = SAFEDoubleHeadsModel.from_pretrained(model)\n\n if isinstance(tokenizer, (str, os.PathLike)):\n tokenizer = SAFETokenizer.load(tokenizer)\n\n model.eval()\n self.model = model\n self.tokenizer = tokenizer\n if isinstance(generation_config, os.PathLike):\n generation_config = GenerationConfig.from_pretrained(generation_config)\n if generation_config is None:\n generation_config = GenerationConfig.from_model_config(model.config)\n self.generation_config = generation_config\n for special_token_id in [\"bos_token_id\", \"eos_token_id\", \"pad_token_id\"]:\n if getattr(self.generation_config, special_token_id) is None:\n setattr(\n self.generation_config, special_token_id, getattr(tokenizer, special_token_id)\n )\n\n self.verbose = verbose\n self.safe_encoder = safe_encoder or sf.SAFEConverter()\n
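A minimal construction sketch, assuming locally saved artifacts; the paths below are placeholders, and both `model` and `tokenizer` may instead be passed as already-instantiated objects, as described in the constructor arguments above.
```python
import safe as sf
from transformers import GenerationConfig

designer = sf.SAFEDesign(
    model="./my_safe_model",                     # placeholder: loaded with SAFEDoubleHeadsModel.from_pretrained
    tokenizer="./my_safe_model/tokenizer.json",  # placeholder: loaded with SAFETokenizer.load
    generation_config=GenerationConfig(max_length=100),  # optional; derived from the model config if omitted
    verbose=True,
)
```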
"},{"location":"api/safe.html#safe.sample.SAFEDesign.__mix_sequences","title":"__mix_sequences(prefix_sequences, suffix_sequences, prefix, suffix, n_samples, mol_linker_slicer)
","text":"Use generated prefix and suffix sequences to form new molecules that will be the merging of both. This is the two step scaffold morphing and linker generation scheme Args: prefix_sequences: list of prefix sequences suffix_sequences: list of suffix sequences prefix: decoded smiles of the prefix suffix: decoded smiles of the suffix n_samples: number of samples to generate
Source code in safe/sample.py
def __mix_sequences(\n self,\n prefix_sequences: List[str],\n suffix_sequences: List[str],\n prefix: str,\n suffix: str,\n n_samples: int,\n mol_linker_slicer,\n):\n \"\"\"Use generated prefix and suffix sequences to form new molecules\n that will be the merging of both. This is the two step scaffold morphing and linker generation scheme\n Args:\n prefix_sequences: list of prefix sequences\n suffix_sequences: list of suffix sequences\n prefix: decoded smiles of the prefix\n suffix: decoded smiles of the suffix\n n_samples: number of samples to generate\n \"\"\"\n prefix_linkers = []\n suffix_linkers = []\n prefix_query = dm.from_smarts(prefix)\n suffix_query = dm.from_smarts(suffix)\n\n for x in prefix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, prefix_query)\n prefix_linkers.append(out[1])\n for x in suffix_sequences:\n with suppress(Exception):\n x = dm.to_mol(x)\n out = mol_linker_slicer(x, suffix_query)\n suffix_linkers.append(out[1])\n n_linked = 0\n linked = []\n linkers = prefix_linkers + suffix_linkers\n linkers = [x for x in linkers if x is not None]\n for n_linked, linker in enumerate(linkers):\n linked.extend(mol_linker_slicer.link_fragments(linker, prefix, suffix))\n if n_linked > n_samples:\n break\n linked = [x for x in linked if x]\n return linked[:n_samples]\n
"},{"location":"api/safe.html#safe.sample.SAFEDesign.de_novo_generation","title":"de_novo_generation(n_samples_per_trial=10, sanitize=False, n_trials=None, **kwargs)
","text":"Perform de novo generation using the pretrained SAFE model.
De novo generation is equivalent to not having any prefix.
Parameters:
- n_samples_per_trial (int): number of new molecules to generate. Default: 10.
- sanitize (bool): whether to perform sanitization, i.e. check that what is returned matches what was asked for. Default: False.
- n_trials (Optional[int]): number of randomizations to perform. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def de_novo_generation(\n self,\n n_samples_per_trial: int = 10,\n sanitize: bool = False,\n n_trials: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform de novo generation using the pretrained SAFE model.\n\n De novo generation is equivalent to not having any prefix.\n\n Args:\n n_samples_per_trial: number of new molecules to generate\n sanitize: whether to perform sanitization, aka, perform control to ensure what is asked is what is returned\n n_trials: number of randomization to perform\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n # EN: lazy programming much ?\n kwargs.setdefault(\"how\", \"random\")\n if kwargs[\"how\"] != \"random\" and not kwargs.get(\"do_sample\"):\n logger.warning(\n \"I don't think you know what you are doing ... for de novo generation `do_sample=True` or `how='random'` is expected !\"\n )\n\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n sequences = self._generate(n_samples=n_samples_per_trial, **kwargs)\n total_sequences.extend(sequences)\n total_sequences = self._decode_safe(\n total_sequences, canonical=True, remove_invalid=sanitize\n )\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
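A minimal de novo generation sketch, assuming the default pretrained designer; with `sanitize=True`, only validly decoded molecules are kept.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

generated = designer.de_novo_generation(
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
)
print(len(generated), generated[:3])  # list of SMILES strings
```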
"},{"location":"api/safe.html#safe.sample.SAFEDesign.linker_generation","title":"linker_generation(*groups, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, model_only=False, **kwargs)
","text":"Perform linker generation using the pretrained SAFE model. Linker generation is really just scaffold morphing underlying.
Parameters:
- groups (Union[str, Mol]): list of fragments to link together; they are joined in the order provided. Default: ().
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- model_only (Optional[bool]): whether to use the model's own ability only and nothing more. Default: False.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def linker_generation(\n self,\n *groups: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n model_only: Optional[bool] = False,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform linker generation using the pretrained SAFE model.\n Linker generation is really just scaffold morphing underlying.\n\n Args:\n groups: list of fragments to link together, they are joined in the order provided\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n model_only: whether to use the model only ability and nothing more.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n side_chains = list(groups)\n\n if len(side_chains) != 2:\n raise ValueError(\n \"Linker generation only works when providing two groups as side chains\"\n )\n\n return self._fragment_linking(\n side_chains=side_chains,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=True,\n model_only=model_only,\n **kwargs,\n )\n
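A minimal linker generation sketch, assuming the default pretrained designer; the two side-chain SMILES below are illustrative placeholders with open attachment points (`[*]`), and exactly two groups must be provided.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

linked = designer.linker_generation(
    "[*]c1ccccc1",   # illustrative fragment 1
    "[*]C1CCNCC1",   # illustrative fragment 2
    n_samples_per_trial=10,
    n_trials=1,
    sanitize=True,
)
```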
"},{"location":"api/safe.html#safe.sample.SAFEDesign.load_default","title":"load_default(verbose=False, model_dir=None, device=None)
classmethod
","text":"Load default SAFEGenerator model
Parameters:
- verbose (bool): whether to print out logging information during generation. Default: False.
- model_dir (Optional[str]): optional path to a model folder to use instead of the default one. If provided, the tokenizer should be located in model_dir and named tokenizer.json. Default: None.
- device (str): optional device where to move the model. Default: None.
Source code in safe/sample.py
@classmethod\ndef load_default(\n cls, verbose: bool = False, model_dir: Optional[str] = None, device: str = None\n) -> \"SAFEDesign\":\n \"\"\"Load default SAFEGenerator model\n\n Args:\n verbose: whether to print out logging information during generation\n model_dir: Optional path to model folder to use instead of the default one.\n If provided the tokenizer should be in the model_dir named as `tokenizer.json`\n device: optional device where to move the model\n \"\"\"\n if model_dir is None or not model_dir:\n model_dir = cls._DEFAULT_MODEL_PATH\n model = SAFEDoubleHeadsModel.from_pretrained(model_dir)\n tokenizer = SAFETokenizer.from_pretrained(model_dir)\n gen_config = GenerationConfig.from_pretrained(model_dir)\n if device is not None:\n model = model.to(device)\n return cls(model=model, tokenizer=tokenizer, generation_config=gen_config, verbose=verbose)\n
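A minimal loading sketch; `device` is optional, and `model_dir` can point at your own artifacts (with the tokenizer saved as `tokenizer.json` in that folder). The custom path below is a placeholder.
```python
import safe as sf

# default pretrained artifacts
designer = sf.SAFEDesign.load_default(verbose=True, device="cpu")

# or, with a custom folder (placeholder path)
custom_designer = sf.SAFEDesign.load_default(model_dir="./my_safe_model", device="cuda")
```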
"},{"location":"api/safe.html#safe.sample.SAFEDesign.motif_extension","title":"motif_extension(motif, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
","text":"Perform motif extension using the pretrained SAFE model. Motif extension is really just scaffold decoration underlying.
Parameters:
- motif (Union[str, Mol]): scaffold (with attachment points) to decorate. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules and check that the motif is still present. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def motif_extension(\n self,\n motif: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform motif extension using the pretrained SAFE model.\n Motif extension is really just scaffold decoration underlying.\n\n Args:\n motif: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n return self.scaffold_decoration(\n motif,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n add_dot=True,\n **kwargs,\n )\n
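A minimal motif extension sketch, assuming the default pretrained designer; the motif SMILES is an illustrative placeholder with an open attachment point.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

extended = designer.motif_extension(
    "[*]c1ccc(CN)cc1",   # illustrative motif
    n_samples_per_trial=10,
    sanitize=True,
)
```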
"},{"location":"api/safe.html#safe.sample.SAFEDesign.scaffold_decoration","title":"scaffold_decoration(scaffold, n_samples_per_trial=10, n_trials=1, do_not_fragment_further=True, sanitize=False, random_seed=None, add_dot=True, **kwargs)
","text":"Perform scaffold decoration using the pretrained SAFE model
For scaffold decoration, we start from a prefix that carries the attachment point(s) and first convert that prefix into a valid SAFE string.
Parameters:
- scaffold (Union[str, Mol]): scaffold (with attachment points) to decorate. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules and check that the scaffold is still present. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def scaffold_decoration(\n self,\n scaffold: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n do_not_fragment_further: Optional[bool] = True,\n sanitize: bool = False,\n random_seed: Optional[int] = None,\n add_dot: Optional[bool] = True,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform scaffold decoration using the pretrained SAFE model\n\n For scaffold decoration, we basically starts with a prefix with the attachment point.\n We first convert the prefix into valid safe string.\n\n Args:\n scaffold: scaffold (with attachment points) to decorate\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules and check if the scaffold is still present\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n total_sequences = self._completion(\n fragment=scaffold,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n add_dot=add_dot,\n **kwargs,\n )\n # if we require sanitization\n # then we should filter out molecules that do not match the requested\n if sanitize:\n total_sequences = sf.utils.filter_by_substructure_constraints(total_sequences, scaffold)\n if self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
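A minimal scaffold decoration sketch, assuming the default pretrained designer; the scaffold below is an illustrative placeholder with two numbered attachment points, and `sanitize=True` additionally filters out generations that no longer contain the scaffold.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

scaffold = "[1*]Nc1ccc(C(=O)N[2*])cc1"   # illustrative scaffold with attachment points
decorated = designer.scaffold_decoration(
    scaffold,
    n_samples_per_trial=10,
    n_trials=2,
    sanitize=True,
    random_seed=42,
)
```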
"},{"location":"api/safe.html#safe.sample.SAFEDesign.scaffold_morphing","title":"scaffold_morphing(side_chains=None, mol=None, core=None, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, **kwargs)
","text":"Perform scaffold morphing decoration using the pretrained SAFE model
For scaffold morphing, we try to replace the core with a new one. If side_chains are provided, we use them directly. If a combination of molecule and core is provided instead, we use them to extract the side chains and then perform the scaffold morphing.
Finding the side chains
The algorithm that finds the side chains from the core assumes that the input core has attachment points. Those attachment points are never considered part of the substructure query; rather, they define where the side chains attach. See ~sf.utils.compute_side_chains for more information.
Parameters:
- side_chains (Optional[Union[Mol, str, List[Union[str, Mol]]]]): side chains to use for scaffold morphing (the set of fragments is joined as well as possible). Default: None.
- mol (Optional[Union[Mol, str]]): input molecule when side_chains are not provided. Default: None.
- core (Optional[Union[Mol, str]]): core to morph into another scaffold. Default: None.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of randomizations to perform. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def scaffold_morphing(\n self,\n side_chains: Optional[Union[dm.Mol, str, List[Union[str, dm.Mol]]]] = None,\n mol: Optional[Union[dm.Mol, str]] = None,\n core: Optional[Union[dm.Mol, str]] = None,\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform scaffold morphing decoration using the pretrained SAFE model\n\n For scaffold morphing, we try to replace the core by a new one. If the side_chains are provided, we use them.\n If a combination of molecule and core is provided, then, we use them to extract the side chains and performing the\n scaffold morphing then.\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points.\n See ~sf.utils.compute_side_chains for more information.\n\n Args:\n side_chains: side chains to use to perform scaffold morphing (joining as best as possible the set of fragments)\n mol: input molecules when side_chains are not provided\n core: core to morph into another scaffold\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of randomization to perform\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n return self._fragment_linking(\n side_chains=side_chains,\n mol=mol,\n core=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=n_trials,\n sanitize=sanitize,\n do_not_fragment_further=do_not_fragment_further,\n random_seed=random_seed,\n is_linking=False,\n **kwargs,\n )\n
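A minimal scaffold morphing sketch, assuming the default pretrained designer. The side chains below are an illustrative placeholder (two fragments with numbered attachment points given as a dotted SMILES); alternatively, `mol` and `core` can be passed so the side chains are extracted automatically.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

morphed = designer.scaffold_morphing(
    side_chains="[1*]N1CCCC1.[2*]c1ccccc1O",   # illustrative side chains
    n_samples_per_trial=10,
    sanitize=True,
)
```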
"},{"location":"api/safe.html#safe.sample.SAFEDesign.super_structure","title":"super_structure(core, n_samples_per_trial=10, n_trials=1, sanitize=False, do_not_fragment_further=True, random_seed=None, attachment_point_depth=None, **kwargs)
","text":"Perform super structure generation using the pretrained SAFE model.
To generate super structures, we create various attachment points on the input core and then perform scaffold decoration.
Parameters:
- core (Union[str, Mol]): input substructure to use; we aim to generate super structures of this molecule. Required.
- n_samples_per_trial (int): number of new molecules to generate for each randomization. Default: 10.
- n_trials (Optional[int]): number of different attachment points to consider. Default: 1.
- do_not_fragment_further (Optional[bool]): whether to prevent fragmenting the scaffold further. Default: True.
- sanitize (bool): whether to sanitize the generated molecules. Default: False.
- random_seed (Optional[int]): random seed to use. Default: None.
- attachment_point_depth (Optional[int]): depth of opening the attachment points; increasing this increases the number of substitution points to consider. Default: None.
- kwargs (Optional[Dict[Any, Any]]): any argument to provide to the underlying generation function. Default: {}.
Source code in safe/sample.py
def super_structure(\n self,\n core: Union[str, dm.Mol],\n n_samples_per_trial: int = 10,\n n_trials: Optional[int] = 1,\n sanitize: bool = False,\n do_not_fragment_further: Optional[bool] = True,\n random_seed: Optional[int] = None,\n attachment_point_depth: Optional[int] = None,\n **kwargs: Optional[Dict[Any, Any]],\n):\n \"\"\"Perform super structure generation using the pretrained SAFE model.\n\n To generate super-structure, we basically just create various attachment points to the input core,\n then perform scaffold decoration.\n\n Args:\n core: input substructure to use. We aim to generate super structures of this molecule\n n_samples_per_trial: number of new molecules to generate for each randomization\n n_trials: number of different attachment points to consider\n do_not_fragment_further: whether to fragment the scaffold further or not\n sanitize: whether to sanitize the generated molecules\n random_seed: random seed to use\n attachment_point_depth: depth of opening the attachment points.\n Increasing this, means you increase the number of substitution point to consider.\n kwargs: any argument to provide to the underlying generation function\n \"\"\"\n\n core = dm.to_mol(core)\n cores = sf.utils.list_individual_attach_points(core, depth=attachment_point_depth)\n # get the fully open mol, everytime too.\n cores.append(dm.to_smiles(dm.reactions.open_attach_points(core)))\n cores = list(set(cores))\n rng = random.Random(random_seed)\n rng.shuffle(cores)\n # now also get the single openining of an attachment point\n total_sequences = []\n n_trials = n_trials or 1\n for _ in tqdm(range(n_trials), disable=(not self.verbose), leave=False):\n core = cores[_ % len(cores)]\n old_verbose = self.verbose\n try:\n with sf.utils.attr_as(self, \"verbose\", False):\n out = self._completion(\n fragment=core,\n n_samples_per_trial=n_samples_per_trial,\n n_trials=1,\n do_not_fragment_further=do_not_fragment_further,\n sanitize=sanitize,\n random_seed=random_seed,\n **kwargs,\n )\n total_sequences.extend(out)\n except Exception as e:\n if old_verbose:\n logger.error(e)\n\n finally:\n self.verbose = old_verbose\n\n if sanitize and self.verbose:\n logger.info(\n f\"After sanitization, {len(total_sequences)} / {n_samples_per_trial*n_trials} ({len(total_sequences)*100/(n_samples_per_trial*n_trials):.2f} %) generated molecules are valid !\"\n )\n return total_sequences\n
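A minimal super structure sketch, assuming the default pretrained designer; the core below is an illustrative placeholder without attachment points, which the method opens automatically up to `attachment_point_depth`.
```python
import safe as sf

designer = sf.SAFEDesign.load_default(verbose=False)

supers = designer.super_structure(
    core="c1ccc2ccccc2c1",     # illustrative core (naphthalene)
    n_samples_per_trial=10,
    n_trials=2,
    attachment_point_depth=3,
    sanitize=True,
)
```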
"},{"location":"api/safe.html#safe-tokenizer","title":"SAFE Tokenizer","text":""},{"location":"api/safe.html#safe.tokenizer.SAFESplitter","title":"SAFESplitter
","text":"Standard Splitter for SAFE string
Source code in safe/tokenizer.py
class SAFESplitter:\n \"\"\"Standard Splitter for SAFE string\"\"\"\n\n REGEX_PATTERN = r\"\"\"(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>>?|\\*|\\$|\\%[0-9]{2}|[0-9])\"\"\"\n\n name = \"safe\"\n\n def __init__(self, pattern: Optional[str] = None):\n # do not use this as raw strings (not r before)\n if pattern is None:\n pattern = self.REGEX_PATTERN\n self.regex = re.compile(pattern)\n\n def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n\n def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n\n def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n\n def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
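A minimal tokenization sketch with the splitter; the input string is illustrative, and the regex-based split is lossless, so `detokenize` recovers the original string.
```python
from safe.tokenizer import SAFESplitter

splitter = SAFESplitter()

tokens = splitter.tokenize("c1ccccc1C(=O)O")   # illustrative SAFE/SMILES-like string
print(tokens)                                  # ['c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'C', '(', '=', 'O', ')', 'O']
print(splitter.detokenize(tokens))             # 'c1ccccc1C(=O)O'
```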
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.detokenize","title":"detokenize(chars)
","text":"Detokenize SAFE notation
Source code in safe/tokenizer.py
def detokenize(self, chars):\n \"\"\"Detokenize SAFE notation\"\"\"\n if isinstance(chars, str):\n chars = chars.split(\" \")\n return \"\".join([x.strip() for x in chars])\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.pre_tokenize","title":"pre_tokenize(pretok)
","text":"Pretokenize using an input pretokenizer object from the tokenizer library
Source code in safe/tokenizer.py
def pre_tokenize(self, pretok):\n \"\"\"Pretokenize using an input pretokenizer object from the tokenizer library\"\"\"\n pretok.split(self.split)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.split","title":"split(n, normalized)
","text":"Perform splitting for pretokenization
Source code in safe/tokenizer.py
def split(self, n, normalized):\n \"\"\"Perform splitting for pretokenization\"\"\"\n return self.tokenize(normalized)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFESplitter.tokenize","title":"tokenize(line)
","text":"Tokenize a safe string into characters.
Source code in safe/tokenizer.py
def tokenize(self, line):\n \"\"\"Tokenize a safe string into characters.\"\"\"\n if isinstance(line, str):\n tokens = list(self.regex.findall(line))\n reconstruction = \"\".join(tokens)\n if line != reconstruction:\n logger.error(\n f\"Tokens different from sample:\\ntokens {reconstruction}\\nsample {line}.\"\n )\n raise ValueError(line)\n else:\n idxs = re.finditer(self.regex, str(line))\n tokens = [line[m.start(0) : m.end(0)] for m in idxs]\n return tokens\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer","title":"SAFETokenizer
","text":" Bases: PushToHubMixin
Class to initialize and train a tokenizer for SAFE strings. Once trained, you can convert the tokenizer to a Hugging Face PreTrainedTokenizerFast.
Source code in safe/tokenizer.py
class SAFETokenizer(PushToHubMixin):\n \"\"\"\n Class to initialize and train a tokenizer for SAFE string\n Once trained, you can use the converted version of the tokenizer to an HuggingFace PreTrainedTokenizerFast\n \"\"\"\n\n vocab_files_names: str = \"tokenizer.json\"\n\n def __init__(\n self,\n tokenizer_type: str = \"bpe\",\n splitter: Optional[str] = \"safe\",\n trainer_args=None,\n decoder_args=None,\n token_model_args=None,\n ):\n super().__init__()\n self.tokenizer_type = tokenizer_type\n self.trainer_args = trainer_args or {}\n self.decoder_args = decoder_args or {}\n self.token_model_args = token_model_args or {}\n if tokenizer_type is not None and tokenizer_type.startswith(\"bpe\"):\n self.model = BPE(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = BpeTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n else:\n self.model = WordLevel(unk_token=UNK_TOKEN, **self.token_model_args)\n self.trainer = WordLevelTrainer(special_tokens=SPECIAL_TOKENS, **self.trainer_args)\n\n self.tokenizer = Tokenizer(self.model)\n self.splitter = None\n if splitter == \"safe\":\n self.splitter = SAFESplitter()\n self.tokenizer.pre_tokenizer = PreTokenizer.custom(self.splitter)\n self.tokenizer.post_processor = TemplateProcessing(\n single=TEMPLATE_SINGLE,\n pair=TEMPLATE_PAIR,\n special_tokens=TEMPLATE_SPECIAL_TOKENS,\n )\n self.tokenizer.decoder = decoders.BPEDecoder(**self.decoder_args)\n self.tokenizer = self.set_special_tokens(self.tokenizer)\n\n @property\n def bos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.bos_token)\n\n @property\n def pad_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.pad_token)\n\n @property\n def eos_token_id(self):\n \"\"\"Get the bos token id\"\"\"\n return self.tokenizer.token_to_id(self.tokenizer.eos_token)\n\n @classmethod\n def set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n ):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n\n def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n\n def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n\n def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if 
use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n\n def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n\n def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n\n def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n\n def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n\n def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n\n def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n\n @classmethod\n def from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n\n @classmethod\n def load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has 
a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n\n def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n ) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n\n def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n\n def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n ) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. 
Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n\n @classmethod\n def from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n ):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived 
class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
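A minimal training sketch for a fresh tokenizer; the toy corpus and output path are placeholders, and training on a handful of strings is only meant to show the API surface (`train_from_iterator`, `save`, `get_pretrained`).
```python
from safe.tokenizer import SAFETokenizer

corpus = ["c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O", "CCO"]   # toy corpus, placeholder

tokenizer = SAFETokenizer(tokenizer_type="bpe", splitter="safe")
tokenizer.train_from_iterator(corpus)
tokenizer.save("my_safe_tokenizer.json")   # placeholder path

# convert to a Hugging Face fast tokenizer when training transformer models
fast_tok = tokenizer.get_pretrained()
```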
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.bos_token_id","title":"bos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.eos_token_id","title":"eos_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.pad_token_id","title":"pad_token_id
property
","text":"Get the bos token id
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__getstate__","title":"__getstate__()
","text":"Getting state to allow pickling
Source code in safe/tokenizer.py
def __getstate__(self):\n \"\"\"Getting state to allow pickling\"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n d = copy.deepcopy(self.__dict__)\n # copy back tokenizer level attribute\n d[\"tokenizer_attrs\"] = self.tokenizer.__dict__.copy()\n d[\"tokenizer\"].pre_tokenizer = Whitespace()\n return d\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__len__","title":"__len__()
","text":"Gets the count of tokens in vocab along with special tokens.
Source code in safe/tokenizer.py
def __len__(self):\n r\"\"\"\n Gets the count of tokens in vocab along with special tokens.\n \"\"\"\n return len(self.tokenizer.get_vocab().keys())\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.__setstate__","title":"__setstate__(d)
","text":"Setting state during reloading pickling
Source code in safe/tokenizer.py
def __setstate__(self, d):\n \"\"\"Setting state during reloading pickling\"\"\"\n use_pretokenizer = d.get(\"custom_pre_tokenizer\")\n if use_pretokenizer:\n d[\"tokenizer\"].pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n d[\"tokenizer\"].__dict__.update(d.get(\"tokenizer_attrs\", {}))\n self.__dict__.update(d)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.decode","title":"decode(ids, skip_special_tokens=True, ignore_stops=False, stop_token_ids=None)
","text":"Decodes a list of ids to molecular representation in the format in which this tokenizer was created.
Parameters:
- ids (list): list of IDs. Required.
- skip_special_tokens (bool): whether to skip all special tokens when encountering them. Default: True.
- ignore_stops (bool): whether to ignore the stop tokens, thus decoding until the end. Default: False.
- stop_token_ids (Optional[List[int]]): optional list of stop token ids to use. Default: None.
Returns:
- sequence (str): string representation of the molecule.
Source code in safe/tokenizer.py
def decode(\n self,\n ids: list,\n skip_special_tokens: bool = True,\n ignore_stops: bool = False,\n stop_token_ids: Optional[List[int]] = None,\n) -> str:\n r\"\"\"\n Decodes a list of ids to molecular representation in the format in which this tokenizer was created.\n\n Args:\n ids: list of IDs\n skip_special_tokens: whether to skip all special tokens when encountering them\n ignore_stops: whether to ignore the stop tokens, thus decoding till the end\n stop_token_ids: optional list of stop token ids to use\n\n Returns:\n sequence: str representation of molecule\n \"\"\"\n old_id_list = ids\n if not isinstance(ids[0], (list, np.ndarray)) and not torch.is_tensor(ids[0]):\n old_id_list = [ids]\n if not stop_token_ids:\n stop_token_ids = [self.tokenizer.token_to_id(self.tokenizer.eos_token)]\n\n new_ids_list = []\n for ids in old_id_list:\n new_ids = ids\n if not ignore_stops:\n new_ids = []\n # if first tokens are stop, we just remove it\n # this is because of bart essentially\n pos = 0\n if len(ids) > 1:\n while ids[pos] in stop_token_ids:\n pos += 1\n # we only ignore when there is a list of tokens\n ids = ids[pos:]\n for pos, id in enumerate(ids):\n if int(id) in stop_token_ids:\n break\n new_ids.append(id)\n new_ids_list.append(new_ids)\n if len(new_ids_list) == 1:\n return self.tokenizer.decode(\n list(new_ids_list[0]), skip_special_tokens=skip_special_tokens\n )\n return self.tokenizer.decode_batch(\n list(new_ids_list), skip_special_tokens=skip_special_tokens\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.encode","title":"encode(sample_str, ids_only=True, **kwargs)
","text":"Encodes a given molecule string once training is done
Parameters:
- sample_str (str): sample string of the molecule to encode. Required.
- ids_only (bool): whether to return only the ids or the encoding object. Default: True.
Returns:
- object (list): encoded list of IDs.
Source code in safe/tokenizer.py
def encode(self, sample_str: str, ids_only: bool = True, **kwargs) -> list:\n r\"\"\"\n Encodes a given molecule string once training is done\n\n Args:\n sample_str: Sample string to encode molecule\n ids_only: whether to return only the ids or the encoding objet\n\n Returns:\n object: Returns encoded list of IDs\n \"\"\"\n if isinstance(sample_str, str):\n enc = self.tokenizer.encode(sample_str, **kwargs)\n if ids_only:\n return enc.ids\n return enc\n\n encs = self.tokenizer.encode_batch(sample_str, **kwargs)\n if ids_only:\n return [enc.ids for enc in encs]\n return encs\n
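A minimal encode/decode round-trip sketch, assuming the pretrained tokenizer from the hub (see the `from_pretrained` examples below); the input SMILES is illustrative.
```python
from safe.tokenizer import SAFETokenizer

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")

ids = tokenizer.encode("c1ccccc1", ids_only=True)           # list of token ids
decoded = tokenizer.decode(ids, skip_special_tokens=True)   # typically recovers the input string
```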
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_dict","title":"from_dict(data)
classmethod
","text":"Load tokenizer from dict
Parameters:
- data (dict): dictionary containing the tokenizer info. Required.
Source code in safe/tokenizer.py
@classmethod\ndef from_dict(cls, data: dict):\n \"\"\"Load tokenizer from dict\n\n Args:\n data: dictionary containing the tokenizer info\n \"\"\"\n tokenizer_type = data.pop(\"tokenizer_type\", \"safe\")\n tokenizer_attrs = data.pop(\"tokenizer_attrs\", None)\n custom_pre_tokenizer = data.pop(\"custom_pre_tokenizer\", False)\n tokenizer = Tokenizer.from_str(json.dumps(data))\n if custom_pre_tokenizer:\n tokenizer.pre_tokenizer = PreTokenizer.custom(SAFESplitter())\n mol_tokenizer = cls(tokenizer_type)\n mol_tokenizer.tokenizer = mol_tokenizer.set_special_tokens(tokenizer)\n if tokenizer_attrs and isinstance(tokenizer_attrs, dict):\n mol_tokenizer.tokenizer.__dict__.update(tokenizer_attrs)\n return mol_tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.from_pretrained","title":"from_pretrained(pretrained_model_name_or_path, cache_dir=None, force_download=False, local_files_only=False, token=None, return_fast_tokenizer=False, proxies=None, **kwargs)
classmethod
","text":"Instantiate a [~tokenization_utils_base.PreTrainedTokenizerBase
] (or a derived class) from a predefined tokenizer.
Parameters:
- pretrained_model_name_or_path (Union[str, PathLike]): can be either:
  - a string, the model id of a predefined tokenizer hosted inside a model repo on huggingface.co, located at the root level, like bert-base-uncased, or namespaced under a user or organization name, like dbmdz/bert-base-german-cased;
  - a path to a directory containing the vocabulary files required by the tokenizer, for instance saved using the [~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained] method, e.g. ./my_model_directory/;
  - (deprecated, not applicable to all derived classes) a path or URL to a single saved vocabulary file (if and only if the tokenizer only requires a single vocabulary file like BERT or XLNet), e.g. ./my_model_directory/vocab.txt.
  Required.
- cache_dir (Optional[Union[str, PathLike]]): path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. Default: None.
- force_download (bool): whether or not to force the (re-)download of the vocabulary files and override the cached versions if they exist. Default: False.
- proxies (Optional[Dict[str, str]]): a dictionary of proxy servers to use by protocol or endpoint, e.g. {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. Default: None.
- token (Optional[Union[str, bool]]): the token to use as HTTP bearer authorization for remote files. If True, will use the token generated when running huggingface-cli login (stored in ~/.huggingface). Default: None.
- local_files_only (bool): whether or not to only rely on local files and not attempt to download any files. Default: False.
- return_fast_tokenizer (Optional[bool]): whether to return a fast tokenizer or not. Default: False.
Examples:
# We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n
Source code in safe/tokenizer.py
@classmethod\ndef from_pretrained(\n cls,\n pretrained_model_name_or_path: Union[str, os.PathLike],\n cache_dir: Optional[Union[str, os.PathLike]] = None,\n force_download: bool = False,\n local_files_only: bool = False,\n token: Optional[Union[str, bool]] = None,\n return_fast_tokenizer: Optional[bool] = False,\n proxies: Optional[Dict[str, str]] = None,\n **kwargs,\n):\n r\"\"\"\n Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined\n tokenizer.\n\n Args:\n pretrained_model_name_or_path:\n Can be either:\n\n - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.\n Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a\n user or organization name, like `dbmdz/bert-base-german-cased`.\n - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved\n using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,\n `./my_model_directory/`.\n - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary\n file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,\n `./my_model_directory/vocab.txt`.\n cache_dir: Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the\n standard cache should not be used.\n force_download: Whether or not to force the (re-)download the vocabulary files and override the cached versions if they exist.\n proxies: A dictionary of proxy servers to use by protocol or endpoint, e.g.,\n `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.\n token: The token to use as HTTP bearer authorization for remote files.\n If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).\n local_files_only: Whether or not to only rely on local files and not to attempt to download any files.\n return_fast_tokenizer: Whether to return fast tokenizer or not.\n\n Examples:\n ``` py\n # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer\n # Download vocabulary from huggingface.co and cache.\n tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n\n # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)\n tokenizer = SAFETokenizer.from_pretrained(\"./test/saved_model/\")\n\n # If the tokenizer uses a single vocabulary file, you can point directly to this file\n tokenizer = BertTokenizer.from_pretrained(\"./test/saved_model/tokenizer.json\")\n ```\n \"\"\"\n resume_download = kwargs.pop(\"resume_download\", False)\n use_auth_token = kwargs.pop(\"use_auth_token\", None)\n subfolder = kwargs.pop(\"subfolder\", None)\n from_pipeline = kwargs.pop(\"_from_pipeline\", None)\n from_auto_class = kwargs.pop(\"_from_auto\", False)\n commit_hash = kwargs.pop(\"_commit_hash\", None)\n\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n user_agent = {\n \"file_type\": \"tokenizer\",\n \"from_auto_class\": from_auto_class,\n \"is_fast\": \"Fast\" in cls.__name__,\n }\n if from_pipeline is not None:\n user_agent[\"using_pipeline\"] = from_pipeline\n\n if is_offline_mode() and not local_files_only:\n logger.info(\"Offline mode: forcing local_files_only=True\")\n local_files_only = True\n\n pretrained_model_name_or_path = str(pretrained_model_name_or_path)\n\n os.path.isdir(pretrained_model_name_or_path)\n file_path = None\n if os.path.isfile(pretrained_model_name_or_path):\n file_path = pretrained_model_name_or_path\n elif is_remote_url(pretrained_model_name_or_path):\n file_path = download_url(pretrained_model_name_or_path, proxies=proxies)\n\n else:\n # EN: remove this when transformers package has uniform API\n cached_file_extra_kwargs = {\"use_auth_token\": token}\n if packaging.version.parse(transformers_version) >= packaging.version.parse(\"5.0\"):\n cached_file_extra_kwargs = {\"token\": token}\n # Try to get the tokenizer config to see if there are versioned tokenizer files.\n resolved_vocab_files = cached_file(\n pretrained_model_name_or_path,\n cls.vocab_files_names,\n cache_dir=cache_dir,\n force_download=force_download,\n resume_download=resume_download,\n proxies=proxies,\n local_files_only=local_files_only,\n subfolder=subfolder,\n user_agent=user_agent,\n _raise_exceptions_for_missing_entries=False,\n _raise_exceptions_for_connection_errors=False,\n _commit_hash=commit_hash,\n **cached_file_extra_kwargs,\n )\n commit_hash = extract_commit_hash(resolved_vocab_files, commit_hash)\n file_path = resolved_vocab_files\n\n if not os.path.isfile(file_path):\n logger.info(\n f\"Can't load the following file: {file_path} required for loading the tokenizer\"\n )\n\n tokenizer = cls.load(file_path)\n if return_fast_tokenizer:\n return tokenizer.get_pretrained()\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.get_pretrained","title":"get_pretrained(**kwargs)
","text":"Get a pretrained tokenizer from this tokenizer
Returns:
Type Description PreTrainedTokenizerFast
Returns a pre-trained fast tokenizer for Hugging Face models.
Source code in safe/tokenizer.py
def get_pretrained(self, **kwargs) -> PreTrainedTokenizerFast:\n r\"\"\"\n Get a pretrained tokenizer from this tokenizer\n\n Returns:\n Returns pre-trained fast tokenizer for hugging face models.\n \"\"\"\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n tk = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer)\n tk._tokenizer.pre_tokenizer = self.tokenizer.pre_tokenizer\n # now we need to add special_tokens\n tk.add_special_tokens(\n {\n \"cls_token\": self.tokenizer.cls_token,\n \"bos_token\": self.tokenizer.bos_token,\n \"eos_token\": self.tokenizer.eos_token,\n \"mask_token\": self.tokenizer.mask_token,\n \"pad_token\": self.tokenizer.pad_token,\n \"unk_token\": self.tokenizer.unk_token,\n \"sep_token\": self.tokenizer.sep_token,\n }\n )\n if (\n tk.model_max_length is None\n or tk.model_max_length > 1e8\n and hasattr(self.tokenizer, \"model_max_length\")\n ):\n tk.model_max_length = self.tokenizer.model_max_length\n setattr(\n tk,\n \"model_max_length\",\n getattr(self.tokenizer, \"model_max_length\"),\n )\n return tk\n
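A brief usage sketch (not part of the upstream docstring): `get_pretrained` wraps the underlying tokenizer into a `PreTrainedTokenizerFast` with the special tokens copied over, so it can be used like any Hugging Face tokenizer. The checkpoint name below is the one shown in the `from_pretrained` example above.

```python
# Illustrative sketch; assumes the "datamol-io/safe-gpt" checkpoint is reachable.
from safe.tokenizer import SAFETokenizer

safe_tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
fast_tokenizer = safe_tokenizer.get_pretrained()  # transformers.PreTrainedTokenizerFast

ids = fast_tokenizer("c1ccccc1").input_ids  # encode a simple SMILES/SAFE string
print(fast_tokenizer.convert_ids_to_tokens(ids))
```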
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.load","title":"load(file_name)
classmethod
","text":"Load the current tokenizer from file
Source code in safe/tokenizer.py
@classmethod\ndef load(cls, file_name):\n \"\"\"Load the current tokenizer from file\"\"\"\n with fsspec.open(file_name, \"r\") as OUT:\n data_str = OUT.read()\n data = json.loads(data_str)\n # EN: the rust json parser of tokenizers has a predefined structure\n # the next two lines are important\n return cls.from_dict(data)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.push_to_hub","title":"push_to_hub(repo_id, use_temp_dir=None, commit_message=None, private=None, token=None, max_shard_size='10GB', create_pr=False, safe_serialization=False, **deprecated_kwargs)
","text":"Upload the tokenizer to the \ud83e\udd17 Model Hub.
Parameters:
Name Type Description Defaultrepo_id
str
The name of the repository you want to push your {object} to. It should contain your organization name when pushing to a given organization.
requireduse_temp_dir
Optional[bool]
Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub. Will default to True
if there is no directory named like repo_id
, False
otherwise.
None
commit_message
Optional[str]
Message to commit while pushing. Will default to \"Upload {object}\"
.
None
private
Optional[bool]
Whether or not the repository created should be private.
None
token
Optional[Union[bool, str]]
The token to use as HTTP bearer authorization for remote files. If True
, will use the token generated when running huggingface-cli login
(stored in ~/.huggingface
). Will default to True
if repo_url
is not specified.
None
max_shard_size
Optional[Union[int, str]]
Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size lower than this size. If expressed as a string, needs to be digits followed by a unit (like \"5MB\"
).
'10GB'
create_pr
bool
Whether or not to create a PR with the uploaded files or directly commit.
False
safe_serialization
bool
Whether or not to convert the model weights in safetensors format for safer serialization.
False
Source code in safe/tokenizer.py
def push_to_hub(\n self,\n repo_id: str,\n use_temp_dir: Optional[bool] = None,\n commit_message: Optional[str] = None,\n private: Optional[bool] = None,\n token: Optional[Union[bool, str]] = None,\n max_shard_size: Optional[Union[int, str]] = \"10GB\",\n create_pr: bool = False,\n safe_serialization: bool = False,\n **deprecated_kwargs,\n) -> str:\n \"\"\"\n Upload the tokenizer to the \ud83e\udd17 Model Hub.\n\n Args:\n repo_id: The name of the repository you want to push your {object} to. It should contain your organization name\n when pushing to a given organization.\n use_temp_dir: Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.\n Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.\n commit_message: Message to commit while pushing. Will default to `\"Upload {object}\"`.\n private: Whether or not the repository created should be private.\n token: The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated\n when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`\n is not specified.\n max_shard_size: Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard\n will then be each of size lower than this size. If expressed as a string, needs to be digits followed\n by a unit (like `\"5MB\"`).\n create_pr: Whether or not to create a PR with the uploaded files or directly commit.\n safe_serialization: Whether or not to convert the model weights in safetensors format for safer serialization.\n \"\"\"\n use_auth_token = deprecated_kwargs.pop(\"use_auth_token\", None)\n if use_auth_token is not None:\n warnings.warn(\n \"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\",\n FutureWarning,\n )\n if token is not None:\n raise ValueError(\n \"`token` and `use_auth_token` are both specified. Please set only the argument `token`.\"\n )\n token = use_auth_token\n\n repo_path_or_name = deprecated_kwargs.pop(\"repo_path_or_name\", None)\n if repo_path_or_name is not None:\n # Should use `repo_id` instead of `repo_path_or_name`. When using `repo_path_or_name`, we try to infer\n # repo_id from the folder path, if it exists.\n warnings.warn(\n \"The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use \"\n \"`repo_id` instead.\",\n FutureWarning,\n )\n if repo_id is not None:\n raise ValueError(\n \"`repo_id` and `repo_path_or_name` are both specified. 
Please set only the argument `repo_id`.\"\n )\n if os.path.isdir(repo_path_or_name):\n # repo_path: infer repo_id from the path\n repo_id = repo_id.split(os.path.sep)[-1]\n working_dir = repo_id\n else:\n # repo_name: use it as repo_id\n repo_id = repo_path_or_name\n working_dir = repo_id.split(\"/\")[-1]\n else:\n # Repo_id is passed correctly: infer working_dir from it\n working_dir = repo_id.split(\"/\")[-1]\n\n # Deprecation warning will be sent after for repo_url and organization\n repo_url = deprecated_kwargs.pop(\"repo_url\", None)\n organization = deprecated_kwargs.pop(\"organization\", None)\n\n repo_id = self._create_repo(\n repo_id, private, token, repo_url=repo_url, organization=organization\n )\n\n if use_temp_dir is None:\n use_temp_dir = not os.path.isdir(working_dir)\n\n with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir:\n files_timestamps = self._get_files_timestamps(work_dir)\n\n # Save all files.\n with contextlib.suppress(Exception):\n self.save_pretrained(\n work_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization\n )\n\n self.save(os.path.join(work_dir, self.vocab_files_names))\n\n return self._upload_modified_files(\n work_dir,\n repo_id,\n files_timestamps,\n commit_message=commit_message,\n token=token,\n create_pr=create_pr,\n )\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save","title":"save(file_name=None)
","text":"Saves the :class:~tokenizers.Tokenizer
to the file at the given path.
Parameters:
Name Type Description Defaultfile_name
str
File where to save tokenizer
None
Source code in safe/tokenizer.py
def save(self, file_name=None):\n r\"\"\"\n Saves the :class:`~tokenizers.Tokenizer` to the file at the given path.\n\n Args:\n file_name (str, optional): File where to save tokenizer\n \"\"\"\n # EN: whole logic here assumes noone is going to mess with the special token\n tk_data = self.to_dict()\n with fsspec.open(file_name, \"w\", encoding=\"utf-8\") as OUT:\n out_str = json.dumps(tk_data, ensure_ascii=False)\n OUT.write(out_str)\n
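For context, a minimal round-trip sketch combining `save` with the `load` classmethod shown earlier (the file name below is arbitrary):

```python
# Serialize the tokenizer to a JSON file, then reload it.
from safe.tokenizer import SAFETokenizer

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
tokenizer.save("safe_tokenizer.json")              # writes the to_dict() payload as JSON
reloaded = SAFETokenizer.load("safe_tokenizer.json")
```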
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.save_pretrained","title":"save_pretrained(*args, **kwargs)
","text":"Save pretrained tokenizer
Source code in safe/tokenizer.py
def save_pretrained(self, *args, **kwargs):\n \"\"\"Save pretrained tokenizer\"\"\"\n self.tokenizer.save_pretrained(*args, **kwargs)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.set_special_tokens","title":"set_special_tokens(tokenizer, bos_token=CLS_TOKEN, eos_token=SEP_TOKEN)
classmethod
","text":"Set special tokens for a tokenizer
Parameters:
Name Type Description Defaulttokenizer
Tokenizer
tokenizer for which special tokens will be set
requiredbos_token
str
Optional bos token to use
CLS_TOKEN
eos_token
str
Optional eos token to use
SEP_TOKEN
Source code in safe/tokenizer.py
@classmethod\ndef set_special_tokens(\n cls,\n tokenizer: Tokenizer,\n bos_token: str = CLS_TOKEN,\n eos_token: str = SEP_TOKEN,\n):\n \"\"\"Set special tokens for a tokenizer\n\n Args:\n tokenizer: tokenizer for which special tokens will be set\n bos_token: Optional bos token to use\n eos_token: Optional eos token to use\n \"\"\"\n tokenizer.pad_token = PADDING_TOKEN\n tokenizer.cls_token = CLS_TOKEN\n tokenizer.sep_token = SEP_TOKEN\n tokenizer.mask_token = MASK_TOKEN\n tokenizer.unk_token = UNK_TOKEN\n tokenizer.eos_token = eos_token\n tokenizer.bos_token = bos_token\n\n if isinstance(tokenizer, Tokenizer):\n tokenizer.add_special_tokens(\n [\n PADDING_TOKEN,\n CLS_TOKEN,\n SEP_TOKEN,\n MASK_TOKEN,\n UNK_TOKEN,\n eos_token,\n bos_token,\n ]\n )\n return tokenizer\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.to_dict","title":"to_dict(**kwargs)
","text":"Convert tokenizer to dict
Source code insafe/tokenizer.py
def to_dict(self, **kwargs):\n \"\"\"Convert tokenizer to dict\"\"\"\n # we need to do this because HuggingFace tokenizers doesnt save with custom pre-tokenizers\n if self.splitter is None:\n tk_data = json.loads(self.tokenizer.to_str())\n else:\n with attr_as(self.tokenizer, \"pre_tokenizer\", Whitespace()):\n # temporary replace pre tokenizer with whitespace\n tk_data = json.loads(self.tokenizer.to_str())\n tk_data[\"custom_pre_tokenizer\"] = True\n tk_data[\"tokenizer_type\"] = self.tokenizer_type\n tk_data[\"tokenizer_attrs\"] = self.tokenizer.__dict__\n return tk_data\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train","title":"train(files, **kwargs)
","text":"This is to train a new tokenizer from either a list of file or some input data
Args files (str): file in which your molecules are separated by new line kwargs (dict): optional args for the tokenizer train
safe/tokenizer.py
def train(self, files: Optional[List[str]], **kwargs):\n r\"\"\"\n This is to train a new tokenizer from either a list of file or some input data\n\n Args\n files (str): file in which your molecules are separated by new line\n kwargs (dict): optional args for the tokenizer `train`\n \"\"\"\n if isinstance(files, str):\n files = [files]\n self.tokenizer.train(files=files, trainer=self.trainer)\n
"},{"location":"api/safe.html#safe.tokenizer.SAFETokenizer.train_from_iterator","title":"train_from_iterator(data, **kwargs)
","text":"Train the Tokenizer using the provided iterator.
You can provide anything that is a Python Iterator:
* A list of sequences (List[str])
* A generator that yields str or List[str]
* A NumPy array of strings
Parameters:
Name Type Description Defaultdata
Iterator
data iterator
required**kwargs
Any
additional keyword argument for the tokenizer train_from_iterator
{}
Source code in safe/tokenizer.py
def train_from_iterator(self, data: Iterator, **kwargs: Any):\n \"\"\"Train the Tokenizer using the provided iterator.\n\n You can provide anything that is a Python Iterator\n * A list of sequences :obj:`List[str]`\n * A generator that yields :obj:`str` or :obj:`List[str]`\n * A Numpy array of strings\n\n Args:\n data: data iterator\n **kwargs: additional keyword argument for the tokenizer `train_from_iterator`\n \"\"\"\n self.tokenizer.train_from_iterator(data, trainer=self.trainer, **kwargs)\n
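As an illustration only: the `SAFETokenizer` constructor is not documented in this section, so the default construction below is an assumption; adapt it to the actual signature. Training from an in-memory list of strings could look like this:

```python
from safe.tokenizer import SAFETokenizer

corpus = [
    "c1ccccc1",
    "CC(=O)Nc1ccc(O)cc1",
    "CC(C)Cc1ccc(cc1)C(C)C(=O)O",
]  # toy corpus; in practice this would be a large iterator of SAFE strings

tokenizer = SAFETokenizer()  # assumed default constructor
tokenizer.train_from_iterator(corpus)
print(list(tokenizer.to_dict().keys()))  # inspect the serializable payload
```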
"},{"location":"api/safe.html#utils","title":"Utils","text":""},{"location":"api/safe.html#safe.utils.MolSlicer","title":"MolSlicer
","text":"Slice a molecule into head-linker-tail
Source code in safe/utils.py
class MolSlicer:\n \"\"\"Slice a molecule into head-linker-tail\"\"\"\n\n BOND_SPLITTERS = [\n # two atoms connected by a non ring single bond, one of each is not in a ring and at least two heavy neighbor\n \"[R:1]-&!@[!R;!D1:2]\",\n # two atoms in different rings linked by a non-ring single bond\n \"[R:1]-&!@[R:2]\",\n ]\n _BOND_BUFFER = 1 # buffer around substructure match size.\n MAX_CUTS = 2 # maximum number of cuts. Here we need two cuts for head-linker-tail.\n\n _MERGING_RXN = dm.reactions.rxn_from_smarts(\n \"[#0][*:1].[#0][*:4].([#0][*:2].[#0][*:3])>>([*:1][*:2].[*:3][*:4])\"\n )\n\n def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n ):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n\n def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n\n def _bond_selection_from_max_cuts(self, bond_list: List[int], dist_mat: np.ndarray):\n \"\"\"Select bonds based on maximum number of cuts allowed\"\"\"\n # for now we are just implementing to 2 max cuts algorithms\n if self.MAX_CUTS != 2:\n raise ValueError(f\"Only MAX_CUTS=2 is supported, got {self.MAX_CUTS}\")\n\n bond_pdist = np.full((len(bond_list), len(bond_list)), -1)\n for i in range(len(bond_list)):\n for j in range(i, len(bond_list)):\n # we get the minimum topological distance between bond to cut\n bond_pdist[i, j] = bond_pdist[j, i] = min(\n [dist_mat[a1, a2] for a1, a2 in itertools.product(bond_list[i], bond_list[j])]\n )\n\n masked_bond_pdist = np.ma.masked_less_equal(bond_pdist, self.min_linker_size)\n\n if self.shortest_linker:\n return np.unravel_index(np.ma.argmin(masked_bond_pdist), bond_pdist.shape)\n return np.unravel_index(np.ma.argmax(masked_bond_pdist), bond_pdist.shape)\n\n def _get_bonds_to_cut(self, mol: dm.Mol):\n \"\"\"Get possible bond to cuts\n\n Args:\n mol: input molecule\n \"\"\"\n # use this if you want to enumerate yourself the possible cuts\n\n ring_systems = self.get_ring_system(mol)\n candidate_bonds = []\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n\n for query in self.bond_splitters:\n bonds = mol.GetSubstructMatches(query, uniquify=True)\n cur_unique_bonds = [set(cbond) for cbond in candidate_bonds]\n # do not accept bonds part of the same ring system or already known\n for b in bonds:\n bond_id = mol.GetBondBetweenAtoms(*b).GetIdx()\n bond_cut = Chem.GetMolFrags(\n Chem.FragmentOnBonds(mol, [bond_id], addDummies=False), 
asMols=True\n )\n can_add = not self.require_ring_system or all(\n len(frag.GetAtomsMatchingQuery(ring_query)) > 0 for frag in bond_cut\n )\n if can_add and not (\n set(b) in cur_unique_bonds or any(x.issuperset(set(b)) for x in ring_systems)\n ):\n candidate_bonds.append(b)\n return candidate_bonds\n\n def _fragment_mol(self, mol: dm.Mol, bonds: List[dm.Bond]):\n \"\"\"Fragment molecules on bonds and return head, linker, tail combination\n\n Args:\n mol: input molecule\n bonds: list of bonds to cut\n \"\"\"\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in bonds])\n _frags = list(Chem.GetMolFrags(tmp, asMols=True))\n # linker is the one with 2 dummy atoms\n linker_pos = 0\n for pos, _frag in enumerate(_frags):\n if sum([at.GetSymbol() == \"*\" for at in _frag.GetAtoms()]) == 2:\n linker_pos = pos\n break\n linker = _frags.pop(linker_pos)\n head, tail = _frags\n return (head, linker, tail)\n\n def _compute_linker_score(self, linker: dm.Mol):\n \"\"\"Compute the score of a linker to help select between linkers\"\"\"\n\n # we need to take into account\n # case where we require the linker to have a ring system\n # case where we want the linker to be longuest or shortest\n\n # find shortest path\n attach1, attach2, *_ = [at.GetIdx() for at in linker.GetAtoms() if at.GetSymbol() == \"*\"]\n score = len(Chem.rdmolops.GetShortestPath(linker, attach1, attach2))\n ring_query = Chem.rdqueries.IsInRingQueryAtom()\n linker_ring_count = len(linker.GetAtomsMatchingQuery(ring_query))\n if self.require_ring_system:\n score *= int(linker_ring_count > 0)\n if score == 0:\n return float(\"inf\")\n if not self.shortest_linker:\n score = 1 / score\n return score\n\n def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. 
It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n\n @classmethod\n def link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n ):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__call__","title":"__call__(mol, expected_head=None)
","text":"Perform slicing of the input molecule
Parameters:
Name Type Description Defaultmol
Union[Mol, str]
input molecule
requiredexpected_head
Union[Mol, str]
substructure that should be part of the head. The small fragment containing this substructure would be kept as head
None
Source code in safe/utils.py
def __call__(self, mol: Union[dm.Mol, str], expected_head: Union[dm.Mol, str] = None):\n \"\"\"Perform slicing of the input molecule\n\n Args:\n mol: input molecule\n expected_head: substructure that should be part of the head.\n The small fragment containing this substructure would be kept as head\n \"\"\"\n\n mol = dm.to_mol(mol)\n # remove salt and solution\n mol = dm.keep_largest_fragment(mol)\n Chem.rdDepictor.Compute2DCoords(mol)\n dist_mat = Chem.rdmolops.GetDistanceMatrix(mol)\n\n if expected_head is not None:\n if isinstance(expected_head, str):\n expected_head = dm.to_mol(expected_head)\n if not mol.HasSubstructMatch(expected_head):\n if self.verbose:\n logger.info(\n \"Expected head was provided, but does not match molecules. It will be ignored\"\n )\n expected_head = None\n\n candidate_bonds = self._get_bonds_to_cut(mol)\n\n # we have all the candidate bonds we can cut\n # now we need to pick the most plausible bonds\n selected_bonds = [mol.GetBondBetweenAtoms(a1, a2) for (a1, a2) in candidate_bonds]\n\n # CASE 1: no bond to cut ==> only head\n if len(selected_bonds) == 0:\n return (mol, None, None)\n\n # CASE 2: only one bond ==> linker is empty\n if len(selected_bonds) == 1:\n # there is not linker\n tmp = Chem.rdmolops.FragmentOnBonds(mol, [b.GetIdx() for b in selected_bonds])\n head, tail = Chem.GetMolFrags(tmp, asMols=True)\n return (head, None, tail)\n\n # CASE 3a: we select the most plausible bond to cut on ourselves\n if expected_head is None:\n choice = self._bond_selection_from_max_cuts(candidate_bonds, dist_mat)\n selected_bonds = [selected_bonds[c] for c in choice]\n return self._fragment_mol(mol, selected_bonds)\n\n # CASE 3b: slightly more complex case where we want the head to be the smallest graph containing the\n # provided substructure\n bond_combination = list(itertools.combinations(selected_bonds, self.MAX_CUTS))\n bond_score = float(\"inf\")\n linker_score = float(\"inf\")\n head, linker, tail = (None, None, None)\n for split_bonds in bond_combination:\n cur_head, cur_linker, cur_tail = self._fragment_mol(mol, split_bonds)\n # head can also be tail\n head_match = cur_head.GetSubstructMatch(expected_head)\n tail_match = cur_tail.GetSubstructMatch(expected_head)\n if not head_match and not tail_match:\n continue\n if not head_match and tail_match:\n cur_head, cur_tail = cur_tail, cur_head\n cur_bond_score = cur_head.GetNumHeavyAtoms()\n # compute linker score\n cur_linker_score = self._compute_linker_score(cur_linker)\n if (cur_bond_score < bond_score) or (\n cur_bond_score < self._BOND_BUFFER + bond_score and cur_linker_score < linker_score\n ):\n head, linker, tail = cur_head, cur_linker, cur_tail\n bond_score = cur_bond_score\n linker_score = cur_linker_score\n\n return (head, linker, tail)\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.__init__","title":"__init__(shortest_linker=False, min_linker_size=0, require_ring_system=True, verbose=False)
","text":"Constructor of bond slicer.
Parameters:
Name Type Description Defaultshortest_linker
bool
whether to consider the longest or shortest linker. Does not have any effect when an expected_head group is provided during splitting
False
min_linker_size
int
minimum linker size
0
require_ring_system
bool
whether all fragments need to have a ring system
True
verbose
bool
whether to allow verbosity in logging
False
Source code in safe/utils.py
def __init__(\n self,\n shortest_linker: bool = False,\n min_linker_size: int = 0,\n require_ring_system: bool = True,\n verbose: bool = False,\n):\n \"\"\"\n Constructor of bond slicer.\n\n Args:\n shortest_linker: whether to consider longuest or shortest linker.\n Does not have any effect when expected_head group is provided during splitting\n min_linker_size: minimum linker size\n require_ring_system: whether all fragment needs to have a ring system\n verbose: whether to allow verbosity in logging\n \"\"\"\n\n self.bond_splitters = [dm.from_smarts(x) for x in self.BOND_SPLITTERS]\n self.shortest_linker = shortest_linker\n self.min_linker_size = min_linker_size\n self.require_ring_system = require_ring_system\n self.verbose = verbose\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.get_ring_system","title":"get_ring_system(mol)
","text":"Get the list of ring system from a molecule
Parameters:
Name Type Description Defaultmol
Mol
input molecule for which we are computing the ring system
required Source code in safe/utils.py
def get_ring_system(self, mol: dm.Mol):\n \"\"\"Get the list of ring system from a molecule\n\n Args:\n mol: input molecule for which we are computing the ring system\n \"\"\"\n mol.UpdatePropertyCache()\n ri = mol.GetRingInfo()\n systems = []\n for ring in ri.AtomRings():\n ring_atoms = set(ring)\n cur_system = [] # keep a track of ring system\n for system in systems:\n if len(ring_atoms.intersection(system)) > 0:\n ring_atoms = ring_atoms.union(system) # merge ring system that overlap\n else:\n cur_system.append(system)\n cur_system.append(ring_atoms)\n systems = cur_system\n return systems\n
"},{"location":"api/safe.html#safe.utils.MolSlicer.link_fragments","title":"link_fragments(linker, head, tail)
classmethod
","text":"Link fragments together using the provided linker
Parameters:
Name Type Description Defaultlinker
Union[Mol, str]
linker to use
requiredhead
Union[Mol, str]
head fragment
requiredtail
Union[Mol, str]
tail fragment
required Source code in safe/utils.py
@classmethod\ndef link_fragments(\n cls, linker: Union[dm.Mol, str], head: Union[dm.Mol, str], tail: Union[dm.Mol, str]\n):\n \"\"\"Link fragments together using the provided linker\n\n Args:\n linker: linker to use\n head: head fragment\n tail: tail fragment\n \"\"\"\n if isinstance(linker, dm.Mol):\n linker = dm.to_smiles(linker)\n linker = standardize_attach(linker)\n reactants = [dm.to_mol(head), dm.to_mol(tail), dm.to_mol(linker)]\n return dm.reactions.apply_reaction(\n cls._MERGING_RXN, reactants, as_smiles=True, sanitize=True, product_index=0\n )\n
"},{"location":"api/safe.html#safe.utils.attr_as","title":"attr_as(obj, field, value)
","text":"Temporary replace the value of an object
Parameters:
Name Type Description Defaultobj
Any
object to temporarily patch
requiredfield
str
name of the key to change
requiredvalue
Any
value to temporarily set the key to
required Source code in safe/utils.py
@contextmanager\ndef attr_as(obj: Any, field: str, value: Any):\n \"\"\"Temporary replace the value of an object\n\n Args:\n obj: object to temporary patch\n field: name of the key to change\n value: value of key to be temporary changed\n \"\"\"\n old_value = getattr(obj, field, None)\n setattr(obj, field, value)\n yield\n with suppress(TypeError):\n setattr(obj, field, old_value)\n
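A short self-contained sketch of the context manager; this mirrors how the tokenizer code above temporarily swaps in a Whitespace pre-tokenizer.

```python
from safe.utils import attr_as

class Settings:
    verbose = False

settings = Settings()
with attr_as(settings, "verbose", True):
    print(settings.verbose)  # True inside the context
print(settings.verbose)      # restored to False once the context exits
```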
"},{"location":"api/safe.html#safe.utils.compute_side_chains","title":"compute_side_chains(mol, core, label_by_index=False)
","text":"Compute the side chain of a molecule given a core
Finding the side chains
The algorithm to find the side chains from a core assumes that the core we get as input has attachment points. Those attachment points are never considered as part of the query; rather, they are used to define the attachment points on the side chains. Removing the attachment points from the core is exactly the same as keeping them.
mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\ncore0 = \"CC1(C)CN2C(CC2=O)S1\"\ncore1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\ncore2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\nside_chain = compute_side_chain(core=core0, mol=mol)\ndm.to_image([side_chain, core0, mol])\n
In the example above, core0 and core1 are equivalent for the molecule mol, but core2 is not.
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredcore
Mol
core to use for deriving the side chains
required Source code in safe/utils.py
def compute_side_chains(mol: dm.Mol, core: dm.Mol, label_by_index: bool = False):\n \"\"\"Compute the side chain of a molecule given a core\n\n !!! note \"Finding the side chains\"\n The algorithm to find the side chains from core assumes that the core we get as input has attachment points.\n Those attachment points are never considered as part of the query, rather they are used to define the attachment points\n on the side chains. Removing the attachment points from the core is exactly the same as keeping them.\n\n ```python\n mol = \"CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O\"\n core0 = \"CC1(C)CN2C(CC2=O)S1\"\n core1 = \"CC1(C)SC2C(-*)C(=O)N2C1-*\"\n core2 = \"CC1N2C(SC1(C)C)C(N)C2=O\"\n side_chain = compute_side_chain(core=core0, mol=mol)\n dm.to_image([side_chain, core0, mol])\n ```\n Therefore on the above, core0 and core1 are equivalent for the molecule `mol`, but core2 is not.\n\n Args:\n mol: molecule to split\n core: core to use for deriving the side chains\n \"\"\"\n\n if isinstance(mol, str):\n mol = dm.to_mol(mol)\n if isinstance(core, str):\n core = dm.to_mol(core)\n core_query_param = AdjustQueryParameters()\n core_query_param.makeDummiesQueries = True\n core_query_param.adjustDegree = False\n core_query_param.aromatizeIfPossible = True\n core_query_param.makeBondsGeneric = False\n core_query = AdjustQueryProperties(core, core_query_param)\n return ReplaceCore(\n mol, core_query, labelByIndex=label_by_index, replaceDummies=False, requireDummyMatch=False\n )\n
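A usage sketch reusing the molecule and core from the note above; the return value is an RDKit molecule whose disconnected fragments are the side chains, with dummy atoms marking the attachment points.

```python
import datamol as dm
from safe.utils import compute_side_chains

mol = dm.to_mol("CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O")
core = dm.to_mol("CC1(C)CN2C(CC2=O)S1")

side_chains = compute_side_chains(mol, core)
print(dm.to_smiles(side_chains))  # dot-separated fragments with attachment dummies
```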
"},{"location":"api/safe.html#safe.utils.convert_to_safe","title":"convert_to_safe(mol, canonical=False, randomize=False, seed=1, slicer='brics', split_fragment=True, fraction_hs=None, resolution=0.5)
","text":"Convert a molecule to a safe representation
Parameters:
Name Type Description Defaultmol
Mol
molecule to convert
requiredcanonical
bool
whether to use canonical encoding
False
randomize
bool
whether to randomize the encoding
False
seed
Optional[int]
random seed
1
slicer
str
the slicer to use for fragmentation
'brics'
split_fragment
bool
whether to split fragments
True
fraction_hs
bool
proportion of random atoms to which we will add explicit hydrogens
None
resolution
Optional[float]
resolution for the partitioning algorithm
0.5
Source code in safe/utils.py
def convert_to_safe(\n mol: dm.Mol,\n canonical: bool = False,\n randomize: bool = False,\n seed: Optional[int] = 1,\n slicer: str = \"brics\",\n split_fragment: bool = True,\n fraction_hs: bool = None,\n resolution: Optional[float] = 0.5,\n):\n \"\"\"Convert a molecule to a safe representation\n\n Args:\n mol: molecule to convert\n canonical: whether to use canonical encoding\n randomize: whether to randomize the encoding\n seed: random seed\n slicer: the slicer to use for fragmentation\n split_fragment: whether to split fragments\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n resolution: resolution for the partitioning algorithm\n seed: random seed\n \"\"\"\n x = None\n try:\n x = sf.encode(mol, canonical=canonical, randomize=randomize, slicer=slicer, seed=seed)\n except sf.SAFEFragmentationError:\n if split_fragment:\n if \".\" in mol:\n return None\n try:\n x = sf.encode(\n mol,\n canonical=False,\n randomize=randomize,\n seed=seed,\n slicer=partial(\n fragment_aware_spliting,\n fraction_hs=fraction_hs,\n resolution=resolution,\n seed=seed,\n ),\n )\n except (sf.SAFEEncodeError, sf.SAFEFragmentationError):\n # logger.exception(e)\n return x\n # we need to resplit using attachment point but here we are only adding\n except sf.SAFEEncodeError:\n return x\n return x\n
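A minimal sketch of the helper: it returns the SAFE string on success, and `None` when encoding fails even after the fragment-aware fallback.

```python
from safe.utils import convert_to_safe

smiles = "CC1=C(C(=NO1)C2=CC=CC=C2Cl)C(=O)NC3C4N(C3=O)C(C(S4)(C)C)C(=O)O"
safe_string = convert_to_safe(smiles, slicer="brics")
print(safe_string)  # None if both the BRICS and fallback fragmentations fail
```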
"},{"location":"api/safe.html#safe.utils.filter_by_substructure_constraints","title":"filter_by_substructure_constraints(sequences, substruct, n_jobs=-1)
","text":"Check whether the input substructures are present in each of the molecule in the sequences
Parameters:
Name Type Description Defaultsequences
List[Union[str, Mol]]
list of molecules to validate
requiredsubstruct
Union[str, Mol]
substructure to use as query
requiredn_jobs
int
number of jobs to use for parallelization
-1
Source code in safe/utils.py
def filter_by_substructure_constraints(\n sequences: List[Union[str, dm.Mol]], substruct: Union[str, dm.Mol], n_jobs: int = -1\n):\n \"\"\"Check whether the input substructures are present in each of the molecule in the sequences\n\n Args:\n sequences: list of molecules to validate\n substruct: substructure to use as query\n n_jobs: number of jobs to use for parallelization\n\n \"\"\"\n\n if isinstance(substruct, str):\n substruct = standardize_attach(substruct)\n substruct = dm.from_smarts(substruct)\n\n def _check_match(mol):\n with suppress(Exception):\n mol = dm.to_mol(mol)\n return mol.HasSubstructMatch(substruct)\n return False\n\n matches = dm.parallelized(_check_match, sequences, n_jobs=n_jobs)\n return list(compress(sequences, matches))\n
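A small sketch: keep only the candidate molecules that contain a substituted-benzene query (the attachment point in the query string is normalized by `standardize_attach` first).

```python
from safe.utils import filter_by_substructure_constraints

candidates = ["Oc1ccccc1", "Nc1ccccc1", "CCO"]
kept = filter_by_substructure_constraints(candidates, "[*]c1ccccc1", n_jobs=1)
print(kept)  # the aromatic candidates; "CCO" has no match and is dropped
```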
"},{"location":"api/safe.html#safe.utils.find_partition_edges","title":"find_partition_edges(G, partition)
","text":"Find the edges connecting the subgraphs in a given partition of a graph.
Parameters:
Name Type Description DefaultG
Graph
The original graph.
requiredpartition
list of list of nodes
The partition of the graph where each element is a list of nodes representing a subgraph.
required Returns:
Name Type Description list
List[Tuple]
A list of edges connecting the subgraphs in the partition.
Source code in safe/utils.py
def find_partition_edges(G: nx.Graph, partition: List[List]) -> List[Tuple]:\n \"\"\"\n Find the edges connecting the subgraphs in a given partition of a graph.\n\n Args:\n G (networkx.Graph): The original graph.\n partition (list of list of nodes): The partition of the graph where each element is a list of nodes representing a subgraph.\n\n Returns:\n list: A list of edges connecting the subgraphs in the partition.\n \"\"\"\n partition_edges = []\n for subgraph1, subgraph2 in combinations(partition, 2):\n edges = nx.edge_boundary(G, subgraph1, subgraph2)\n partition_edges.extend(edges)\n return partition_edges\n
"},{"location":"api/safe.html#safe.utils.fragment_aware_spliting","title":"fragment_aware_spliting(mol, fraction_hs=None, **kwargs)
","text":"Custom splitting algorithm for dataset building.
This slicing strategy will cut any bond, including bonds to hydrogens. However, only one cut per atom is allowed.
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredfraction_hs
Optional[bool]
proportion of random atoms to which we will add explicit hydrogens
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
def fragment_aware_spliting(mol: dm.Mol, fraction_hs: Optional[bool] = None, **kwargs: Any):\n \"\"\"Custom splitting algorithm for dataset building.\n\n This slicing strategy will cut any bond including bonding with hydrogens\n However, only one cut per atom is allowed\n\n Args:\n mol: molecule to split\n fraction_hs: proportion of random atom to which we will add explicit hydrogens\n kwargs: additional arguments to pass to the partitioning algorithm\n \"\"\"\n random.seed(kwargs.get(\"seed\", 1))\n mol = dm.to_mol(mol, remove_hs=False)\n mol = _selective_add_hs(mol, fraction_hs=fraction_hs)\n graph = dm.graph.to_graph(mol)\n d = mol_partition(mol, **kwargs)\n q = deque(d)\n partition = q.pop()\n return find_partition_edges(graph, partition)\n
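For illustration, the slicer is typically passed to `sf.encode` through `functools.partial`, mirroring the fallback path in `convert_to_safe` above. Whether a given molecule encodes cleanly depends on the resulting fragmentation, so treat this as a sketch.

```python
from functools import partial

import safe as sf
from safe.utils import fragment_aware_spliting

slicer = partial(fragment_aware_spliting, fraction_hs=0.5, resolution=0.5, seed=1)
safe_string = sf.encode(
    "CC(=O)Nc1ccc(O)cc1", canonical=False, randomize=False, seed=1, slicer=slicer
)
print(safe_string)
```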
"},{"location":"api/safe.html#safe.utils.list_individual_attach_points","title":"list_individual_attach_points(mol, depth=None)
","text":"List all individual attachement points.
We do not allow multiple attachment points per substitution position.
Parameters:
Name Type Description Defaultmol
Mol
molecule for which we need to open the attachment points
required Source code in safe/utils.py
def list_individual_attach_points(mol: dm.Mol, depth: Optional[int] = None):\n \"\"\"List all individual attachement points.\n\n We do not allow multiple attachment points per substitution position.\n\n Args:\n mol: molecule for which we need to open the attachment points\n\n \"\"\"\n ATTACHING_RXN = ReactionFromSmarts(\"[*;h;!$([*][#0]):1]>>[*:1][*]\")\n mols = [mol]\n curated_prods = set()\n num_attachs = len(mol.GetSubstructMatches(dm.from_smarts(\"[*;h:1]\"), uniquify=True))\n depth = depth or 1\n depth = min(max(depth, 1), num_attachs)\n while depth > 0:\n prods = set()\n for mol in mols:\n mol = dm.to_mol(mol)\n for p in ATTACHING_RXN.RunReactants((mol,)):\n try:\n m = dm.sanitize_mol(p[0])\n sm = dm.to_smiles(m, canonical=True)\n sm = dm.reactions.add_brackets_to_attachment_points(sm)\n prods.add(dm.reactions.convert_attach_to_isotope(sm, as_smiles=True))\n except Exception as e:\n logger.error(e)\n curated_prods.update(prods)\n mols = prods\n depth -= 1\n return list(curated_prods)\n
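A sketch of the enumeration: each returned SMILES has one open attachment point encoded as an isotope-labeled dummy atom.

```python
import datamol as dm
from safe.utils import list_individual_attach_points

mol = dm.to_mol("Oc1ccccc1")  # phenol
variants = list_individual_attach_points(mol, depth=1)
print(len(variants))
print(variants[:3])
```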
"},{"location":"api/safe.html#safe.utils.mol_partition","title":"mol_partition(mol, query=None, seed=None, **kwargs)
","text":"Partition a molecule into fragments using a bond query
Parameters:
Name Type Description Defaultmol
Mol
molecule to split
requiredquery
Optional[Mol]
bond query to use for splitting
None
seed
Optional[int]
random seed
None
kwargs
Any
additional arguments to pass to the partitioning algorithm
{}
Source code in safe/utils.py
@py_random_state(\"seed\")\ndef mol_partition(\n mol: dm.Mol, query: Optional[dm.Mol] = None, seed: Optional[int] = None, **kwargs: Any\n):\n \"\"\"Partition a molecule into fragments using a bond query\n\n Args:\n mol: molecule to split\n query: bond query to use for splitting\n seed: random seed\n kwargs: additional arguments to pass to the partitioning algorithm\n\n \"\"\"\n resolution = kwargs.get(\"resolution\", 1.0)\n threshold = kwargs.get(\"threshold\", 1e-7)\n weight = kwargs.get(\"weight\", \"weight\")\n\n if query is None:\n query = __mmpa_query\n\n G = dm.graph.to_graph(mol)\n bond_partition = [\n tuple(sorted(match)) for match in mol.GetSubstructMatches(query, uniquify=True)\n ]\n\n def get_relevant_edges(e1, e2):\n return tuple(sorted([e1, e2])) not in bond_partition\n\n subgraphs = nx.subgraph_view(G, filter_edge=get_relevant_edges)\n\n partition = [{u} for u in G.nodes()]\n inner_partition = sorted(nx.connected_components(subgraphs), key=lambda x: min(x))\n mod = nx.algorithms.community.modularity(\n G, inner_partition, resolution=resolution, weight=weight\n )\n is_directed = G.is_directed()\n graph = G.__class__()\n graph.add_nodes_from(G)\n graph.add_weighted_edges_from(G.edges(data=weight, default=1))\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n m = graph.size(weight=\"weight\")\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, inner_partition, resolution, is_directed, seed\n )\n improvement = True\n while improvement:\n # gh-5901 protect the sets in the yielded list from further manipulation here\n yield [s.copy() for s in partition]\n new_mod = nx.algorithms.community.modularity(\n graph, inner_partition, resolution=resolution, weight=\"weight\"\n )\n if new_mod - mod <= threshold:\n return\n mod = new_mod\n graph = nx.algorithms.community.louvain._gen_graph(graph, inner_partition)\n partition, inner_partition, improvement = nx.algorithms.community.louvain._one_level(\n graph, m, partition, resolution, is_directed, seed\n )\n
"},{"location":"api/safe.html#safe.utils.standardize_attach","title":"standardize_attach(inputs, standard_attach='[*]')
","text":"Standardize the attachment points of a molecule
Parameters:
Name Type Description Defaultinputs
str
input molecule
requiredstandard_attach
str
standard attachment point to use
'[*]'
Source code in safe/utils.py
def standardize_attach(inputs: str, standard_attach: str = \"[*]\"):\n \"\"\"Standardize the attachment points of a molecule\n\n Args:\n inputs: input molecule\n standard_attach: standard attachment point to use\n \"\"\"\n\n for attach_regex in _SMILES_ATTACHMENT_POINTS:\n inputs = re.sub(attach_regex, standard_attach, inputs)\n return inputs\n
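A small sketch; exactly which attachment notations are rewritten depends on the library's `_SMILES_ATTACHMENT_POINTS` patterns, so the inputs and expected outputs below are assumptions.

```python
from safe.utils import standardize_attach

print(standardize_attach("[1*]CC(=O)O"))                         # expected: "[*]CC(=O)O"
print(standardize_attach("[1*]CC(=O)O", standard_attach="[*:1]"))  # custom attachment token
```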
"},{"location":"api/safe.models.html","title":"Model training","text":""},{"location":"api/safe.models.html#config-file","title":"Config File","text":"The input config file for training a SAFE
model is very similar to the GPT2 config file, with the addition of an optional num_labels
attribute for training with descriptors regularization.
{\n \"activation_function\": \"gelu_new\",\n \"attn_pdrop\": 0.1,\n \"bos_token_id\": 10000,\n \"embd_pdrop\": 0.1,\n \"eos_token_id\": 1,\n \"initializer_range\": 0.02,\n \"layer_norm_epsilon\": 1e-05,\n \"model_type\": \"gpt2\",\n \"n_embd\": 768,\n \"n_head\": 12,\n \"n_inner\": null,\n \"n_layer\": 12,\n \"n_positions\": 1024,\n \"reorder_and_upcast_attn\": false,\n \"resid_pdrop\": 0.1,\n \"scale_attn_by_inverse_layer_idx\": false,\n \"scale_attn_weights\": true,\n \"summary_activation\": \"tanh\",\n \"summary_first_dropout\": 0.1,\n \"summary_proj_to_labels\": true,\n \"summary_type\": \"cls_index\",\n \"summary_hidden_size\": 128,\n \"summary_use_proj\": true,\n \"transformers_version\": \"4.31.0\",\n \"use_cache\": true,\n \"vocab_size\": 10000,\n \"num_labels\": 9\n}\n
"},{"location":"api/safe.models.html#safe-model","title":"SAFE Model","text":""},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead","title":"PropertyHead
","text":" Bases: Module
Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Defaultconfig
[`PretrainedConfig`]
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual config class of your model for the default values it uses):
str
) -- The method to use to make this summary. Accepted values are:- \"last\"
-- Take the last token hidden state (like XLNet) - \"first\"
-- Take the first token hidden state (like Bert) - \"mean\"
-- Take the mean of all tokens hidden states - \"cls_index\"
-- Supply a Tensor of classification token position (GPT/GPT-2)
Optional[str]
) -- Set to \"tanh\"
to add a tanh activation to the output, another string, or None
to add no activation. Source code in safe/trainer/model.py
class PropertyHead(torch.nn.Module):\n r\"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n config ([`PretrainedConfig`]):\n The config used by the model. Relevant arguments in the config class of the model are (refer to the actual\n config class of your model for the default values it uses):\n\n - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:\n\n - `\"last\"` -- Take the last token hidden state (like XLNet)\n - `\"first\"` -- Take the first token hidden state (like Bert)\n - `\"mean\"` -- Take the mean of all tokens hidden states\n - `\"cls_index\"` -- Supply a Tensor of classification token position (GPT/GPT-2)\n\n - **summary_activation** (`Optional[str]`) -- Set to `\"tanh\"` to add a tanh activation to the output,\n another string, or `None` to add no activation.\n \"\"\"\n\n def __init__(self, config: PretrainedConfig):\n super().__init__()\n\n self.summary_type = getattr(config, \"summary_type\", \"cls_index\")\n self.summary = torch.nn.Identity()\n last_hidden_size = config.hidden_size\n\n if getattr(config, \"summary_hidden_size\", None) and config.summary_hidden_size > 0:\n self.summary = nn.Linear(config.hidden_size, config.summary_hidden_size)\n last_hidden_size = config.summary_hidden_size\n\n activation_string = getattr(config, \"summary_activation\", None)\n self.activation: Callable = (\n get_activation(activation_string) if activation_string else nn.Identity()\n )\n\n self.out = torch.nn.Identity()\n if getattr(config, \"num_labels\", None) and config.num_labels > 0:\n num_labels = config.num_labels\n self.out = nn.Linear(last_hidden_size, num_labels)\n\n def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n ) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
"},{"location":"api/safe.models.html#safe.trainer.model.PropertyHead.forward","title":"forward(hidden_states, cls_index=None)
","text":"Compute a single vector summary of a sequence hidden states.
Parameters:
Name Type Description Defaulthidden_states
FloatTensor
torch.FloatTensor
of shape [batch_size, seq_len, hidden_size]
) The hidden states of the last layer.
cls_index
Optional[LongTensor]
torch.LongTensor
of shape [batch_size]
or [batch_size, ...]
where ... are optional leading dimensions of hidden_states
, optional Used if summary_type == \"cls_index\"
and takes the last token of the sequence as classification token.
None
Returns:
Type Description FloatTensor
torch.FloatTensor
: The summary of the sequence hidden states.
Source code in safe/trainer/model.py
def forward(\n self,\n hidden_states: torch.FloatTensor,\n cls_index: Optional[torch.LongTensor] = None,\n) -> torch.FloatTensor:\n \"\"\"\n Compute a single vector summary of a sequence hidden states.\n\n Args:\n hidden_states: `torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`)\n The hidden states of the last layer.\n cls_index: `torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]`\n where ... are optional leading dimensions of `hidden_states`, *optional*\n Used if `summary_type == \"cls_index\"` and takes the last token of the sequence as classification token.\n\n Returns:\n `torch.FloatTensor`: The summary of the sequence hidden states.\n \"\"\"\n if self.summary_type == \"last\":\n output = hidden_states[:, -1]\n elif self.summary_type == \"first\":\n output = hidden_states[:, 0]\n elif self.summary_type == \"mean\":\n output = hidden_states.mean(dim=1)\n elif self.summary_type == \"cls_index\":\n # if cls_index is None:\n # cls_index = torch.full_like(\n # hidden_states[..., :1, :],\n # hidden_states.shape[-2] - 1,\n # dtype=torch.long,\n # )\n # else:\n # cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)\n # cls_index = cls_index.expand(\n # (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)\n # )\n\n # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states\n # output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)\n batch_size = hidden_states.shape[0]\n output = hidden_states.squeeze()[torch.arange(batch_size), cls_index]\n else:\n raise NotImplementedError\n\n output = self.summary(output)\n output = self.activation(output)\n return self.out(output)\n
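A shape-check sketch for the head. The config values below are assumptions chosen to match the example config file above (128-dimensional summary projection, 9 labels); the import path follows the documented module safe.trainer.model.

```python
import torch
from transformers import GPT2Config
from safe.trainer.model import PropertyHead

config = GPT2Config(summary_type="cls_index", summary_hidden_size=128,
                    summary_activation="tanh", num_labels=9)
head = PropertyHead(config)

hidden_states = torch.randn(2, 16, config.hidden_size)  # (batch, seq_len, hidden)
cls_index = torch.tensor([15, 7])                        # last non-padding token per sequence
print(head(hidden_states, cls_index).shape)              # torch.Size([2, 9])
```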
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel","title":"SAFEDoubleHeadsModel
","text":" Bases: GPT2DoubleHeadsModel
The SAFE model is a dual-head GPT2 model with a language modeling head and an optional multi-task regression head
Source code in safe/trainer/model.py
class SAFEDoubleHeadsModel(GPT2DoubleHeadsModel):\n \"\"\"The safe model is a dual head GPT2 model with a language modeling head and an optional multi-task regression head\"\"\"\n\n def __init__(self, config):\n self.num_labels = getattr(config, \"num_labels\", None)\n super().__init__(config)\n self.config.num_labels = self.num_labels\n del self.multiple_choice_head\n self.multiple_choice_head = PropertyHead(config)\n\n @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\n def forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. 
All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
"},{"location":"api/safe.models.html#safe.trainer.model.SAFEDoubleHeadsModel.forward","title":"forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, labels=None, mc_labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, inputs=None, encoder_hidden_states=None, **kwargs)
","text":"Parameters:
Name Type Description Defaultmc_token_ids
`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input
Index of the classification token in each input sequence. Selected in the range [0, input_ids.size(-1) - 1]
.
None
labels
`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*
Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids
. Indices are selected in [-100, 0, ..., config.vocab_size - 1]
. All labels set to -100
are ignored (masked), the loss is only computed for labels in [0, ..., config.vocab_size - 1]
None
mc_labels
`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*
Labels for computing the supervised loss for regularization.
None
inputs
Optional[Any]
List of inputs, put here because the trainer removes information not in signature
None
Returns: output (GPT2DoubleHeadsModelOutput): output of the model
Source code in safe/trainer/model.py
@add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)\n@replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)\ndef forward(\n self,\n input_ids: Optional[torch.LongTensor] = None,\n past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,\n attention_mask: Optional[torch.FloatTensor] = None,\n token_type_ids: Optional[torch.LongTensor] = None,\n position_ids: Optional[torch.LongTensor] = None,\n head_mask: Optional[torch.FloatTensor] = None,\n inputs_embeds: Optional[torch.FloatTensor] = None,\n mc_token_ids: Optional[torch.LongTensor] = None,\n labels: Optional[torch.LongTensor] = None,\n mc_labels: Optional[torch.LongTensor] = None,\n use_cache: Optional[bool] = None,\n output_attentions: Optional[bool] = None,\n output_hidden_states: Optional[bool] = None,\n return_dict: Optional[bool] = None,\n inputs: Optional[Any] = None, # do not remove because of trainer\n encoder_hidden_states: Optional[torch.Tensor] = None,\n **kwargs,\n) -> Union[Tuple, GPT2DoubleHeadsModelOutput]:\n r\"\"\"\n\n Args:\n mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):\n Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -\n 1]`.\n labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\n Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set\n `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to\n `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`\n mc_labels (`torch.LongTensor` of shape `(batch_size, n_tasks)`, *optional*):\n Labels for computing the supervized loss for regularization.\n inputs: List of inputs, put here because the trainer removes information not in signature\n Returns:\n output (GPT2DoubleHeadsModelOutput): output of the model\n \"\"\"\n return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n transformer_outputs = self.transformer(\n input_ids,\n past_key_values=past_key_values,\n attention_mask=attention_mask,\n token_type_ids=token_type_ids,\n position_ids=position_ids,\n head_mask=head_mask,\n inputs_embeds=inputs_embeds,\n use_cache=use_cache,\n output_attentions=output_attentions,\n output_hidden_states=output_hidden_states,\n return_dict=return_dict,\n encoder_hidden_states=encoder_hidden_states,\n )\n\n hidden_states = transformer_outputs[0]\n lm_logits = self.lm_head(hidden_states)\n\n if mc_token_ids is None and self.config.pad_token_id is not None and input_ids is not None:\n mc_token_ids = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(\n lm_logits.device\n )\n\n # Set device for model parallelism\n if self.model_parallel:\n torch.cuda.set_device(self.transformer.first_device)\n hidden_states = hidden_states.to(self.lm_head.weight.device)\n\n mc_loss = None\n mc_logits = None\n if mc_labels is not None and getattr(self.config, \"num_labels\", 0) > 0:\n mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)\n mc_labels = mc_labels.to(mc_logits.device)\n loss_fct = MSELoss()\n mc_loss = loss_fct(\n mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1, mc_logits.size(-1))\n )\n\n lm_loss = None\n if labels is not None:\n labels = labels.to(lm_logits.device)\n shift_logits = lm_logits[..., :-1, :].contiguous()\n shift_labels = labels[..., 
1:].contiguous()\n loss_fct = CrossEntropyLoss()\n lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n\n if not return_dict:\n output = (lm_logits, mc_logits) + transformer_outputs[1:]\n return (\n lm_loss,\n mc_loss,\n ) + output\n\n return GPT2DoubleHeadsModelOutput(\n loss=lm_loss,\n mc_loss=mc_loss,\n logits=lm_logits,\n mc_logits=mc_logits,\n past_key_values=transformer_outputs.past_key_values,\n hidden_states=transformer_outputs.hidden_states,\n attentions=transformer_outputs.attentions,\n )\n
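As a quick illustration of the dual heads, here is a minimal, hypothetical usage sketch (not taken from the library's own examples): it loads the public datamol-io/safe-gpt checkpoint, tokenizes a single string, and runs a forward pass with language-modeling labels. The property (mc) head only contributes when the config defines num_labels and mc_labels are passed.
from safe.tokenizer import SAFETokenizer
from safe.trainer.model import SAFEDoubleHeadsModel

model = SAFEDoubleHeadsModel.from_pretrained("datamol-io/safe-gpt")
tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt").get_pretrained()

batch = tokenizer(["c1ccccc1"], return_tensors="pt")
out = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["input_ids"],  # LM labels are shifted inside the model
)
print(out.loss, out.logits.shape)  # scalar LM loss and (batch, seq_len, vocab_size) logits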
"},{"location":"api/safe.models.html#trainer","title":"Trainer","text":""},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer","title":"SAFETrainer
","text":" Bases: Trainer
Custom trainer for training the SAFE model.
This custom trainer changes the loss function to support the property head.
Source code in safe/trainer/trainer_utils.py
class SAFETrainer(Trainer):\n \"\"\"\n Custom trainer for training SAFE model.\n\n This custom trainer changes the loss function to support the property head\n\n \"\"\"\n\n def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):\n super().__init__(*args, **kwargs)\n self.prop_loss_coeff = prop_loss_coeff\n\n def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
"},{"location":"api/safe.models.html#safe.trainer.trainer_utils.SAFETrainer.compute_loss","title":"compute_loss(model, inputs, return_outputs=False)
","text":"How the loss is computed by Trainer. By default, all models return the loss in the first element.
Source code in safe/trainer/trainer_utils.py
def compute_loss(self, model, inputs, return_outputs=False):\n \"\"\"\n How the loss is computed by Trainer. By default, all models return the loss in the first element.\n \"\"\"\n labels = (\n inputs.pop(\"labels\") if self.label_smoother is not None and \"labels\" in inputs else None\n )\n\n outputs = model(**inputs)\n # Save past state if it exists\n # TODO: this needs to be fixed and made cleaner later.\n if self.args.past_index >= 0:\n self._past = outputs[self.args.past_index]\n\n if labels is not None:\n if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():\n loss = self.label_smoother(outputs, labels, shift_labels=True)\n else:\n loss = self.label_smoother(outputs, labels)\n else:\n if isinstance(outputs, dict) and \"loss\" not in outputs:\n raise ValueError(\n \"The model did not return a loss from the inputs, only the following keys: \"\n f\"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}.\"\n )\n # We don't use .loss here since the model may return tuples instead of ModelOutput.\n loss = outputs[\"loss\"] if isinstance(outputs, dict) else outputs[0]\n mc_loss = outputs.get(\"mc_loss\", None) if isinstance(outputs, dict) else outputs[1]\n if mc_loss is not None:\n loss = loss + self.prop_loss_coeff * mc_loss\n return (loss, outputs) if return_outputs else loss\n
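In other words, when the model returns an mc_loss, the trainer simply adds it to the language-modeling loss with the prop_loss_coeff weight passed to the constructor. A tiny numeric sketch (the loss values are made up):
import torch

lm_loss = torch.tensor(2.31)   # CrossEntropy loss from the LM head
mc_loss = torch.tensor(0.85)   # MSE loss from the property head
prop_loss_coeff = 1e-3         # e.g. SAFETrainer(..., prop_loss_coeff=1e-3)

total_loss = lm_loss + prop_loss_coeff * mc_loss  # what compute_loss returns
print(total_loss)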
"},{"location":"api/safe.models.html#data-collator","title":"Data Collator","text":""},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator","title":"SAFECollator
","text":"Collate function for language modelling tasks
Note
The collate function is based on the default DataCollatorForLanguageModeling from HuggingFace; see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py
Source code in safe/trainer/collator.py
class SAFECollator:\n \"\"\"Collate function for language modelling tasks\n\n\n !!! note\n The collate function is based on the default DataCollatorForLanguageModeling in huggingface\n see: https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/data/data_collator.py\n \"\"\"\n\n def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n ):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n\n @functools.lru_cache()\n def get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n\n def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__call__","title":"__call__(samples)
","text":"Call collate function
Parameters:
Name Type Description Default samples
List[Union[List[int], Any, Dict[str, Any]]]
list of examples
required Source code in safe/trainer/collator.py
def __call__(self, samples: List[Union[List[int], Any, Dict[str, Any]]]):\n \"\"\"\n Call collate function\n\n Args:\n samples: list of examples\n \"\"\"\n # Handle dict or lists with proper padding and conversion to tensor.\n tokenizer = self.get_tokenizer()\n\n # examples = samples\n examples = copy.deepcopy(samples)\n inputs = [example.pop(self.input_key, None) for example in examples]\n mc_labels = (\n torch.tensor([example.pop(self.property_key, None) for example in examples]).float()\n if self.property_key in examples[0]\n else None\n )\n\n if \"input_ids\" not in examples[0] and inputs is not None:\n batch = tokenizer(\n inputs,\n return_tensors=\"pt\",\n padding=True,\n truncation=True,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n )\n else:\n batch = tokenizer.pad(\n examples,\n return_tensors=\"pt\",\n padding=True,\n pad_to_multiple_of=self.pad_to_multiple_of,\n max_length=self.max_length,\n )\n\n # If special token mask has been preprocessed, pop it from the dict.\n batch.pop(\"special_tokens_mask\", None)\n labels = batch.get(\"labels\", batch[\"input_ids\"].clone())\n if tokenizer.pad_token_id is not None:\n labels[labels == tokenizer.pad_token_id] = -100\n batch[\"labels\"] = labels\n\n if mc_labels is not None and self.include_descriptors:\n batch.update(\n {\n \"mc_labels\": mc_labels,\n # \"input_text\": inputs,\n }\n )\n return batch\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.__init__","title":"__init__(tokenizer, pad_to_multiple_of=None, input_key='inputs', label_key='labels', property_key='descriptors', include_descriptors=False, max_length=None)
","text":"Default collator for huggingface transformers in izanagi.
Parameters:
Name Type Description Default tokenizer
Tokenizer
Huggingface tokenizer
required input_key
str
key to use for input ids
'inputs'
label_key
str
key to use for labels
'labels'
property_key
str
key to use for properties
'descriptors'
include_descriptors
bool
whether to include training on descriptors or not
False
pad_to_multiple_of
Optional[int]
pad to multiple of this value
None
Source code in safe/trainer/collator.py
def __init__(\n self,\n tokenizer: Tokenizer,\n pad_to_multiple_of: Optional[int] = None,\n input_key: str = \"inputs\",\n label_key: str = \"labels\",\n property_key: str = \"descriptors\",\n include_descriptors: bool = False,\n max_length: Optional[int] = None,\n):\n \"\"\"\n Default collator for huggingface transformers in izanagi.\n\n Args:\n tokenizer: Huggingface tokenizer\n input_key: key to use for input ids\n label_key: key to use for labels\n property_key: key to use for properties\n include_descriptors: whether to include training on descriptors or not\n pad_to_multiple_of: pad to multiple of this value\n \"\"\"\n\n self.tokenizer = tokenizer\n self.pad_to_multiple_of = pad_to_multiple_of\n self.input_key = input_key\n self.label_key = label_key\n self.property_key = property_key\n self.include_descriptors = include_descriptors\n self.max_length = max_length\n
"},{"location":"api/safe.models.html#safe.trainer.collator.SAFECollator.get_tokenizer","title":"get_tokenizer()
cached
","text":"Get underlying tokenizer
Source code in safe/trainer/collator.py
@functools.lru_cache()\ndef get_tokenizer(self):\n \"\"\"Get underlying tokenizer\"\"\"\n if isinstance(self.tokenizer, SAFETokenizer):\n return self.tokenizer.get_pretrained()\n return self.tokenizer\n
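A minimal usage sketch (the samples below are just illustrative SAFE strings): the collator tokenizes the inputs key, pads the batch, and copies input_ids into labels with padding tokens masked to -100.
from safe.tokenizer import SAFETokenizer
from safe.trainer.collator import SAFECollator

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
collator = SAFECollator(tokenizer, max_length=128)

samples = [
    {"inputs": "c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F"},
    {"inputs": "c1ccccc1"},
]
batch = collator(samples)
print(batch["input_ids"].shape, batch["labels"].shape)  # both share the padded shape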
"},{"location":"api/safe.models.html#data-utils","title":"Data Utils","text":""},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset","title":"get_dataset(data_path, name=None, tokenizer=None, cache_dir=None, streaming=True, use_auth_token=False, tokenize_column='inputs', property_column='descriptors', max_length=None, num_shards=1024)
","text":"Get the datasets from the config file
Source code in safe/trainer/data_utils.py
def get_dataset(\n data_path,\n name: Optional[str] = None,\n tokenizer: Optional[Callable] = None,\n cache_dir: Optional[str] = None,\n streaming: bool = True,\n use_auth_token: bool = False,\n tokenize_column: Optional[str] = \"inputs\",\n property_column: Optional[str] = \"descriptors\",\n max_length: Optional[int] = None,\n num_shards=1024,\n):\n \"\"\"Get the datasets from the config file\"\"\"\n raw_datasets = {}\n if data_path is not None:\n data_path = upath.UPath(str(data_path))\n\n if data_path.exists():\n # then we need to load from disk\n data_path = str(data_path)\n # for some reason, the datasets package is not able to load the dataset\n # because the split where not originally proposed\n raw_datasets = datasets.load_from_disk(data_path)\n\n if streaming:\n if isinstance(raw_datasets, datasets.DatasetDict):\n previous_num_examples = {k: len(dt) for k, dt in raw_datasets.items()}\n raw_datasets = datasets.IterableDatasetDict(\n {\n k: dt.to_iterable_dataset(num_shards=num_shards)\n for k, dt in raw_datasets.items()\n }\n )\n for k, dt in raw_datasets.items():\n if previous_num_examples[k] is not None:\n setattr(dt, \"num_examples\", previous_num_examples[k])\n else:\n num_examples = len(raw_datasets)\n raw_datasets = raw_datasets.to_iterable_dataset(num_shards=num_shards)\n setattr(raw_datasets, \"num_examples\", num_examples)\n\n else:\n data_path = str(data_path)\n raw_datasets = datasets.load_dataset(\n data_path,\n name=name,\n cache_dir=cache_dir,\n use_auth_token=True if use_auth_token else None,\n streaming=streaming,\n )\n # that means we need to return a tokenized version of the dataset\n\n if property_column not in [\"mc_labels\", None]:\n raw_datasets = raw_datasets.rename_column(property_column, \"mc_labels\")\n\n columns_to_remove = None\n if tokenize_column is not None:\n columns_to_remove = [\n x\n for x in (get_dataset_column_names(raw_datasets) or [])\n if x not in [tokenize_column, \"mc_labels\"] and \"label\" not in x\n ] or None\n\n if tokenizer is None:\n if columns_to_remove is not None:\n raw_datasets = raw_datasets.remove_columns(columns_to_remove)\n return raw_datasets\n\n return raw_datasets.map(\n partial(\n tokenize_fn,\n tokenizer=tokenizer,\n tokenize_column=tokenize_column,\n max_length=max_length,\n ),\n batched=True,\n remove_columns=columns_to_remove,\n )\n
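A hypothetical usage sketch (the dataset path is a placeholder, and the dataset is assumed to expose an inputs column of SAFE strings):
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import get_dataset

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
dataset = get_dataset(
    "path/to/my_dataset",   # placeholder: a folder saved with datasets.save_to_disk or a hub dataset id
    tokenizer=tokenizer,
    tokenize_column="inputs",
    property_column=None,   # set to "descriptors" if the dataset carries property targets
    streaming=False,
    max_length=128,
)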
"},{"location":"api/safe.models.html#safe.trainer.data_utils.get_dataset_column_names","title":"get_dataset_column_names(dataset)
","text":"Get the column names in a dataset
Parameters:
Name Type Description Default dataset
Union[Dataset, IterableDataset, Mapping]
dataset to get the column names from
required Source code in safe/trainer/data_utils.py
def get_dataset_column_names(dataset: Union[datasets.Dataset, datasets.IterableDataset, Mapping]):\n \"\"\"Get the column names in a dataset\n\n Args:\n dataset: dataset to get the column names from\n\n \"\"\"\n if isinstance(dataset, (datasets.IterableDatasetDict, Mapping)):\n column_names = {split: dataset[split].column_names for split in dataset}\n else:\n column_names = dataset.column_names\n if isinstance(column_names, dict):\n column_names = list(column_names.values())[0]\n return column_names\n
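A tiny sketch with an in-memory dataset (hypothetical data):
import datasets
from safe.trainer.data_utils import get_dataset_column_names

ds = datasets.Dataset.from_dict({"inputs": ["c1ccccc1", "CCO"], "descriptors": [[1.0], [2.0]]})
print(get_dataset_column_names(ds))  # ['inputs', 'descriptors']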
"},{"location":"api/safe.models.html#safe.trainer.data_utils.take","title":"take(n, iterable)
","text":"Return first n items of the iterable as a list
Source code in safe/trainer/data_utils.py
def take(n, iterable):\n \"Return first n items of the iterable as a list\"\n return list(itertools.islice(iterable, n))\n
"},{"location":"api/safe.models.html#safe.trainer.data_utils.tokenize_fn","title":"tokenize_fn(row, tokenizer, tokenize_column='inputs', max_length=None, padding=False)
","text":"Perform the tokenization of a row Args: row: row to tokenize tokenizer: tokenizer to use tokenize_column: column to tokenize max_length: maximum size of the tokenized sequence padding: whether to pad the sequence
Source code in safe/trainer/data_utils.py
def tokenize_fn(\n row: Dict[str, Any],\n tokenizer: Callable,\n tokenize_column: str = \"inputs\",\n max_length: Optional[int] = None,\n padding: bool = False,\n):\n \"\"\"Perform the tokenization of a row\n Args:\n row: row to tokenize\n tokenizer: tokenizer to use\n tokenize_column: column to tokenize\n max_length: maximum size of the tokenized sequence\n padding: whether to pad the sequence\n \"\"\"\n # there's probably a way to do this with the tokenizer settings\n # but again, gotta move fast\n\n fast_tokenizer = (\n tokenizer.get_pretrained() if isinstance(tokenizer, SAFETokenizer) else tokenizer\n )\n\n return fast_tokenizer(\n row[tokenize_column],\n truncation=(max_length is not None),\n max_length=max_length,\n padding=padding,\n return_tensors=None,\n )\n
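A sketch of how tokenize_fn can be applied with datasets.map, mirroring what get_dataset does internally (the data here is made up):
from functools import partial

import datasets
from safe.tokenizer import SAFETokenizer
from safe.trainer.data_utils import tokenize_fn

tokenizer = SAFETokenizer.from_pretrained("datamol-io/safe-gpt")
ds = datasets.Dataset.from_dict({"inputs": ["c1ccccc1", "CCO"]})
ds = ds.map(
    partial(tokenize_fn, tokenizer=tokenizer, tokenize_column="inputs", max_length=64),
    batched=True,
)
print(ds.column_names)  # "inputs" plus the tokenizer outputs (e.g. input_ids, attention_mask)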
"},{"location":"api/safe.viz.html","title":"Visualization","text":""},{"location":"api/safe.viz.html#safe.viz.to_image","title":"to_image(safe_str, fragments=None, legend=None, mol_size=(300, 300), use_svg=True, highlight_mode='lasso', highlight_bond_width_multiplier=12, **kwargs)
","text":"Display a safe string by highlighting the fragments that make it.
Parameters:
Name Type Description Default safe_str
str
the safe string to display
required fragments
Optional[Union[str, Mol]]
list of fragments to highlight on the molecule. If None, the SAFE decomposition of the molecule will be used.
None
legend
Union[str, None]
A string to use as the legend under the molecule.
None
mol_size
Union[Tuple[int, int], int]
The size of the image to be returned
(300, 300)
use_svg
Optional[bool]
Whether to return an svg or png image
True
highlight_mode
Optional[str]
the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown
'lasso'
highlight_bond_width_multiplier
int
the multiplier to use for the bond width when using the 'fill' mode
12
**kwargs
Any
Additional arguments to pass to the drawing function. See RDKit documentation related to MolDrawOptions
for more details at https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.
{}
Source code in safe/viz.py
def to_image(\n safe_str: str,\n fragments: Optional[Union[str, dm.Mol]] = None,\n legend: Union[str, None] = None,\n mol_size: Union[Tuple[int, int], int] = (300, 300),\n use_svg: Optional[bool] = True,\n highlight_mode: Optional[str] = \"lasso\",\n highlight_bond_width_multiplier: int = 12,\n **kwargs: Any,\n):\n \"\"\"Display a safe string by highlighting the fragments that make it.\n\n Args:\n safe_str: the safe string to display\n fragments: list of fragment to highlight on the molecules. If None, will use safe decomposition of the molecule.\n legend: A string to use as the legend under the molecule.\n mol_size: The size of the image to be returned\n use_svg: Whether to return an svg or png image\n highlight_mode: the highlight mode to use. One of [\"lasso\", \"fill\", \"color\"]. If None, no highlight will be shown\n highlight_bond_width_multiplier: the multiplier to use for the bond width when using the 'fill' mode\n **kwargs: Additional arguments to pass to the drawing function. See RDKit\n documentation related to `MolDrawOptions` for more details at\n https://www.rdkit.org/docs/source/rdkit.Chem.Draw.rdMolDraw2D.html.\n\n \"\"\"\n\n kwargs[\"legends\"] = legend\n kwargs[\"mol_size\"] = mol_size\n kwargs[\"use_svg\"] = use_svg\n if highlight_bond_width_multiplier is not None:\n kwargs[\"highlightBondWidthMultiplier\"] = highlight_bond_width_multiplier\n\n if highlight_mode == \"color\":\n kwargs[\"continuousHighlight\"] = False\n kwargs[\"circleAtoms\"] = kwargs.get(\"circleAtoms\", False) or False\n\n if isinstance(fragments, (str, dm.Mol)):\n fragments = [fragments]\n\n if fragments is None and highlight_mode is not None:\n fragments = [\n sf.decode(x, as_mol=False, remove_dummies=False, ignore_errors=False)\n for x in safe_str.split(\".\")\n ]\n elif fragments and len(fragments) > 0:\n parsed_fragments = []\n for fg in fragments:\n if isinstance(fg, str) and dm.to_mol(fg) is None:\n fg = sf.decode(fg, as_mol=False, remove_dummies=False, ignore_errors=False)\n parsed_fragments.append(fg)\n fragments = parsed_fragments\n else:\n fragments = []\n mol = dm.to_mol(safe_str, remove_hs=False)\n cm = plt.get_cmap(\"gist_rainbow\")\n current_colors = [cm(1.0 * i / len(fragments)) for i in range(len(fragments))]\n\n if highlight_mode == \"lasso\":\n return dm.viz.lasso_highlight_image(mol, fragments, **kwargs)\n\n atom_indices = []\n bond_indices = []\n atom_colors = {}\n bond_colors = {}\n\n for i, frag in enumerate(fragments):\n frag = dm.from_smarts(frag)\n atom_matches, bond_matches = dm.substructure_matching_bonds(mol, frag)\n atom_matches = list(itertools.chain(*atom_matches))\n bond_matches = list(itertools.chain(*bond_matches))\n atom_indices.extend(atom_matches)\n bond_indices.extend(bond_matches)\n atom_colors.update({x: current_colors[i] for x in atom_matches})\n bond_colors.update({x: current_colors[i] for x in bond_matches})\n\n return dm.viz.to_image(\n mol,\n highlight_atom=[atom_indices],\n highlight_bond=[bond_indices],\n highlightAtomColors=[atom_colors],\n highlightBondColors=[bond_colors],\n **kwargs,\n )\n
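A short usage sketch (the SMILES is the celecoxib example used elsewhere in these docs): encode a molecule to SAFE and render it with one of the highlight modes.
import datamol as dm
import safe as sf

celecoxib = dm.to_mol("Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1")
safe_str = sf.encode(celecoxib)
sf.to_image(safe_str, highlight_mode="fill", legend="SAFE fragments", mol_size=(400, 300))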
"},{"location":"tutorials/design-with-safe.html","title":"Molecular design","text":"In\u00a0[2]: Copied! import os\n\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n\n\nimport safe as sf\nimport datamol as dm\nimport os os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\" import safe as sf import datamol as dm
Load the default pretrained Safe model.
We will use this single model for all the downstream molecular design tasks.
In\u00a0[3]: Copied!designer = sf.SAFEDesign.load_default(verbose=True)\n\ndesigner.model\ndesigner = sf.SAFEDesign.load_default(verbose=True) designer.model Out[3]:
SAFEDoubleHeadsModel(\n (transformer): GPT2Model(\n (wte): Embedding(1880, 768)\n (wpe): Embedding(1024, 768)\n (drop): Dropout(p=0.1, inplace=False)\n (h): ModuleList(\n (0-11): 12 x GPT2Block(\n (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (attn): GPT2Attention(\n (c_attn): Conv1D()\n (c_proj): Conv1D()\n (attn_dropout): Dropout(p=0.1, inplace=False)\n (resid_dropout): Dropout(p=0.1, inplace=False)\n )\n (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n (mlp): GPT2MLP(\n (c_fc): Conv1D()\n (c_proj): Conv1D()\n (act): NewGELUActivation()\n (dropout): Dropout(p=0.1, inplace=False)\n )\n )\n )\n (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n )\n (lm_head): Linear(in_features=768, out_features=1880, bias=False)\n (multiple_choice_head): PropertyHead(\n (summary): Linear(in_features=768, out_features=64, bias=True)\n (activation): ReLU()\n (out): Linear(in_features=64, out_features=1, bias=True)\n )\n)
Let's start with the molecule below.
In\u00a0[4]: Copied!candidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\"\ncandidate_mol = dm.to_mol(candidate_smiles)\n\ndm.to_image(candidate_mol)\ncandidate_smiles = \"O=C(C#CCN1CCCCC1)Nc1ccc2ncnc(Nc3cccc(Br)c3)c2c1\" candidate_mol = dm.to_mol(candidate_smiles) dm.to_image(candidate_mol) Out[4]: In\u00a0[6]: Copied!
generated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12)\n\ngenerated_smiles[:5]\ngenerated_smiles = designer.de_novo_generation(sanitize=True, n_samples_per_trial=12) generated_smiles[:5]
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:25.393 | INFO | safe.sample:de_novo_generation:581 - After sanitization, 82 / 100 (82.00 %) generated molecules are valid !\nOut[6]:
['CCCCOc1c(Br)cc(C)cc1-c1nc(C2(CC)CCN(C(C)C)CC2)cn2nc(C)nc12',\n 'CC(C)(C)OC(=O)Nc1ccc(C[NH+]2CC[C@@H]3OCCC[C@H]3C2)cn1',\n 'Cc1ccc(Br)c(NCCC(C)C(C)C)c1',\n 'CCOC(=O)C1=C(C)N=c2s/c(=C/c3c(C)[nH]c4ccccc34)c(=O)n2[C@@H]1c1ccc(OC)cc1',\n 'CCc1ccccc1-n1cc(O)c(C(=O)Nc2ccc(Cl)c(F)c2)n1']In\u00a0[7]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[7]: In\u00a0[8]: Copied!
scaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\"\n\ndm.to_image(scaffold)\nscaffold = \"[*]N-c1ccc2ncnc(-N[*])c2c1\" dm.to_image(scaffold) Out[8]: In\u00a0[9]: Copied!
generated_smiles = designer.scaffold_decoration(\n scaffold=scaffold,\n n_samples_per_trial=12,\n n_trials=2,\n sanitize=True,\n do_not_fragment_further=True,\n)\n\ngenerated_mols = [dm.to_mol(x) for x in generated_smiles]\ngenerated_smiles = designer.scaffold_decoration( scaffold=scaffold, n_samples_per_trial=12, n_trials=2, sanitize=True, do_not_fragment_further=True, ) generated_mols = [dm.to_mol(x) for x in generated_smiles]
0%| | 0/2 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:37:48.620 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 21 / 24 (87.50 %) generated molecules are valid !\nIn\u00a0[10]: Copied!
dm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1)\ndm.viz.lasso_highlight_image(generated_mols[:12], dm.from_smarts(scaffold), mol_size=(350, 200), color_list=[\"#ff80b5\"], scale_padding=0.1) Out[10]: In\u00a0[11]: Copied!
superstructure = \"c1ccc2ncncc2c1\"\n\ndm.to_image(superstructure)\nsuperstructure = \"c1ccc2ncncc2c1\" dm.to_image(superstructure) Out[11]: In\u00a0[12]: Copied!
generated_smiles = designer.super_structure(\n core=superstructure,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n attachment_point_depth=3,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.super_structure( core=superstructure, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, attachment_point_depth=3, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
/home/hadim/local/micromamba/envs/safe/lib/python3.11/site-packages/transformers/generation/configuration_utils.py:399: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n warnings.warn(\n2023-10-28 11:38:24.884 | INFO | safe.sample:super_structure:496 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[12]:
['c1ncc2c(N3CCOCC3)ccc(N3CCNCC3)c2n1',\n 'N[C@H](CNc1ccc(C(F)(F)F)c2ncncc12)C(F)(F)F',\n 'C=CCCCNC(=S)Nc1ccc(C(F)(F)F)c2cncnc12',\n 'O=C(N[C@@H](CO)CCF)c1ccc(C(=O)[O-])c2ncncc12',\n 'O=C(CC=Nc1ccc(OC(F)(F)F)c2ncncc12)C(F)(F)F',\n 'NC(=Nc1ccc([N+](=O)[O-])c2cncnc12)C(F)(F)F',\n 'O=C(CCC(F)=C(F)F)Nc1ccc(C(F)(F)F)c2ncncc12',\n 'O=S(=O)(CCC(F)(F)F)Nc1cccc2cncnc12',\n 'O=S(=O)(Cl)c1ccc(C(F)(F)F)c2ncncc12',\n 'c1ncc2c(N3CCCCCC3)ccc(-c3cn[nH]c3)c2n1',\n 'NC(=O)CSCC(=O)Nc1ccc(C(=O)[O-])c2ncncc12',\n 'c1ncc2c(-n3cncn3)ccc(C3CCCCN3)c2n1']In\u00a0[14]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[14]: In\u00a0[15]: Copied!
motif = \"[*]-N1CCCCC1\"\n\ndm.to_image(motif)\nmotif = \"[*]-N1CCCCC1\" dm.to_image(motif) Out[15]: In\u00a0[26]: Copied!
# let's make some long sequence\ngenerated_smiles = designer.motif_extension(\n motif=motif,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n min_length=25,\n max_length=80,\n)\n\ngenerated_smiles\n# let's make some long sequence generated_smiles = designer.motif_extension( motif=motif, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, min_length=25, max_length=80, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:41:52.959 | INFO | safe.sample:scaffold_decoration:542 - After sanitization, 10 / 12 (83.33 %) generated molecules are valid !\nOut[26]:
['C1CCN([C@@H]2CCCC[C@@H]2[NH+]2CCOCC2)CC1',\n 'FC(F)(F)C(F)(F)CN1CCCCC1',\n 'O=NN(/C(=C/N1CCCCC1)N1CCCCC1)c1ccccc1',\n 'C1CCC(CC2(CC3CCCC3)CCCCC2C2CCCCCC2N2CCCCC2)CC1',\n '[Na+].[Na+].[O-]S(=S)(=S)N1CCCCC1',\n 'NC(CS)C(O)=NC(O)C(=O)N1CCCCC1',\n 'O=P(O)(O)CCOCCOP(=O)(O)SCCN1CCCCC1',\n 'C1CCN(N=c2nn[nH][nH]2)CC1.O.O',\n 'N.N#CC1C=CCN1N1CCCCC1',\n 'O=C1CCCCC1.O=C1COCCCN1N1CCCCC1']In\u00a0[27]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[27]: In\u00a0[28]: Copied!
side_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\"\n\ndm.to_image(side_chains)\nside_chains = \"[1*]C(=O)C#CCN1CCCCC1.[2*]c1cccc(Br)c1\" dm.to_image(side_chains) Out[28]: In\u00a0[29]: Copied!
generated_smiles = designer.scaffold_morphing(\n side_chains=side_chains,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ndm.to_image(generated_smiles[:12], mol_size=(350, 200))\ngenerated_smiles = designer.scaffold_morphing( side_chains=side_chains, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) dm.to_image(generated_smiles[:12], mol_size=(350, 200))
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:05.888 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[29]: In\u00a0[30]: Copied!
linker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"]\n\ndm.to_image(linker_generation)\nlinker_generation = [\"[*]-N1CCCCC1\", \"Brc1cccc(Nc2ncnc3ccc(-[*])cc23)c1\"] dm.to_image(linker_generation) Out[30]: In\u00a0[31]: Copied!
generated_smiles = designer.linker_generation(\n *linker_generation,\n n_samples_per_trial=12,\n n_trials=1,\n sanitize=True,\n do_not_fragment_further=False,\n random_seed=100,\n)\n\ngenerated_smiles\ngenerated_smiles = designer.linker_generation( *linker_generation, n_samples_per_trial=12, n_trials=1, sanitize=True, do_not_fragment_further=False, random_seed=100, ) generated_smiles
0%| | 0/1 [00:00<?, ?it/s]
2023-10-28 11:42:14.034 | INFO | safe.sample:_fragment_linking:397 - After sanitization, 12 / 12 (100.00 %) generated molecules are valid !\nOut[31]:
['O=C(Oc1cccc(-c2nc(N3CCCCC3)nc3c2CCN3)c1)c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'O=C(Oc1cccc(-c2nc(-c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)nc3c2CCN3)c1)N1CCCCC1',\n 'N=C(N)NCCCN1C(=O)N(CN2CCCCC2)C(=O)C2CC(c3ccc4ncnc(Nc5cccc(Br)c5)c4c3)CC21',\n 'N=C(N)NCCCN1C(=O)N(Cc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)C(=O)C2CC(N3CCCCC3)CC21',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc5c4oc4c6ccccc6c(Nc6cccc(N7CCCCC7)c6)cc54)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cccc(Nc5cc6c7cccc(N8CCCCC8)c7oc6c6ccccc56)c4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4cc(-c5nc6n(n5)CC=C[C@H]6N5CCCCC5)ncn4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc([C@@H]4C=CCn5nc(-c6cc(N7CCCCC7)ncn6)nc54)cc23)c1',\n 'O=C1C[C@@H]2C[C@H]3[C@H](N4CCCCC4)CC4COCCC42O[C@@H]3CC(CCc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)O1',\n 'O=C1C[C@@H]2C[C@@H]3[C@@H](CC(CCN4CCCCC4)O1)OC21CCOCC1C[C@H]3c1ccc2ncnc(Nc3cccc(Br)c3)c2c1',\n 'Brc1cccc(Nc2ncnc3ccc(NNc4ccc(SCCCCCCc5ccc(N6CCCCC6)cc5)cc4)cc23)c1',\n 'Brc1cccc(Nc2ncnc3ccc(-c4ccc(CCCCCCSc5ccc(NNN6CCCCC6)cc5)cc4)cc23)c1']In\u00a0[32]: Copied!
dm.to_image(generated_smiles[:12], mol_size=(350, 200))\ndm.to_image(generated_smiles[:12], mol_size=(350, 200)) Out[32]:
The End !
"},{"location":"tutorials/design-with-safe.html#de-novo-generation","title":"De novo generation\u00b6","text":"Generation of novel molecules without any constraints.
"},{"location":"tutorials/design-with-safe.html#scaffold-decoration","title":"Scaffold Decoration\u00b6","text":"For scaffold decoration, we wish to generate new molecules that would contain a given scaffold as core. Usually, the attachment point on the scaffold should dictate where the new vectors will be added.
"},{"location":"tutorials/design-with-safe.html#super-structure-generation","title":"Super structure generation\u00b6","text":"In super structure generation, we just want to generate superstructure of a molecular subgraph
"},{"location":"tutorials/design-with-safe.html#motif-extension","title":"Motif Extension\u00b6","text":"In motif extension, we are interested in generating a molecule containing a given motif as starting point.
"},{"location":"tutorials/design-with-safe.html#scaffold-morphing","title":"Scaffold Morphing\u00b6","text":"In scaffold morphing, we wish to replace a scaffold by another one in a molecule. The process requires as input that the user provides either the side chains or the input molecules and the core
"},{"location":"tutorials/design-with-safe.html#linker-generation","title":"Linker generation\u00b6","text":"Linker generation is mostly the same thing as scaffold morphing ...
"},{"location":"tutorials/extracting-representation-molfeat.html","title":"so really we just need our custom converter","text":"In\u00a0[1]: Copied!%load_ext autoreload\n%autoreload 2\n%load_ext autoreload %autoreload 2 In\u00a0[2]: Copied!
import safe\nimport torch\nimport datamol as dm\nimport types\nfrom molfeat.trans.pretrained import PretrainedMolTransformer\nfrom molfeat.trans.pretrained import PretrainedHFTransformer\n\nfrom molfeat.trans.pretrained.hf_transformers import HFModel\nfrom safe.trainer.model import SAFEDoubleHeadsModel\nfrom safe.tokenizer import SAFETokenizer\nimport safe import torch import datamol as dm import types from molfeat.trans.pretrained import PretrainedMolTransformer from molfeat.trans.pretrained import PretrainedHFTransformer from molfeat.trans.pretrained.hf_transformers import HFModel from safe.trainer.model import SAFEDoubleHeadsModel from safe.tokenizer import SAFETokenizer In\u00a0[3]: Copied!
safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\nsafe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")
We now need to build the molfeat
's HFModel
instance by wrapping our model.
safe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\nsafe_hf_model = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())
You can put the above process in the __init__
of the SAFEMolTransformer
if you wish as we will be doing below.
class SAFEMolTransformer(PretrainedHFTransformer):\n \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is \n how we convert the input molecules into the safe format\"\"\"\n def __init__(self, kind=None, notation=\"safe\", **kwargs):\n if kind is None:\n # we load the default SAFE model if the exact SAFE GPT model \n # to use is not provided\n safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\")\n safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\")\n kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained())\n super().__init__(kind, notation=None, **kwargs)\n # now we change the internal converter\n # overriding the internal converter of SmilesConverter leverages the exception handling\n # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS\n # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds.\n self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe)\n # you could also do any of the following:\n # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode)\n # self.converter = safe # the safe module\nclass SAFEMolTransformer(PretrainedHFTransformer): \"\"\"Build the SAFE Molecule transformers, the only thing we need to define is how we convert the input molecules into the safe format\"\"\" def __init__(self, kind=None, notation=\"safe\", **kwargs): if kind is None: # we load the default SAFE model if the exact SAFE GPT model # to use is not provided safe_model = SAFEDoubleHeadsModel.from_pretrained(\"datamol-io/safe-gpt\") safe_tokenizer = SAFETokenizer.from_pretrained(\"datamol-io/safe-gpt\") kind = HFModel.from_pretrained(safe_model, safe_tokenizer.get_pretrained()) super().__init__(kind, notation=None, **kwargs) # now we change the internal converter # overriding the internal converter of SmilesConverter leverages the exception handling # The SAFE-GPT model was trained on a slightly different splitting algorithm compared to the default BRICS # this does not change anything in theory, it just try harder to break bonds even if there are no BRICS bonds. self.converter.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.utils.convert_to_safe) # you could also do any of the following: # self.converter = types.SimpleNamespace(decode=safe.decode, encode=safe.encode) # self.converter = safe # the safe module
2023-12-20 22:57:39.310 | WARNING | molfeat.trans.base:__init__:51 - The 'SAFEMolTransformer' interaction has been superseded by a new class with id 0x2ad77d6a0\n
Let's use the GPT pooler, which uses the last non-padding token (often eos) since the model is GPT2-like. For other options, see: https://molfeat-docs.datamol.io/stable/api/molfeat.utils.html#pooling
# Let's use the GPT pooling method and only take the last hidden layer\nsafe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1])\nsafe_transformers\n# Let's use the GPT pooling method and only take the last hidden layer safe_transformers = SAFEMolTransformer(pooling=\"gpt\", concat_layers=[-1]) safe_transformers Out[116]:
SAFEMolTransformer(dtype=np.float32)
SAFEMolTransformer(dtype=np.float32)In\u00a0[117]: Copied!
mols = dm.data.freesolv().iloc[:10].smiles.values\nmols = dm.data.freesolv().iloc[:10].smiles.values In\u00a0[118]: Copied!
safe_transformers(mols)\nsafe_transformers(mols) Out[118]:
array([[ 0.05216356, 0.10754181, 0.07509107, ..., 0.04756968,\n -0.08228929, -0.11568106],\n [ 0.02449008, 0.04048932, 0.14489463, ..., 0.11410899,\n -0.02203353, 0.08706839],\n [-0.07425696, 0.11859665, 0.19010407, ..., 0.10526019,\n 0.08878426, -0.06609854],\n ...,\n [ 0.07867863, 0.19300285, 0.23054805, ..., -0.00737952,\n 0.07542405, 0.00289541],\n [ 0.12092628, -0.01785688, 0.19791883, ..., 0.13796932,\n 0.11520796, -0.15333697],\n [-0.02005584, 0.13946685, 0.18568742, ..., 0.07080407,\n 0.06991849, -0.07151204]], dtype=float32)In\u00a0[119]: Copied!
from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.pipeline import Pipeline\n\ndf = dm.data.freesolv()\ndf[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode)\ndf = df.dropna(subset=\"safe\")\n# we have to remove the molecules that cannot be converted \n# (no breakable bonds with our default methodology)\nfrom sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline df = dm.data.freesolv() df[\"safe\"] = df[\"smiles\"].apply(safe_transformers.converter.encode) df = df.dropna(subset=\"safe\") # we have to remove the molecules that cannot be converted # (no breakable bonds with our default methodology) In\u00a0[120]: Copied!
X, y = df[\"smiles\"].values, df[\"expt\"].values\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2)\n\n# The Molfeat transformer seemingly integrates with Scikit-learn Pipeline!\npipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())])\nX, y = df[\"smiles\"].values, df[\"expt\"].values X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2) # The Molfeat transformer seemingly integrates with Scikit-learn Pipeline! pipe = Pipeline([(\"feat\", safe_transformers), (\"rf\", RandomForestRegressor())]) In\u00a0[121]: Copied!
with dm.without_rdkit_log():\n pipe.fit(X_train, y_train)\n score = pipe.score(X_test, y_test)\n y_pred = pipe.predict(X_test)\nwith dm.without_rdkit_log(): pipe.fit(X_train, y_train) score = pipe.score(X_test, y_test) y_pred = pipe.predict(X_test) In\u00a0[122]: Copied!
print(\"R2 score:\", score)\nprint(\"R2 score:\", score)
R2 score: 0.4971483821661925\nIn\u00a0[123]: Copied!
import matplotlib.pyplot as plt\n\nfig, ax = plt.subplots()\nax.scatter(y_test, y_pred)\nax.set_xlabel(\"Target\")\nax.set_ylabel(\"Preds\")\nimport matplotlib.pyplot as plt fig, ax = plt.subplots() ax.scatter(y_test, y_pred) ax.set_xlabel(\"Target\") ax.set_ylabel(\"Preds\") Out[123]:
Text(0, 0.5, 'Preds')
Not really a great result. Any other model in molfeat
would do better.
Because the SAFE model is not a standard HuggingFace transformers
model, we need to wrap it.
Why are we doing this? Because we want to leverage the structure of molfeat
and not have to write our own pooling for the model. This can be done by using the huggingface molecule transformer PretrainedHFTransformer
rather than the general purpose pretrained model class PretrainedMolTransformer
where we will have to define our own _embed
and _convert
functions.
We have multiple options here: we can override the _convert
method or even the _embed
method, but the best thing about molfeat
is how flexible it is and all the shortcuts it provides.
In this case, we just need to change the custom converter.
"},{"location":"tutorials/extracting-representation-molfeat.html#so-really-we-just-need-our-custom-converter","title":"so really we just need our custom converter\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#basic-test","title":"Basic Test\u00b6","text":""},{"location":"tutorials/extracting-representation-molfeat.html#tips","title":"Tips\u00b6","text":"None
molecules at some steps in the conversion to SAFE. This can happen if your slicing algorithm of choice is not working. In that case, please filter your datasets to remove molecules that fail the encoding steps first. You can always use the very robust safe.utils.convert_to_safe
, which augment default BRICS slicing with some graph partitioning algorithm.import safe as sf\nimport datamol as dm\n\ncelecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\"\ncelecoxib_mol = dm.to_mol(celecoxib)\n\ndisplay(dm.to_image(celecoxib_mol))\nimport safe as sf import datamol as dm celecoxib = \"Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1\" celecoxib_mol = dm.to_mol(celecoxib) display(dm.to_image(celecoxib_mol)) In\u00a0[3]: Copied!
safe_str = sf.encode(celecoxib_mol)\n\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\nRepresentation using 4 fragments\n
SAFE string are SMILES
Any SAFE string is a valid SMILES and can be read by RDKit without any decoding trick.
In\u00a0[4]: Copied!reconstructed = dm.to_mol(safe_str)\n\ndisplay(dm.to_image(reconstructed))\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nreconstructed = dm.to_mol(safe_str) display(dm.to_image(reconstructed)) assert dm.same_mol(celecoxib_mol, reconstructed)
SAFE supports randomization
You can generate randomized SAFE strings.
In\u00a0[5]: Copied!random_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True)\n\nprint(random_safe_str)\n\nreconstructed = dm.to_mol(safe_str)\n\nassert dm.same_mol(celecoxib_mol, reconstructed)\nrandom_safe_str = sf.encode(celecoxib_mol, canonical=False, randomize=True) print(random_safe_str) reconstructed = dm.to_mol(safe_str) assert dm.same_mol(celecoxib_mol, reconstructed)
c15ccc(S(N)(=O)=O)cc1.c16cc4nn15.C4(F)(F)F.c16ccc(C)cc1\n
Fragment order in SAFE does not matter
Any permutation of the fragment order in a SAFE string preserve the molecule identity
In\u00a0[6]: Copied!import numpy as np\n\nfragments = safe_str.split(\".\")\nrandomized_fragment_safe_str = np.random.permutation(fragments).tolist()\nrandomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str)\n\nprint(randomized_fragment_safe_str, safe_str)\nassert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)\nimport numpy as np fragments = safe_str.split(\".\") randomized_fragment_safe_str = np.random.permutation(fragments).tolist() randomized_fragment_safe_str = \".\".join(randomized_fragment_safe_str) print(randomized_fragment_safe_str, safe_str) assert dm.same_mol(celecoxib_mol, randomized_fragment_safe_str)
c14ccc(S(N)(=O)=O)cc1.c15cc3nn14.Cc1ccc5cc1.C3(F)(F)F c14ccc(S(N)(=O)=O)cc1.Cc1ccc5cc1.c15cc3nn14.C3(F)(F)F\n
Use your own slicing logic
By default SAFE strings are generated using BRICS
, however, the following slicers are also supported: hr, recap, mmpa, and attach.
Furthermore, you can also provide your own slicing algorithm, which should return pairs of atom indices corresponding to the bonds to break.
In\u00a0[7]: Copied!def my_slicer(mol):\n \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\"\n for bond in mol.GetBonds():\n if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()):\n yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())\ndef my_slicer(mol): \"\"\"Slice on non single bonds where at both atoms are in a distinct rings\"\"\" for bond in mol.GetBonds(): if bond.GetBondType() == dm.SINGLE_BOND and not bond.IsInRing() and (bond.GetBeginAtom().IsInRing() and bond.GetEndAtom().IsInRing()): yield (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) In\u00a0[9]: Copied!
safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer)\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\nsafe_str = sf.encode(celecoxib_mol, canonical=True, slicer=my_slicer) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c14cc(C(F)(F)F)nn13.c13ccc(S(N)(=O)=O)cc1.Cc1ccc4cc1\nRepresentation using 3 fragments\n
Or simply use a SMARTS or a list of SMARTS.
In\u00a0[11]: Copied!# The above is equivalent to using the following SMARTS:\nsmart_slicer = [\"[r]-;!@[r]\"]\nsafe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer)\nprint(safe_str)\nprint(f\"Representation using {len(safe_str.split('.'))} fragments\")\n# The above is equivalent to using the following SMARTS: smart_slicer = [\"[r]-;!@[r]\"] safe_str = sf.encode(celecoxib_mol, canonical=True, slicer=smart_slicer) print(safe_str) print(f\"Representation using {len(safe_str.split('.'))} fragments\")
c13cc(C(F)(F)F)nn14.c14ccc(S(N)(=O)=O)cc1.Cc1ccc3cc1\nRepresentation using 3 fragments\nIn\u00a0[13]: Copied!
safe_fragment = safe_str.split(\".\")\nsafe_fragment\nsafe_fragment = safe_str.split(\".\") safe_fragment Out[13]:
['c13cc(C(F)(F)F)nn14', 'c14ccc(S(N)(=O)=O)cc1', 'Cc1ccc3cc1']In\u00a0[14]: Copied!
# the following will fail\ndm.to_mol(safe_fragment[0])\n# the following will fail dm.to_mol(safe_fragment[0])
[11:20:14] SMILES Parse Error: unclosed ring for input: 'c13cc(C(F)(F)F)nn14'\nIn\u00a0[15]: Copied!
# while this works\nsf.decode(safe_fragment[0], as_mol=True)\n# while this works sf.decode(safe_fragment[0], as_mol=True) Out[15]: In\u00a0[16]: Copied!
# if you want to keep the attachment points, then use remove_dummies=False\nsf.decode(safe_fragment[0], as_mol=True, remove_dummies=False)\n# if you want to keep the attachment points, then use remove_dummies=False sf.decode(safe_fragment[0], as_mol=True, remove_dummies=False) Out[16]: In\u00a0[17]: Copied!
sf.to_image(safe_str)\nsf.to_image(safe_str) Out[17]:
There are 3 display modes for highlighting the fragments in a SAFE string. The difference between those modes is highlighted below using two different slicing algorithms.
Overlapping fragments
Note that because some fragments might match overlapping substructures of the molecule (for example, the same fragment appearing multiple times in the molecule), the highlighting might assign the same color to these fragments.
In\u00a0[18]: Copied!from IPython.display import display\nfrom ipywidgets import widgets, HBox\n\ndef display_image(safe_str):\n image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"lasso\", legend=\"lasso mode\").data.encode(), format='svg+xml')\n image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"fill\", legend=\"fill mode\").data.encode(), format='svg+xml')\n image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"color\", legend=\"color mode\").data.encode(), format='svg+xml')\n hbox = HBox([image_lasso, image_fill, image_color])\n display(hbox)\nfrom IPython.display import display from ipywidgets import widgets, HBox def display_image(safe_str): image_lasso = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"lasso\", legend=\"lasso mode\").data.encode(), format='svg+xml') image_fill = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"fill\", legend=\"fill mode\").data.encode(), format='svg+xml') image_color = widgets.Image(value=sf.to_image(safe_str, highlight_mode=\"color\", legend=\"color mode\").data.encode(), format='svg+xml') hbox = HBox([image_lasso, image_fill, image_color]) display(hbox) In\u00a0[19]: Copied!
# display for brics\nsafe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer=\"brics\")\ndisplay_image(safe_str_brics)\n# display for brics safe_str_brics = sf.encode(celecoxib_mol, canonical=True, slicer=\"brics\") display_image(safe_str_brics)
HBox(children=(Image(value=b'<svg xmlns=\"http://www.w3.org/2000/svg\" ...', format='svg+xml'), Image(value=b'<s\u2026In\u00a0[20]: Copied!
# display with HR\nsafe_str_hr = sf.encode(celecoxib_mol, canonical=True, slicer=\"mmpa\")\ndisplay_image(safe_str_hr)\n# display with HR safe_str_hr = sf.encode(celecoxib_mol, canonical=True, slicer=\"mmpa\") display_image(safe_str_hr)
HBox(children=(Image(value=b'<svg xmlns=\"http://www.w3.org/2000/svg\" ...', format='svg+xml'), Image(value=b'<s\u2026
The End !
"},{"location":"tutorials/getting-started.html#getting-started-with-safe","title":"Getting Started with SAFE\u00b6","text":"The SAFE encoding format is a rewriting of SMILES to ensure that any molecule can be written as a sequence of fragments where atoms or tokens corresponding to given fragments form a substring (ontiguous sequence) in the line notation representation.
SAFE addresses some of the limitations of SMILES strings when it comes to generative design:
Safe Others - native support for (sub)structure-constrained design - different generative models for different generative tasks - extensive substructure matching for filtering after generation - multiple steps generative process (e.g Liao et al. 2023 ) - graph based approaches with their limitations - any molecule generation as a simple NLP task (sequence completion or mask filling) - a single autoregressive sequence model for both linker generation and scaffold decoration. - complex training and decoding schemes for scaffold-constrained generation (e.g Ar\u00fas-Pous et al. 2020 ) - complex sampling algorithms for scaffold-constrained generation (e.g Langevin et al. 2020) - SAFE strings are SMILES strings - requires a different chemical language (e.g Krenn et al. 2022)"},{"location":"tutorials/getting-started.html#using-safe","title":"Using SAFE\u00b6","text":"In the following we will highlight how to use SAFE and some of the properties of SAFE strings.
"},{"location":"tutorials/getting-started.html#encoding","title":"Encoding\u00b6","text":"SAFE represents fragments
SAFE represents molecules as a set of N fragments written as [Fragment_1].[Fragment_i].[Fragment_N]
"},{"location":"tutorials/getting-started.html#decoding","title":"Decoding\u00b6","text":"Fragment order in SAFE does not matter
Each SAFE fragment
is a valid molecule itself; however, you need to use the decoder to recover molecules whose attachment points are not all fulfilled.
We provide a visualization module to display a safe string, with highlighting of all the fragments that compose it.
"},{"location":"tutorials/how-it-works.html","title":"How SAFE encoding works?","text":"In\u00a0[1]: Copied!import datamol as dm\n\nfrom rdkit import Chem\nfrom rdkit.Chem.Draw import rdDepictor\nfrom rdkit.Chem import rdChemReactions as rdr\nrdDepictor.SetPreferCoordGen(True)\nimport datamol as dm from rdkit import Chem from rdkit.Chem.Draw import rdDepictor from rdkit.Chem import rdChemReactions as rdr rdDepictor.SetPreferCoordGen(True) In\u00a0[2]: Copied!
smiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"]\nlegends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"]\ndm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True)\nsmiles = [\"c1ccccc1\", \"OC\", \"c1cc(*)ccc1\", \"O(*)C\", \"c1cc(*)ccc1.O(*)C\"] legends = [\"benzene\", \"methanol\", \"phenyl group\", \"Methoxy group\", \"composite\"] dm.viz.to_image([dm.to_mol(x) for x in smiles], legends=legends, n_cols=3, use_svg=True) Out[2]:
In the example above, we can see that anisole (methoxybenzene) can be represented as two fragments that can be connected given the proper attachment points. To achieve this, we want to attach two fragments together (the methoxy and the phenyl groups). In RDKit, this is usually done with chemical reactions. For convenience, we will prefer a standardized representation of attachment points that includes an atom mapping.
smiles = ['c1cc(*)ccc1.O(*)C', 'c1cc([*:1])ccc1.O([*:1])C']
dm.viz.to_image([dm.to_mol(x) for x in smiles], n_cols=len(smiles), use_svg=True)
To attach the two fragments, we can write a simple chemical transformation. Since SMARTS and SMILES syntax do not mix very well when it comes to *, we will use the isotopic representation [1*] instead of the atom-mapped [*:1].
rxn = rdr.ReactionFromSmarts(\"[1*][*:1].[1*][*:2]>>[*:1][*:2]\")\nrxn\nrxn = rdr.ReactionFromSmarts(\"[1*][*:1].[1*][*:2]>>[*:1][*:2]\") rxn Out[4]: In\u00a0[5]: Copied!
# replace the atom maps by isotopes
phenyl = "c1cc([*:1])ccc1".replace("[*:1]", "[1*]")
methoxy = "O([*:1])C".replace("[*:1]", "[1*]")

# run the reaction
prod = rxn.RunReactants((dm.to_mol(phenyl), dm.to_mol(methoxy)))
prod[0][0]
We can achieve the same result with the RDKit API in a slightly more tedious way.
replacement_sub = Chem.MolFromSmarts("[1*]")
prod = Chem.ReplaceSubstructs(dm.to_mol(phenyl), replacement_sub, dm.to_mol(methoxy), replacementConnectionPoint=0)
prod = dm.remove_dummies(prod[0], dummy="[1*]")
prod
But wait, could we attach the fragments using only string operations on the SMILES? It is not possible with a naive substring replacement alone, but recall that we just said numbers in SMILES represent connectivity points?
phenyl = \"c1cc([*:1])ccc1\"\nmethoxy = \"O([*:1])C\"\ncomposite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C\ncompo = dm.to_mol(composite)\nphenyl = \"c1cc([*:1])ccc1\" methoxy = \"O([*:1])C\" composite = phenyl + \".\" + methoxy # c1cc([*:1])ccc1.O([*:1])C compo = dm.to_mol(composite)
Since the connectivity point 1 is already used in the phenyl group, we need to open a new connectivity point: 2.
attached_composite = composite.replace("[*:1]", "2")
dm.to_mol(attached_composite)
[11:14:10] SMILES Parse Error: syntax error while parsing: c1cc(2)ccc1.O(2)C
[11:14:10] SMILES Parse Error: Failed parsing SMILES 'c1cc(2)ccc1.O(2)C' for input: 'c1cc(2)ccc1.O(2)C'
The previous line does not work because it violates the SMILES syntax: we are not taking into account the branching parentheses surrounding the attachment point. We could regenerate the SMILES, or scan the sequence and remove the parentheses where possible, but we want to limit ourselves to str.replace operations. So let's try again.
attached_composite = composite.replace("([*:1])", "2").replace("[*:1]", "2")
dm.to_image(attached_composite, legends=[attached_composite])
You can see that the anisole molecule is now represented as two "fragments", [Fragment1].[Fragment2]. That is what SAFE is about.
In summary, to build a SAFE string, we just need to follow the steps below (a minimal sketch putting them together follows):
1. Write each fragment with explicit attachment points (e.g. [*:1]).
2. Concatenate the fragments with the . separator.
3. Replace each pair of matching attachment points with an unused ring-closure number, dropping the surrounding branching parentheses.
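Here is a minimal sketch stringing those steps together with nothing but str.replace; naive_safe_join is a hypothetical helper written for this tutorial, not part of the safe package, and it only handles a single shared attachment point:

import datamol as dm

def naive_safe_join(frag_a: str, frag_b: str, closure: str = "2") -> str:
    """Attach two fragments that share the attachment point [*:1]."""
    # step 2: concatenate the fragments with the '.' separator
    composite = frag_a + "." + frag_b
    # step 3: replace the attachment points with an unused ring-closure number,
    # dropping the branching parentheses when present
    return composite.replace("([*:1])", closure).replace("[*:1]", closure)

safe_like = naive_safe_join("c1cc([*:1])ccc1", "O([*:1])C")
print(safe_like)                           # c1cc2ccc1.O2C
print(dm.to_smiles(dm.to_mol(safe_like)))  # canonical SMILES of anisole (e.g. COc1ccccc1)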
The End!
"},{"location":"tutorials/how-it-works.html#how-safe-encoding-works","title":"How SAFE encoding works?\u00b6","text":"The intuition behind safe is quite simple: we want to represent any molecule as a set of connected fragments
.
Let's start by revisiting some points about the SMILES syntax:
- An asterisk * in a SMILES string is usually employed to indicate any atom or an attachment point for any group. It is particularly useful for SMARTS matching.
- A number in the SMILES syntax indicates a connectivity point between two atoms; two-digit numbers must be preceded by %. This is partially explained in the ring section of the Wikipedia article on SMILES.
- A . in SMILES indicates the presence of additional fragments and is used to separate them. A good resource on the subject is the DAYLIGHT page.
We illustrate these points below!
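For instance, a quick sketch of these three rules with RDKit/datamol, complementing the notebook cells of this page:

import datamol as dm
from rdkit import Chem

dm.to_mol("C1CCCCC1")              # the two '1' digits bond the first and last carbons into a ring (cyclohexane)
dm.to_mol("C%10CCCCC%10")          # two-digit ring-closure numbers are written with '%'
dm.to_mol("c1ccccc1.OC")           # '.' separates two disconnected fragments (benzene and methanol)
Chem.MolFromSmarts("c1cc(*)ccc1")  # '*' marks a wildcard atom / attachment point, handy for SMARTS matching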
"}]} \ No newline at end of file diff --git a/main/sitemap.xml.gz b/main/sitemap.xml.gz index ac67dee..c3ca0da 100644 Binary files a/main/sitemap.xml.gz and b/main/sitemap.xml.gz differ