Go to kotaemon_template
Use the Duplicate function to create your own space.
This page is intended for end users who want to use the kotaemon tool for Question Answering on local documents. If you are a developer who wants to contribute to the project, please visit the development page.
Visit this guide.
"},{"location":"#installation-offline","title":"Installation (Offline)","text":""},{"location":"#download","title":"Download","text":"Download the kotaemon-app.zip
file from the latest release.
Unzip the downloaded file, then open the scripts folder and start the installer that matches your OS:
- run_windows.bat: just double click the file.
- run_macos.sh
- run_linux.sh: please run the script using bash run_linux.sh in your terminal.
To launch the app after initial setup or any change, simply run the run_* script again.
A browser window will open and greet you with this screen:
"},{"location":"#usage","title":"Usage","text":"For how to use the application, see Usage. This page will also be available to you within the application.
"},{"location":"#feedback","title":"Feedback","text":"Feel free to create a bug report or a feature request on our repo.
"},{"location":"about/","title":"About Kotaemon","text":""},{"location":"about/#about-kotaemon","title":"About Kotaemon","text":"An open-source tool for chatting with your documents. Built with both end users and developers in mind.
Source Code | Live Demo
User Guide | Developer Guide | Feedback
Dark Mode | Light Mode
"},{"location":"local_model/","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#setup-local-llms-embedding-models","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#prepare-local-models","title":"Prepare local models","text":""},{"location":"local_model/#note","title":"NOTE","text":"In the case of using Docker image, please replace http://localhost
with http://host.docker.internal
to correctly communicate with services on the host machine. See more details.
Install ollama and start the application.
Pull your models (e.g.):
ollama pull llama3.1:8b\nollama pull nomic-embed-text\n
Setup LLM and Embedding model on Resources tab with type OpenAI. Set these model parameters to connect to Ollama:
api_key: ollama\nbase_url: http://localhost:11434/v1/\nmodel: llama3.1:8b (for llm) | nomic-embed-text (for embedding)\n
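If you want to verify the endpoint outside kotaemon first, here is a minimal sanity-check sketch using the official openai Python client (it assumes pip install openai, a local Ollama on the default port, and the models pulled above):

from openai import OpenAI

client = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1/")

# Chat completion against the pulled LLM
chat = client.chat.completions.create(
    model="llama3.1:8b",
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(chat.choices[0].message.content)

# Embedding against the pulled embedding model
emb = client.embeddings.create(model="nomic-embed-text", input="hello world")
print(len(emb.data[0].embedding))  # embedding dimensionality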
"},{"location":"local_model/#oobaboogatext-generation-webui-openai-compatible-server","title":"oobabooga/text-generation-webui OpenAI compatible server","text":"Install oobabooga/text-generation-webui.
Follow the setup guide to download your models (GGUF, HF). Also take a look at the OpenAI compatible server docs for detailed instructions. Here is a short version:
# install sentence-transformer for embeddings creation\npip install sentence_transformers\n# change to text-generation-webui src dir\npython server.py --api\n
Use the Models
tab to download a new model and press Load.
Setup LLM and Embedding model on Resources tab with type OpenAI. Set these model parameters to connect to text-generation-webui
:
api_key: dummy\nbase_url: http://localhost:5000/v1/\nmodel: any\n
"},{"location":"local_model/#llama-cpp-python-server-llm-only","title":"llama-cpp-python server (LLM only)","text":"See llama-cpp-python OpenAI server.
Download any GGUF model weight on HuggingFace or other source. Place it somewhere on your local machine.
Run
LOCAL_MODEL=<path/to/GGUF> python scripts/serve_local.py\n
Setup LLM model on Resources tab with type OpenAI. Set these model parameters to connect to llama-cpp-python
:
api_key: dummy\nbase_url: http://localhost:8000/v1/\nmodel: model_name\n
"},{"location":"local_model/#use-local-models-for-rag","title":"Use local models for RAG","text":"ollama
)ollama
). Or, you can choose to disable this feature if your machine cannot handle a lot of parallel LLM requests at the same time.You are set! Start a new conversation to test your local RAG pipeline.
"},{"location":"online_install/","title":"Online install","text":""},{"location":"online_install/#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"Go to kotaemon_template
Use the Duplicate function to create your own space.
To add a model:
1. Navigate to the Resources tab.
2. Select the LLMs sub-tab.
3. Select the Add sub-tab.
4. Choose the model vendor (e.g. ChatOpenAI) and fill in its specifications.
5. Click Add to add the model.
6. Navigate to the Embedding Models sub-tab and repeat steps 3 to 5 to add an embedding model.
Alternatively, you can configure the models via the .env
file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
Currently, the following providers are supported:
"},{"location":"usage/#openai","title":"OpenAI","text":"In the .env
file, set the OPENAI_API_KEY
variable with your OpenAI API key to enable access to OpenAI's models. There are other variables that can be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.
OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
"},{"location":"usage/#azure-openai","title":"Azure OpenAI","text":"For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. Your might also need to provide your developments' name for the chat model and the embedding model depending on how you set up Azure development.
AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview # could be different for you\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo # change to your deployment name\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 # change to your deployment name\n
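To verify these Azure credentials independently of the app, here is a minimal sketch with the official openai Python client (it assumes pip install openai and that the variables above are exported into the environment):

import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ.get("OPENAI_API_VERSION", "2024-02-15-preview"),
)
resp = client.chat.completions.create(
    # For Azure, `model` is the deployment name, not the base model name.
    model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-35-turbo"),
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)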
"},{"location":"usage/#local-models","title":"Local models","text":"Pros:
Cons:
You can search for and download an LLM to run locally from the Hugging Face Hub. Currently, these model formats are supported:
You should choose a model whose size is smaller than your device's memory, leaving about 2 GB to spare. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.
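That rule of thumb is just a subtraction; as a quick worked check (plain Python, nothing kotaemon-specific):

def max_model_size_gb(available_ram_gb: float, headroom_gb: float = 2.0) -> float:
    """Largest model that fits while leaving some RAM to spare."""
    return max(available_ram_gb - headroom_gb, 0.0)

print(max_model_size_gb(12.0))  # 10.0, matching the example above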
Here are some recommendations and their size in memory:
To add a local model to the model pool, set the LOCAL_MODEL
variable in the .env
file to the path of the model file.
LOCAL_MODEL=<full path to your model file>\n
Here is how to get the full path of your model file: in your file explorer, right-click the model file and select Copy as Path.
In order to do QA on your documents, you need to upload them to the application first. Navigate to the File Index tab and you will see 2 sections: a file upload area and a list of uploaded files. Select your files, then click Upload and Index.
Now navigate back to the Chat tab. The chat tab is divided into 3 regions: a conversation settings panel, a chat panel, and an information panel. Each piece of retrieved evidence is shown with its relevance scores (marked as full-text search if retrieved from the full-text search DB). Generally, the score quality is LLM relevant score > Reranking score > Vector score. By default, the overall relevance score is taken directly from the LLM relevant score. Evidences are sorted based on their overall relevance score and on whether or not they have a citation.
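For intuition, a small sketch of that evidence ordering (the field names and exact tie-breaking are assumptions for illustration, not kotaemon's actual data model):

# Hypothetical evidence records; kotaemon's real objects differ.
evidences = [
    {"relevance": 0.72, "has_citation": False},
    {"relevance": 0.65, "has_citation": True},
    {"relevance": 0.90, "has_citation": True},
]
# Sort by citation presence first, then by overall relevance, both descending.
evidences.sort(key=lambda e: (e["has_citation"], e["relevance"]), reverse=True)
print(evidences[0])  # the cited, highest-relevance evidence comes first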
An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind.
Live Demo | Source Code
User Guide | Developer Guide | Feedback
"},{"location":"development/#introduction","title":"Introduction","text":"
This project serves as a functional RAG UI for both end users who want to do QA on their documents and developers who want to build their own RAG pipeline.
ollama
and llama-cpp-python
).+----------------------------------------------------------------------------+\n| End users: Those who use apps built with `kotaemon`. |\n| (You use an app like the one in the demo above) |\n| +----------------------------------------------------------------+ |\n| | Developers: Those who built with `kotaemon`. | |\n| | (You have `import kotaemon` somewhere in your project) | |\n| | +----------------------------------------------------+ | |\n| | | Contributors: Those who make `kotaemon` better. | | |\n| | | (You make PR to this repo) | | |\n| | +----------------------------------------------------+ | |\n| +----------------------------------------------------------------+ |\n+----------------------------------------------------------------------------+\n
This repository is under active development. Feedback, issues, and PRs are highly appreciated.
"},{"location":"development/#key-features","title":"Key Features","text":"GraphRAG
indexing pipeline is provided as an example.This document is intended for developers. If you just want to install and use the app as it is, please follow the non-technical User Guide. Use the most recent release .zip
to get the latest features and bug fixes.
We support lite and full versions of the Docker images. With full, the extra unstructured packages are installed as well; they add support for additional file types (.doc, .docx, ...) at the cost of a larger Docker image size. For most users, the lite image should work well.
To run the lite version:
docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-lite\n
To run the full version:
docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-full\n
Currently, two platforms are provided and tested: linux/amd64 and linux/arm64 (for newer Macs). Users can specify the platform by passing --platform in the docker run command. For example:
# To run docker with platform linux/arm64\ndocker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\n--platform linux/arm64 \\\nghcr.io/cinnamon/kotaemon:main-lite\n
If everything is set up fine, navigate to http://localhost:7860/
to access the web UI.
We use GHCR to store Docker images; all images can be found here.
"},{"location":"development/#without-docker","title":"Without Docker","text":"# optional (setup env)\nconda create -n kotaemon python=3.10\nconda activate kotaemon\n\n# clone this repo\ngit clone https://github.com/Cinnamon/kotaemon\ncd kotaemon\n\npip install -e \"libs/kotaemon[all]\"\npip install -e \"libs/ktem\"\n
Set up your environment variables (API keys, endpoints) in the .env file. Optional prebuilt assets can be placed in libs/ktem/ktem/assets/prebuilt. Then start the web server:
python app.py\n
The app will be automatically launched in your browser.
The default username / password are: admin / admin. You can set up additional users directly on the UI.
See Local model setup.
"},{"location":"development/#customize-your-application","title":"Customize your application","text":"By default, all application data are stored in ./ktem_app_data
folder. You can back up or copy this folder to move your installation to a new machine.
For advanced users or specific use cases, you can customize these files:
flowsettings.py
.env
flowsettings.py
","text":"This file contains the configuration of your application. You can use the example here as the starting point.
Notable settings# setup your preferred document store (with full-text search capabilities)\nKH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)\n\n# setup your preferred vectorstore (for vector-based search)\nKH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)\n\n# Enable / disable multimodal QA\nKH_REASONINGS_USE_MULTIMODAL=True\n\n# Setup your new reasoning pipeline or modify existing one.\nKH_REASONINGS = [\n    "ktem.reasoning.simple.FullQAPipeline",\n    "ktem.reasoning.simple.FullDecomposeQAPipeline",\n    "ktem.reasoning.react.ReactAgentPipeline",\n    "ktem.reasoning.rewoo.RewooAgentPipeline",\n]\n
"},{"location":"development/#env","title":".env
","text":"This file provides another way to configure your models and credentials.
Configure model via the .env fileAlternatively, you can configure the models via the .env
file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
Currently, the following providers are supported:
"},{"location":"development/#openai","title":"OpenAI","text":"In the .env
file, set the OPENAI_API_KEY
variable with your OpenAI API key to enable access to OpenAI's models. There are other variables that can be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.
OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
"},{"location":"development/#azure-openai","title":"Azure OpenAI","text":"For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. Your might also need to provide your developments' name for the chat model and the embedding model depending on how you set up Azure development.
AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002\n
"},{"location":"development/#local-models","title":"Local models","text":""},{"location":"development/#using-ollama-openai-compatible-server","title":"Using ollama OpenAI compatible server","text":"Install ollama and start the application.
Pull your model (e.g):
ollama pull llama3.1:8b\nollama pull nomic-embed-text\n
Set the model names on the web UI and set them as default.
"},{"location":"development/#using-gguf-with-llama-cpp-python","title":"Using GGUF with llama-cpp-python","text":"You can search and download a LLM to be ran locally from the Hugging Face Hub. Currently, these model formats are supported:
You should choose a model whose size is less than your device's memory and should leave about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.
Here are some recommendations and their size in memory:
Add a new LlamaCpp model with the provided model name on the web UI.
"},{"location":"development/#adding-your-own-rag-pipeline","title":"Adding your own RAG pipeline","text":""},{"location":"development/#custom-reasoning-pipeline","title":"Custom reasoning pipeline","text":"First, check the default pipeline implementation in here. You can make quick adjustment to how the default QA pipeline work.
Next, if you feel comfortable adding a new pipeline, add a new .py implementation in libs/ktem/ktem/reasoning/ and later include it in flowsettings.py to enable it on the UI.
Check sample implementation in libs/ktem/ktem/index/file/graph
(more instructions WIP).
"},{"location":"development/#developer-guide","title":"Developer guide","text":"Please refer to the Developer Guide for more details.
"},{"location":"development/#star-history","title":"Star History","text":""},{"location":"development/contributing/","title":"Contributing","text":""},{"location":"development/contributing/#contributing","title":"Contributing","text":""},{"location":"development/contributing/#setting-up","title":"Setting up","text":"Clone the repo
git clone git@github.com:Cinnamon/kotaemon.git\ncd kotaemon\n
Install the environment
Create a conda environment (python >= 3.10 is recommended)
conda create -n kotaemon python=3.10\nconda activate kotaemon\n\n# install dependencies\ncd libs/kotaemon\npip install -e \".[all]\"\n
Or run the installer (one of the scripts/run_*
scripts, depending on your OS); you will then have all the dependencies installed in a conda environment at install_dir/env
.
conda activate install_dir/env\n
Pre-commit
pre-commit install\n
Test
pytest tests\n
kotaemon
library focuses on the AI building blocks to implement a RAG-based QA application. It consists of base interfaces, core components and a list of utilities:
kotaemon
defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly defining this interface, a pipeline of steps can be easily constructed and orchestrated.kotaemon
implements (or wraps 3rd-party libraries like Langchain, llama-index,... when possible) commonly used components in kotaemon use cases. Some of these components are: LLM, vector store, document store, retriever... For a detailed list and description of these components, please refer to the API Reference section.kotaemon
provides utilities and tools that are usually needed in client project. For example, it provides a prompt engineering UI for AI developers in a project to quickly create a prompt engineering tool for DMs and QALs. It also provides a command to quickly spin up a project code base. For a full list and description of these utilities, please refer to the Utilities section.mindmap\n root((kotaemon))\n Base Interfaces\n Document\n LLMInterface\n RetrievedDocument\n BaseEmbeddings\n BaseChat\n BaseCompletion\n ...\n Core Components\n LLMs\n AzureOpenAI\n OpenAI\n Embeddings\n AzureOpenAI\n OpenAI\n HuggingFaceEmbedding\n VectorStore\n InMemoryVectorstore\n ChromaVectorstore\n Agent\n Tool\n DocumentStore\n ...\n Utilities\n Scaffold project\n PromptUI\n Documentation Support
"},{"location":"development/contributing/#common-conventions","title":"Common conventions","text":"__init__.py
.setup.py
are not pinned, you need to bump the version in order to use a new environment. That environment will then be cached and used by your subsequent commits within the PR, until you bump the version again
in your commit message. The CI will create a fresh environment to run your commit and then discard it.[ignore cache]
.__init__.py
and push a final commit not containing [ignore cache]
.A fundamental concept in kotaemon is \"component\".
Anything that isn't data or data structure is a \"component\". A component can be thought of as a step within a pipeline. It takes in some input, processes it, and returns an output, just the same as a Python function! The output will then become an input for the next component in a pipeline. In fact, a pipeline is just a component. More appropriately, a nested component: a component that makes use of one or more other components in the processing step. So in reality, there isn't a difference between a pipeline and a component! Because of that, in kotaemon, we will consider them the same as \"component\".
To define a component, you will:
kotaemon.base.BaseComponent
run
.The syntax of a component is as follow:
from kotaemon.base import BaseComponent\nfrom kotaemon.llms import LCAzureChatOpenAI\nfrom kotaemon.parsers import RegexExtractor\n\n\nclass FancyPipeline(BaseComponent):\n param1: str = \"This is param1\"\n param2: int = 10\n param3: float\n\n node1: BaseComponent # this is a node because of BaseComponent type annotation\n node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent\n node3: RegexExtractor # this is also a node bceause RegexExtractor subclasses BaseComponent\n\n def run(self, some_text: str):\n prompt = (self.param1 + some_text) * int(self.param2 + self.param3)\n llm_pred = self.node2(prompt).text\n matches = self.node3(llm_pred)\n return matches\n
Then this component can be used as follow:
llm = LCAzureChatOpenAI(endpoint=\"some-endpont\")\nextractor = RegexExtractor(pattern=[\"yes\", \"Yes\"])\n\ncomponent = FancyPipeline(\n param1=\"Hello\"\n param3=1.5\n node1=llm,\n node2=llm,\n node3=extractor\n)\ncomponent(\"goodbye\")\n
This way, we can define each operation as a reusable component, and use them to compose larger reusable components!
"},{"location":"development/create-a-component/#benefits-of-component","title":"Benefits of component","text":"By defining a component as above, we formally encapsulate all the necessary information inside a single class. This introduces several benefits:
The data & data structure components include:
Document
class.Layout-aware with table parsing PdfLoader
Output:
Document: text + metadata to identify whether it is table or not
- \"source\": source file name\n- \"type\": \"table\" or \"text\"\n- \"table_origin\": original table in markdown format (to be feed to LLM or visualize using external tools)\n- \"page_label\": page number in the original PDF document\n
Important: despite the name prompt engineering UI, this tool allows testers to test any kind of parameters that are exposed by developers. Prompt is one kind of param. There can be other type of params that testers can tweak (e.g. top_k, temperature...).
In the development process, developers typically build the pipeline. However, for use cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more effective. To facilitate this, kotaemon
offers a user-friendly prompt engineering UI that developers integrate into their pipelines. This enables non-technical members to adjust prompts and parameters, run experiments, and export results for optimization.
As of Sept 2023, there are 2 kinds of prompt engineering UI:
For simple pipeline, the supported client project workflow looks as follow:
$ kotaemon promptui export <module.path.piplineclass> --output <path/to/config/file.yml>
$ kotaemon promptui run <path/to/config/file.yml>
The prompt engineering UI prominently involves from step 2 to step 7 (step 1 is normally done by the developers, while step 7 happens exclusively in Excel file).
"},{"location":"development/utilities/#step-2-export-pipeline-to-config","title":"Step 2 - Export pipeline to config","text":"Command:
$ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>\n
where:
<module.path.pipelineclass>
is a dot-separated path to the pipeline. For example, if your pipeline can be accessed with from projectA.pipelines import AnsweringPipeline
, then this value is projectA.pipelines.AnsweringPipeline
.<path/to/config/file.yml>
is the target file path that the config will be exported to. If the config file already exists and contains information about other pipelines, the config of the current pipeline will be added alongside. If it contains information about the current pipeline (from the past), the old information will be replaced.
, and they will be ignored in the config generation process. Example:
class Pipeline(BaseComponent):\n param1: str = Param(default=\"hello\")\n param2: str = Param(default=\"goodbye\", ignore_ui=True)\n
Declared as above, and param1
will show up in the config YAML file, while param2
will not.
developers can further edit the config file in this step to get the most suitable UI (step 4) with their tasks. The exported config will have this overall schema:
<module.path.pipelineclass1>:\n params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.)\n inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.)\n outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.)\n logs: ... (Detail what information should show up in the log.)\n
"},{"location":"development/utilities/#input-and-params","title":"Input and params","text":"The inputs section have the overall schema as follow:
inputs:\n <input-variable-name-1>:\n component: <supported-UI-component>\n params: # this section is optional)\n value: <default-value>\n <input-variable-name-2>: ... # similar to above\nparams:\n <param-variable-name-1>: ... # similar to those in the inputs\n
The list of supported prompt UI and their corresponding gradio UI components:
COMPONENTS_CLASS = {\n \"text\": gr.components.Textbox,\n \"checkbox\": gr.components.CheckboxGroup,\n \"dropdown\": gr.components.Dropdown,\n \"file\": gr.components.File,\n \"image\": gr.components.Image,\n \"number\": gr.components.Number,\n \"radio\": gr.components.Radio,\n \"slider\": gr.components.Slider,\n}\n
"},{"location":"development/utilities/#outputs","title":"Outputs","text":"The outputs are a list of variables that we wish to show in the UI. Since in Python, the function output doesn't have variable name, so output declaration is a little bit different than input and param declaration:
outputs:\n - component: <supported-UI-component>\n step: <name-of-pipeline-step>\n item: <jsonpath way to retrieve the info>\n - ... # similar to above\n
where:
The logs show a list of sheetname and how to retrieve the desired information.
logs:\n <logname>:\n inputs:\n - name: <column name>\n step: <the pipeline step that we would wish to see the input>\n variable: <the variable in the step>\n - ...\n outputs:\n - name: <column name>\n step: <the pipeline step that we would wish to see the output>\n item: <how to retrieve the output of that step>\n
"},{"location":"development/utilities/#step-4-5-spin-up-prompt-engineering-ui-perform-prompt-engineering","title":"Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering","text":"Command:
$ kotaemon promptui run <path/to/config/file.yml>\n
This will generate an UI as follow:
where:
Upon clicking export, the users can download Excel file.
"},{"location":"development/utilities/#chat-pipeline","title":"Chat pipeline","text":"Chat pipeline workflow is different from simple pipeline workflow. In simple pipeline, each Run creates a set of output, input and params for users to compare. In chat pipeline, each Run is not a one-off run, but a long interactive session. Hence, the workflow is as follow:
@trducng
At high level, to add new indexing and reasoning pipeline:
BaseComponent
.flowsettings.py
.Then when python app.py
, the application will dynamically load those pipelines.
The below sections talk in more detail about how the pipelines should be constructed.
"},{"location":"pages/app/customize-flows/#define-a-pipeline-as-a-class","title":"Define a pipeline as a class","text":"In essence, a pipeline will subclass from kotaemon.base.BaseComponent
. Each pipeline has 2 main parts:
An example pipeline:
from kotaemon.base import BaseComponent\n\n\nclass SoSimple(BaseComponent):\n arg1: int\n arg2: str\n\n def run(self, arg3: str):\n return self.arg1 * self.arg2 + arg3\n
This pipeline is simple for demonstration purpose, but we can imagine pipelines with much more arguments, that can take other pipelines as arguments, and have more complicated logic in the run
method.
An indexing or reasoning pipeline is just a class subclass from BaseComponent
like above.
For more detail on this topic, please refer to Creating a Component
"},{"location":"pages/app/customize-flows/#run-signatures","title":"Run signatures","text":"Note: this section is tentative at the moment. We will finalize def run
function signature by latest early April.
The indexing pipeline:
def run(\n self,\n file_paths: str | Path | list[str | Path],\n reindex: bool = False,\n **kwargs,\n ):\n \"\"\"Index files to intermediate representation (e.g. vector, database...)\n\n Args:\n file_paths: the list of paths to files\n reindex: if True, files in `file_paths` that already exists in database\n should be reindex.\n \"\"\"\n
The reasoning pipeline:
def run(self, question: str, history: list, **kwargs) -> Document:\n \"\"\"Answer the question\n\n Args:\n question: the user input\n history: the chat history [(user_msg1, bot_msg1), (user_msg2, bot_msg2)...]\n\n Returns:\n kotaemon.base.Document: the final answer\n \"\"\"\n
"},{"location":"pages/app/customize-flows/#register-your-pipeline-to-ktem","title":"Register your pipeline to ktem","text":"To register your pipelines to ktem, you declare it in the flowsettings.py
file. This file locates at the current working directory where you start the ktem. In most use cases, it is this one.
KH_REASONING = [\"<python.module.path.to.the.reasoning.class>\"]\n\nKH_INDEX = \"<python.module.path.to.the.indexing.class>\"\n
You can register multiple reasoning pipelines to ktem by populating the KH_REASONING
list. The user can select which reasoning pipeline to use in their Settings page.
For now, there's only one supported index option for KH_INDEX
.
Make sure that your class is discoverable by Python.
"},{"location":"pages/app/customize-flows/#allow-users-to-customize-your-pipeline-in-the-app-settings","title":"Allow users to customize your pipeline in the app settings","text":"To allow the users to configure your pipeline, you need to declare what you allow the users to configure as a dictionary. ktem
will include them into the application settings.
In your pipeline class, add a classmethod get_user_settings
that returns a setting dictionary, add a classmethod get_info
that returns an info dictionary. Example:
class SoSimple(BaseComponent):\n\n ... # as above\n\n @classmethod\n def get_user_settings(cls) -> dict:\n \"\"\"The settings to the user\"\"\"\n return {\n \"setting_1\": {\n \"name\": \"Human-friendly name\",\n \"value\": \"Default value\",\n \"choices\": [(\"Human-friendly Choice 1\", \"choice1-id\"), (\"HFC 2\", \"choice2-id\")], # optional\n \"component\": \"Which Gradio UI component to render, can be: text, number, checkbox, dropdown, radio, checkboxgroup\"\n },\n \"setting_2\": {\n # follow the same rule as above\n }\n }\n\n @classmethod\n def get_info(cls) -> dict:\n \"\"\"Pipeline information for bookkeeping purpose\"\"\"\n return {\n \"id\": \"a unique id to differentiate this pipeline from other pipeline\",\n \"name\": \"Human-friendly name of the pipeline\",\n \"description\": \"Can be a short description of this pipeline\"\n }\n
Once adding these methods to your pipeline class, ktem
will automatically extract and add them to the settings.
Once ktem
runs your pipeline, it will call your classmethod get_pipeline
with the full user settings and expect to obtain the pipeline object. Within this get_pipeline
method, you implement all the necessary logic to initialize the pipeline object. Example:
class SoSimple(BaseComponent):\n    ...  # as above\n\n    @classmethod\n    def get_pipeline(cls, setting):\n        obj = cls(arg1=setting["reasoning.id.setting1"])\n        return obj\n
"},{"location":"pages/app/customize-flows/#reasoning-stream-output-to-ui","title":"Reasoning: Stream output to UI","text":"For fast user experience, you can stream the output directly to UI. This way, user can start observing the output as soon as the LLM model generates the 1st token, rather than having to wait the pipeline finishes to read the whole message.
To stream the output, you need to: (1) change the run function to async, and (2) stream the output tokens with self.report_output:
async def run(self, question: str, history: list, **kwargs) -> Document:\n    for char in "This is a long message":\n        self.report_output({"output": char})\n
The argument to self.report_output
is a dictionary that contains either or both of these 2 keys: "output", "evidence". The "output" string will be streamed to the chat message, and the "evidence" string will be streamed to the information panel.
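As a sketch of using both keys together (the token source here is hypothetical; only the report_output contract comes from above):

async def run(self, question: str, history: list, **kwargs) -> Document:
    # Hypothetical streaming call; substitute your LLM's actual streaming API.
    async for token in self.llm.astream(question):
        self.report_output({"output": token.text})  # streamed to the chat message
    # Streamed to the information panel.
    self.report_output({"evidence": "Sources: doc1.pdf (p. 3), doc2.pdf (p. 7)"})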
You can access users' collections of LLMs and embedding models with:
from ktem.embeddings.manager import embeddings\nfrom ktem.llms.manager import llms\n\n\nllm = llms.get_default()\nembedding_model = embeddings.get_default()\n
You can also allow the users to specifically select which LLMs or embedding models they want to use through the settings.
@classmethod\n    def get_user_settings(cls) -> dict:\n        from ktem.llms.manager import llms\n\n        return {\n            "citation_llm": {\n                "name": "LLM for citation",\n                "value": llms.get_default(),\n                "component": "dropdown",\n                "choices": list(llms.options().keys()),\n            },\n            ...\n        }\n
"},{"location":"pages/app/customize-flows/#optional-access-application-data","title":"Optional: Access application data","text":"You can access the user's application database, vector store as follow:
# get the database that contains the source files\nfrom ktem.db.models import Source, Index, Conversation, User\n\n# get the vector store\n
"},{"location":"pages/app/features/","title":"Features","text":""},{"location":"pages/app/features/#chat","title":"Chat","text":"The kotaemon focuses on question and answering over a corpus of data. Below is the gentle introduction about the chat functionality.
(6 man-days)
Description: each client has a dedicated user group. Each user group has an admin user who can do administrative tasks (e.g. creating user accounts in that user group...). The workflow for creating a new user group is as follows:
Expectation:
Condition:
(2 man-days)
Description: in the tenant management page, we can delete the selected user group. The user flow is as follows:
Expectation: when a user group is deleted, we expect to delete everything related to the user groups: domain, files, databases, caches, deployments.
"},{"location":"pages/app/functional-description/#user-management","title":"User management","text":""},{"location":"pages/app/functional-description/#create-user-account-for-admin-user","title":"Create user account (for admin user)","text":"(1 man-day)
Description: the admin user in the client's account can create user accounts for that user group. To create the new user, the client admin does:
Expectation:
^ $ * . [ ] { } ( ) ? - \" ! @ # % & / \\ , > < ' : ; | _ ~
+ =
Description: the admin user in the client's account can delete user accounts. Once a user account is deleted, he/she cannot log in to Aurora Platform.
Expectation:
Description: the admin user can change any information about the user account, including the password. To change user information:
Expectation:
(3 man-days)
Description: the users can sign in to Aurora Platform as follows:
(1 man-day)
Description: the user can sign out of Aurora Platform as follows:
Expectation: the user is completely signed out. The next time he/she uses the Aurora Platform, he/she has to log in again.
"},{"location":"pages/app/functional-description/#change-password","title":"Change password","text":"Description: the user can change their password as follow:
Expectation:
Description: the Aurora Platform focuses on question answering over the uploaded data. Each chat has the following components:
The chat workflow looks as follows:
Expectation:
Description: users can jump around between different conversations. They can see the list of all conversations, can select an old conversation, and continue the chat under the context of the old conversation. The switching workflow is like this:
Expectation:
Description: the user can explicitly start a new conversation with the chatbot:
Expectation:
Description: user can rename the chatbot by typing the name and clicking on the Rename button next to it.
Condition:
Description: user can delete the existing conversation as follows:
The file management allows users to upload, list and delete files that they upload to the Aurora Platform.
"},{"location":"pages/app/functional-description/#upload-file","title":"Upload file","text":"Description: the user can upload files to the Aurora Platform. The uploaded files will be served as context for our chatbot to refer to when it converses with the user. To upload file, the user:
Options:
Condition:
Description: the user can know which files are on the system by:
Description: users can delete files from this UI to free up the space, or to remove outdated information. To remove the files:
Expectation: once the file is deleted:
ktem
provides user management as an extension. To enable user management, in your flowsettings.py
, set the following variables:
KH_FEATURE_USER_MANAGEMENT
: True to enable.KH_FEATURE_USER_MANAGEMENT_ADMIN
: the admin username. This user will be created when the app 1st start.KH_FEATURE_USER_MANAGEMENT_PASSWORD
: the admin password. This value accompanies the admin username.Once enabled, you have access to the following features:
The file index stores files in a local folder and index them for retrieval. This file index provides the following infrastructure to support the indexing:
The indexing and retrieval pipelines are encouraged to use the above software infrastructure.
"},{"location":"pages/app/index/file/#indexing-pipeline","title":"Indexing pipeline","text":"The ktem has default indexing pipeline: ktem.index.file.pipelines.IndexDocumentPipeline
.
This default pipeline works as follow:
You can customize this default pipeline if your indexing process is close to the default pipeline. You can create your own indexing pipeline if there are too much different logic.
"},{"location":"pages/app/index/file/#customize-the-default-pipeline","title":"Customize the default pipeline","text":"The default pipeline provides the contact points in flowsettings.py
.
FILE_INDEX_PIPELINE_FILE_EXTRACTORS
. Supply overriding file extractor, based on file extension. Example: {\".pdf\": \"path.to.PDFReader\", \".xlsx\": \"path.to.ExcelReader\"}
FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE
. The expected number of characters of each text segment. Example: 1024.FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP
. The expected number of characters that consecutive text segments should overlap with each other. Example: 256.Your indexing pipeline will subclass BaseFileIndexIndexing
.
You should define the following methods:
run(self, file_paths)
: run the indexing given the pipelineget_pipeline(cls, user_settings, index_settings)
: return the fully-initialized pipeline, ready to be used by ktem.user_settings
: is a dictionary contains user settings (e.g. {\"pdf_mode\": True, \"num_retrieval\": 5}
). You can declare these settings in the get_user_settings
classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your get_pipeline
method.index_settings
: is a dictionary. Currently it's empty for File Index.get_user_settings
: to declare user settings, return a dictionary.By subclassing BaseFileIndexIndexing
, You will have access to the following resources:
self._Source
: the source tableself._Index
: the index tableself._VS
: the vector storeself._DS
: the docstoreOnce you have prepared your pipeline, register it in flowsettings.py
: FILE_INDEX_PIPELINE = \"<python.path.to.your.pipeline>\"
.
The ktem has default retrieval pipeline: ktem.index.file.pipelines.DocumentRetrievalPipeline
. This pipeline works as follow:
Your retrieval pipeline will subclass BaseFileIndexRetriever
. The retriever has the same database, vectorstore and docstore accesses like the indexing pipeline.
You should define the following methods:
run(self, query, file_ids)
: retrieve relevant documents relating to the query. If file_ids
is given, you should restrict your search within these file_ids
.get_pipeline(cls, user_settings, index_settings, selected)
: return the fully-initialized pipeline, ready to be used by ktem.user_settings
: is a dictionary contains user settings (e.g. {\"pdf_mode\": True, \"num_retrieval\": 5}
). You can declare these settings in the get_user_settings
classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your get_pipeline
method.index_settings
: is a dictionary. Currently it's empty for File Index.selected
: a list of file ids selected by user. If user doesn't select anything, this variable will be None.get_user_settings
: to declare user settings, return a dictionary.Once you build the retrieval pipeline class, you can register it in flowsettings.py
: FILE_INDEXING_RETRIEVER_PIPELIENS = [\"path.to.retrieval.pipelie\"]
. Because there can be multiple parallel pipelines within an index, this variable takes a list of string rather than a string.
There are 3 kinds of settings in ktem
, geared towards different stakeholders for different use cases:
ktem
to your customers, or if you build extension for ktem
for developers. These settings are declared inside flowsettings.py
.ktem
allows developers to extend the index and the reasoning pipeline. In many cases, these components can have settings that should be modified by users at run-time, (e.g. topk
, chunksize
...). These are the user settings.
ktem
allows developers to declare such user settings in their code. Once declared, ktem
will render them in a Settings page.
There are 2 places that ktem
looks for declared user settings. You can refer to the respective pages.
A collection of settings is a dictionary of type dict[str, dict]
, where the key is a setting id, and the value is the description of the setting.
settings = {\n \"topk\": {\n \"name\": \"Top-k chunks\",\n \"value\": 10,\n \"component\": \"number\",\n },\n \"lang\": {\n \"name\": \"Languages\",\n \"value\": \"en\",\n \"component\": \"dropdown\",\n \"choices\": [(\"en\", \"English\"), (\"cn\", \"Chinese\")],\n }\n}\n
Each setting description must have:
component: the UI component to render such setting on the UI. Available:
export(export_path, output)\n
Export a pipeline to a config file
Source code inlibs/kotaemon/kotaemon/cli.py
@promptui.command()\n@click.argument(\"export_path\", nargs=1)\n@click.option(\"--output\", default=\"promptui.yml\", show_default=True, required=False)\ndef export(export_path, output):\n \"\"\"Export a pipeline to a config file\"\"\"\n import sys\n\n from theflow.utils.modules import import_dotted_string\n\n from kotaemon.contribs.promptui.config import export_pipeline_to_config\n\n sys.path.append(os.getcwd())\n cls = import_dotted_string(export_path, safe=False)\n export_pipeline_to_config(cls, output)\n check_config_format(output)\n
"},{"location":"reference/cli/#cli.run","title":"run","text":"run(run_path, share, username, password, appname, port)\n
Run the UI from a config file
Examples:
\n# Run with default config file\n$ kh promptui run\n\n\n# Run with username and password supplied\n$ kh promptui run --username admin --password password\n\n\n# Run with username and prompted password\n$ kh promptui run --username admin\n\n# Run and share to promptui\n# kh promptui run --username admin --password password --share --appname hey --port 7861\n
Source code in libs/kotaemon/kotaemon/cli.py
@promptui.command()\n@click.argument(\"run_path\", required=False, default=\"promptui.yml\")\n@click.option(\n \"--share\",\n is_flag=True,\n show_default=True,\n default=False,\n help=\"Share the app through Gradio. Requires --username to enable authentication.\",\n)\n@click.option(\n \"--username\",\n required=False,\n help=(\n \"Username for the user. If not provided, the promptui will not have \"\n \"authentication.\"\n ),\n)\n@click.option(\n \"--password\",\n required=False,\n help=\"Password for the user. If not provided, will be prompted.\",\n)\n@click.option(\n \"--appname\",\n required=False,\n help=\"The share app subdomain. Requires --share and --username\",\n)\n@click.option(\n \"--port\",\n required=False,\n help=\"Port to run the app. If not provided, will $GRADIO_SERVER_PORT (7860)\",\n)\ndef run(run_path, share, username, password, appname, port):\n \"\"\"Run the UI from a config file\n\n Examples:\n\n \\b\n # Run with default config file\n $ kh promptui run\n\n \\b\n # Run with username and password supplied\n $ kh promptui run --username admin --password password\n\n \\b\n # Run with username and prompted password\n $ kh promptui run --username admin\n\n # Run and share to promptui\n # kh promptui run --username admin --password password --share --appname hey \\\n --port 7861\n \"\"\"\n import sys\n\n from kotaemon.contribs.promptui.ui import build_from_dict\n\n sys.path.append(os.getcwd())\n\n check_config_format(run_path)\n demo = build_from_dict(run_path)\n\n params: dict = {}\n if username is not None:\n if password is not None:\n auth = (username, password)\n else:\n auth = (username, click.prompt(\"Password\", hide_input=True))\n params[\"auth\"] = auth\n\n port = int(port) if port else int(os.getenv(\"GRADIO_SERVER_PORT\", \"7860\"))\n params[\"server_port\"] = port\n\n if share:\n if username is None:\n raise ValueError(\n \"Username must be provided to enable authentication for sharing\"\n )\n if appname:\n from kotaemon.contribs.promptui.tunnel import Tunnel\n\n tunnel = Tunnel(\n appname=str(appname), username=str(username), local_port=port\n )\n url = tunnel.run()\n print(f\"App is shared at {url}\")\n else:\n params[\"share\"] = True\n print(\"App is shared at Gradio\")\n\n demo.launch(**params)\n
"},{"location":"reference/cli/#cli.makedoc","title":"makedoc","text":"makedoc(module, output, separation_level)\n
Make documentation for module module
Example:
\n# Make component documentation for kotaemon library\n$ kh makedoc kotaemon\n
Source code in libs/kotaemon/kotaemon/cli.py
@main.command()\n@click.argument(\"module\", required=True)\n@click.option(\n \"--output\", default=\"docs.md\", required=False, help=\"The output markdown file\"\n)\n@click.option(\n \"--separation-level\", required=False, default=1, help=\"Organize markdown layout\"\n)\ndef makedoc(module, output, separation_level):\n \"\"\"Make documentation for module `module`\n\n Example:\n\n \\b\n # Make component documentation for kotaemon library\n $ kh makedoc kotaemon\n \"\"\"\n from kotaemon.contribs.docs import make_doc\n\n make_doc(module, output, separation_level)\n print(f\"Documentation exported to {output}\")\n
"},{"location":"reference/cli/#cli.start_project","title":"start_project","text":"start_project(template)\n
Start a project from a template.
Important: the value for --template corresponds to the name of the template folder, which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates The default value is \"project-default\", which should work when you are starting a client project.
Source code inlibs/kotaemon/kotaemon/cli.py
@main.command()\n@click.option(\n \"--template\",\n default=\"project-default\",\n required=False,\n help=\"Template name\",\n show_default=True,\n)\ndef start_project(template):\n \"\"\"Start a project from a template.\n\n Important: the value for --template corresponds to the name of the template folder,\n which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates\n The default value is \"project-default\", which should work when you are starting a\n client project.\n \"\"\"\n\n print(\"Retrieving template...\")\n os.system(\n \"cookiecutter git@github.com:Cinnamon/kotaemon.git \"\n f\"--directory='templates/{template}'\"\n )\n
"},{"location":"reference/agents/","title":"Agents","text":""},{"location":"reference/agents/#agents.BaseAgent","title":"BaseAgent","text":" Bases: BaseComponent
Define base agent interface
Source code inlibs/kotaemon/kotaemon/agents/base.py
class BaseAgent(BaseComponent):\n \"\"\"Define base agent interface\"\"\"\n\n name: str = Param(help=\"Name of the agent.\")\n agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n description: str = Param(\n help=(\n \"Description used to tell the model how/when/why to use the agent. You can\"\n \" provide few-shot examples as a part of the description. This will be\"\n \" input to the prompt of LLM.\"\n )\n )\n llm: Optional[BaseLLM] = Node(\n help=(\n \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n \" interface.\"\n )\n )\n prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n help=\"A prompt template or a dict to supply different prompt to the agent\"\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [],\n help=\"List of plugins / tools to be used in the agent\",\n )\n\n @staticmethod\n def safeguard_run(run_func, *args, **kwargs):\n def wrapper(self, *args, **kwargs):\n try:\n return run_func(self, *args, **kwargs)\n except Exception as e:\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"failed\",\n error=str(e),\n )\n\n return wrapper\n\n def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n\n def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/#agents.BaseAgent.add_tools","title":"add_tools","text":"add_tools(tools)\n
Helper method to add tools and update agent state if needed
Source code inlibs/kotaemon/kotaemon/agents/base.py
def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n
"},{"location":"reference/agents/#agents.BaseAgent.run","title":"run","text":"run(*args, **kwargs)\n
Run the component.
Source code inlibs/kotaemon/kotaemon/agents/base.py
def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/#agents.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:
Name Type Description Defaultreturn_values
The return values of the agent.
requiredlog
The log message.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
"},{"location":"reference/agents/#agents.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:
Name Type Description Defaulttext
The text output from the agent.
requiredagent_type
The type of agent.
requiredstatus
The status after executing the agent.
requirederror
The error message if any.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
"},{"location":"reference/agents/#agents.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/#agents.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/#agents.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/#agents.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/#agents.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/#agents.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:
Name Type Description Defaultitem
Any The item to log.
requiredtitle
str, optional The title of the panel, defaults to \"Output\".
'Output'
stream
bool, optional
False
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/#agents.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/#agents.LangchainAgent","title":"LangchainAgent","text":" Bases: BaseAgent
Wrapper for Langchain Agent
Source code inlibs/kotaemon/kotaemon/agents/langchain_based.py
class LangchainAgent(BaseAgent):\n \"\"\"Wrapper for Langchain Agent\"\"\"\n\n name: str = \"LangchainAgent\"\n agent_type: AgentType\n description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n AGENT_TYPE_MAP = {\n AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n }\n agent: Optional[LCAgentExecutor] = None\n\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n\n if self.agent_type not in self.AGENT_TYPE_MAP:\n raise NotImplementedError(\n f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n )\n self.update_agent_tools()\n\n def update_agent_tools(self):\n assert isinstance(self.llm, (ChatLLM, LLM))\n langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n # a fix for search_doc tool name:\n # use \"Intermediate Answer\" for self-ask agent\n found_search_tool = False\n if self.agent_type == AgentType.self_ask:\n for plugin in langchain_plugins:\n if plugin.name == \"search_doc\":\n plugin.name = \"Intermediate Answer\"\n langchain_plugins = [plugin]\n found_search_tool = True\n break\n\n if self.agent_type != AgentType.self_ask or found_search_tool:\n # reinit Langchain AgentExecutor\n self.agent = initialize_agent(\n langchain_plugins,\n self.llm.to_langchain_format(),\n agent=self.AGENT_TYPE_MAP[self.agent_type],\n handle_parsing_errors=True,\n verbose=True,\n )\n\n def add_tools(self, tools: List[BaseTool]) -> None:\n super().add_tools(tools)\n self.update_agent_tools()\n return\n\n def run(self, instruction: str) -> AgentOutput:\n assert (\n self.agent is not None\n ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n # Langchain AgentExecutor call\n output = self.agent(instruction)[\"output\"]\n\n return AgentOutput(\n text=output,\n agent_type=self.agent_type,\n status=\"finished\",\n )\n
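A construction sketch; the ChatOpenAI wrapper, its parameters, and the import paths are assumptions to adapt to your installation:
from kotaemon.agents import AgentType, LangchainAgent, WikipediaTool\nfrom kotaemon.llms import ChatOpenAI  # assumed LLM wrapper\n\nagent = LangchainAgent(\n    llm=ChatOpenAI(model=\"gpt-4o-mini\"),  # hypothetical model name\n    agent_type=AgentType.react,  # mapped to ZERO_SHOT_REACT_DESCRIPTION\n    plugins=[WikipediaTool()],\n)\noutput = agent.run(\"Who founded Wikipedia?\")\nprint(output.text)\n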
"},{"location":"reference/agents/#agents.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class inheriting from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n 
tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. 
Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
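A run() sketch under the same assumptions about the LLM wrapper:
from kotaemon.agents import ReactAgent, WikipediaTool\nfrom kotaemon.llms import ChatOpenAI  # assumed LLM wrapper\n\nagent = ReactAgent(\n    llm=ChatOpenAI(model=\"gpt-4o-mini\"),  # hypothetical model name\n    plugins=[WikipediaTool()],\n    max_iterations=5,\n)\noutput = agent.run(\"In which year was the founder of Wikipedia born?\")\nprint(output.status, output.text)\n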
"},{"location":"reference/agents/#agents.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/#agents.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
Name Type Description Defaultinstruction
Instruction to run the agent with.
requiredmax_iterations
Maximum number of reasoning iterations; if omitted, falls back to the agent's max_iterations attribute (default 5).
None
Returns an AgentOutput object.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/#agents.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Stream the agent with the given instruction.
Parameters:
Name Type Description Defaultinstruction
Instruction to run the agent with.
requiredmax_iterations
Maximum number of reasoning iterations; if omitted, falls back to the agent's max_iterations attribute (default 5).
None
Yields AgentOutput objects as reasoning progresses (stream() is a generator).
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
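Because stream() is a generator, consume it in a loop; a sketch, assuming an agent built as in the ReactAgent example above:
for step in agent.stream(\"Who wrote Hamlet?\"):\n    if step.status == \"thinking\":\n        print(\"intermediate:\", step.intermediate_steps)\n    else:  # \"finished\" or \"stopped\"\n        print(\"final:\", step.text)\n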
"},{"location":"reference/agents/#agents.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class inheriting from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).
Source code inlibs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/#agents.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code inlibs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/#agents.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Stream the agent with a given instruction.
Source code inlibs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/#agents.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
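A minimal custom tool sketch based on this interface (EchoTool and EchoArgs are illustrative names):
from typing import Optional, Type\n\nfrom pydantic import BaseModel\n\nfrom kotaemon.agents import BaseTool\n\n\nclass EchoArgs(BaseModel):\n    query: str\n\n\nclass EchoTool(BaseTool):\n    name: str = \"echo\"\n    description: str = \"Repeat the input back. Input should be plain text.\"\n    args_schema: Optional[Type[BaseModel]] = EchoArgs\n\n    def _run_tool(self, query: str) -> str:\n        return f\"echo: {query}\"\n\n\nprint(EchoTool().run(\"hello\"))  # echo: hello\n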
"},{"location":"reference/agents/#agents.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/#agents.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as a part of the description. This will be included in the prompt of the LLM.
"},{"location":"reference/agents/#agents.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/#agents.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/#agents.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/#agents.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/#agents.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
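Raising ToolException inside _run_tool turns into an observation instead of a crash when handle_tool_error is set; a sketch (the ToolException import path is an assumption):
from kotaemon.agents import BaseTool\nfrom kotaemon.agents.tools.base import ToolException  # assumed location\n\n\nclass FlakyTool(BaseTool):\n    name: str = \"flaky\"\n    description: str = \"Always fails, for demonstration.\"\n    handle_tool_error: bool = True\n\n    def _run_tool(self, *args, **kwargs):\n        raise ToolException(\"upstream service unavailable\")\n\n\nprint(FlakyTool().run(\"anything\"))  # upstream service unavailable\n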
"},{"location":"reference/agents/#agents.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/#agents.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around another BaseComponent so it can be used as a tool
Parameters:
Name Type Description Defaultcomponent
BaseComponent-based component to wrap
requiredpostprocessor
Optional postprocessor for the component output
required Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
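A sketch wrapping a toy component (UpperCase is illustrative; the BaseComponent import path is an assumption):
from kotaemon.agents import ComponentTool\nfrom kotaemon.base import BaseComponent  # assumed import path\n\n\nclass UpperCase(BaseComponent):\n    \"\"\"Toy component that upper-cases its input.\"\"\"\n\n    def run(self, text: str) -> str:\n        return text.upper()\n\n\ntool = ComponentTool(\n    name=\"upper\",\n    description=\"Upper-case the input text.\",\n    component=UpperCase(),\n    postprocessor=lambda s: s + \"!\",\n)\nprint(tool.run(\"hello\"))  # HELLO!\n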
"},{"location":"reference/agents/#agents.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code inlibs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
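Direct usage sketch (the underlying Wiki docstore needs the wikipedia client package installed, an assumption):
from kotaemon.agents import WikipediaTool\n\ntool = WikipediaTool()\nevidence = tool.run(\"Alan Turing\")\nprint(evidence)\n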
"},{"location":"reference/agents/base/","title":"Base","text":""},{"location":"reference/agents/base/#agents.base.BaseAgent","title":"BaseAgent","text":" Bases: BaseComponent
Define base agent interface
Source code inlibs/kotaemon/kotaemon/agents/base.py
class BaseAgent(BaseComponent):\n \"\"\"Define base agent interface\"\"\"\n\n name: str = Param(help=\"Name of the agent.\")\n agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n description: str = Param(\n help=(\n \"Description used to tell the model how/when/why to use the agent. You can\"\n \" provide few-shot examples as a part of the description. This will be\"\n \" input to the prompt of LLM.\"\n )\n )\n llm: Optional[BaseLLM] = Node(\n help=(\n \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n \" interface.\"\n )\n )\n prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n help=\"A prompt template or a dict to supply different prompt to the agent\"\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [],\n help=\"List of plugins / tools to be used in the agent\",\n )\n\n @staticmethod\n def safeguard_run(run_func, *args, **kwargs):\n def wrapper(self, *args, **kwargs):\n try:\n return run_func(self, *args, **kwargs)\n except Exception as e:\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"failed\",\n error=str(e),\n )\n\n return wrapper\n\n def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n\n def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
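A minimal subclass sketch implementing run() (EchoAgent is an illustrative name):
from kotaemon.agents import BaseAgent\nfrom kotaemon.agents.io import AgentOutput, AgentType\n\n\nclass EchoAgent(BaseAgent):\n    name: str = \"EchoAgent\"\n    agent_type: AgentType = AgentType.vanilla\n    description: str = \"Returns the instruction unchanged.\"\n\n    def run(self, instruction: str) -> AgentOutput:\n        return AgentOutput(\n            text=instruction,\n            agent_type=self.agent_type,\n            status=\"finished\",\n        )\n\n\nprint(EchoAgent().run(\"ping\").text)  # ping\n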
"},{"location":"reference/agents/base/#agents.base.BaseAgent.add_tools","title":"add_tools","text":"add_tools(tools)\n
Helper method to add tools and update agent state if needed
Source code inlibs/kotaemon/kotaemon/agents/base.py
def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n
"},{"location":"reference/agents/base/#agents.base.BaseAgent.run","title":"run","text":"run(*args, **kwargs)\n
Run the component.
Source code inlibs/kotaemon/kotaemon/agents/base.py
def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/agents/langchain_based/#agents.langchain_based.LangchainAgent","title":"LangchainAgent","text":" Bases: BaseAgent
Wrapper for Langchain Agent
Source code inlibs/kotaemon/kotaemon/agents/langchain_based.py
class LangchainAgent(BaseAgent):\n \"\"\"Wrapper for Langchain Agent\"\"\"\n\n name: str = \"LangchainAgent\"\n agent_type: AgentType\n description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n AGENT_TYPE_MAP = {\n AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n }\n agent: Optional[LCAgentExecutor] = None\n\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n\n if self.agent_type not in self.AGENT_TYPE_MAP:\n raise NotImplementedError(\n f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n )\n self.update_agent_tools()\n\n def update_agent_tools(self):\n assert isinstance(self.llm, (ChatLLM, LLM))\n langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n # a fix for search_doc tool name:\n # use \"Intermediate Answer\" for self-ask agent\n found_search_tool = False\n if self.agent_type == AgentType.self_ask:\n for plugin in langchain_plugins:\n if plugin.name == \"search_doc\":\n plugin.name = \"Intermediate Answer\"\n langchain_plugins = [plugin]\n found_search_tool = True\n break\n\n if self.agent_type != AgentType.self_ask or found_search_tool:\n # reinit Langchain AgentExecutor\n self.agent = initialize_agent(\n langchain_plugins,\n self.llm.to_langchain_format(),\n agent=self.AGENT_TYPE_MAP[self.agent_type],\n handle_parsing_errors=True,\n verbose=True,\n )\n\n def add_tools(self, tools: List[BaseTool]) -> None:\n super().add_tools(tools)\n self.update_agent_tools()\n return\n\n def run(self, instruction: str) -> AgentOutput:\n assert (\n self.agent is not None\n ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n # Langchain AgentExecutor call\n output = self.agent(instruction)[\"output\"]\n\n return AgentOutput(\n text=output,\n agent_type=self.agent_type,\n status=\"finished\",\n )\n
"},{"location":"reference/agents/utils/","title":"Utils","text":""},{"location":"reference/agents/utils/#agents.utils.get_plugin_response_content","title":"get_plugin_response_content","text":"get_plugin_response_content(output)\n
Helper that extracts the text content of a plugin/tool response for AgentOutput
Source code inlibs/kotaemon/kotaemon/agents/utils.py
def get_plugin_response_content(output) -> str:\n \"\"\"\n Wrapper for AgentOutput content return\n \"\"\"\n if isinstance(output, Document):\n return output.text\n else:\n return str(output)\n
"},{"location":"reference/agents/utils/#agents.utils.calculate_cost","title":"calculate_cost","text":"calculate_cost(model_name, prompt_token, completion_token)\n
Calculate the cost of a prompt and completion.
Returns:
Name Type Description
float
float
Cost for the given model and token counts (currently a stub that always returns 0.0)
Source code inlibs/kotaemon/kotaemon/agents/utils.py
def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float:\n \"\"\"\n Calculate the cost of a prompt and completion.\n\n Returns:\n float: Cost of the provided model name with provided token information\n \"\"\"\n # TODO: to be implemented\n return 0.0\n
"},{"location":"reference/agents/io/","title":"Io","text":""},{"location":"reference/agents/io/#agents.io.AgentAction","title":"AgentAction dataclass
","text":"Agent's action to take.
Parameters:
Name Type Description Defaulttool
str
The tool to invoke.
requiredtool_input
Union[str, dict]
The input to the tool.
requiredlog
str
The log message.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
@dataclass\nclass AgentAction:\n \"\"\"Agent's action to take.\n\n Args:\n tool: The tool to invoke.\n tool_input: The input to the tool.\n log: The log message.\n \"\"\"\n\n tool: str\n tool_input: Union[str, dict]\n log: str\n
"},{"location":"reference/agents/io/#agents.io.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:
Name Type Description Defaultreturn_values
The return values of the agent.
requiredlog
The log message.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
"},{"location":"reference/agents/io/#agents.io.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:
Name Type Description Defaulttext
The text output from the agent.
requiredagent_type
The type of agent.
requiredstatus
The status after executing the agent.
requirederror
The error message if any.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
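How the three io types fit together in one reasoning step, as a sketch:
from kotaemon.agents.io import AgentAction, AgentFinish, AgentOutput, AgentType\n\naction = AgentAction(tool=\"wikipedia\", tool_input=\"Alan Turing\", log=\"Thought: look it up\")\nfinish = AgentFinish(return_values={\"output\": \"Turing was born in 1912.\"}, log=\"Final Answer: ...\")\noutput = AgentOutput(\n    text=finish.return_values[\"output\"],\n    agent_type=AgentType.react,\n    status=\"finished\",\n)\nprint(output.text)\n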
"},{"location":"reference/agents/io/#agents.io.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:
item (Any): The item to log. Required.
title (str, optional): The title of the panel, defaults to "Output".
stream (bool, optional): Defaults to False.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/","title":"Base","text":""},{"location":"reference/agents/io/base/#agents.io.base.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
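Since AgentType is a plain Enum, values can be compared and looked up directly; for example (import path assumed as above):

from kotaemon.agents.io.base import AgentType  # assumed import path

assert AgentType.react.value == "react"
assert AgentType("rewoo") is AgentType.rewoo  # lookup by value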
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:
item (Any): The item to log. Required.
title (str, optional): The title of the panel, defaults to "Output".
stream (bool, optional): Defaults to False.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.AgentAction","title":"AgentAction dataclass
","text":"Agent's action to take.
Parameters:
tool (str): The tool to invoke. Required.
tool_input (Union[str, dict]): The input to the tool. Required.
log (str): The log message. Required.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
@dataclass\nclass AgentAction:\n \"\"\"Agent's action to take.\n\n Args:\n tool: The tool to invoke.\n tool_input: The input to the tool.\n log: The log message.\n \"\"\"\n\n tool: str\n tool_input: Union[str, dict]\n log: str\n
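For illustration, constructing an action the way the ReAct parser does (the tool name here is hypothetical; import path assumed as above):

from kotaemon.agents.io.base import AgentAction  # assumed import path

action = AgentAction(
    tool="search",  # hypothetical tool name
    tool_input="Eiffel Tower height",
    log="Action: search\nAction Input: Eiffel Tower height",
)
assert action.tool == "search"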
"},{"location":"reference/agents/io/base/#agents.io.base.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:
return_values: The return values of the agent. Required.
log: The log message. Required.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
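Because it is a NamedTuple, an AgentFinish can also be unpacked positionally; a small sketch (import path assumed as above):

from kotaemon.agents.io.base import AgentFinish  # assumed import path

finish = AgentFinish(
    return_values={"output": "The tower is about 330 m tall."},
    log="Final Answer: The tower is about 330 m tall.",
)
return_values, log = finish  # NamedTuple unpacking
assert return_values["output"] == finish.return_values["output"]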
"},{"location":"reference/agents/io/base/#agents.io.base.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:
text: The text output from the agent. Required.
agent_type: The type of agent. Required.
status: The status after executing the agent. Required.
error: The error message if any. Required.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
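A construction sketch mirroring what ReactAgent.run returns below (import path assumed; extra keyword fields pass validation because of model_config = ConfigDict(extra="allow")):

from kotaemon.agents.io.base import AgentOutput, AgentType  # assumed import path

out = AgentOutput(
    text="final answer",
    agent_type=AgentType.react,
    status="finished",  # Literal["thinking", "finished", "stopped", "failed"]
    total_tokens=0,     # accepted via LLMInterface or extra="allow"
    total_cost=0.0,     # accepted via LLMInterface or extra="allow"
)
assert out.type == "agent" and out.error is None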
"},{"location":"reference/agents/io/base/#agents.io.base.check_log","title":"check_log","text":"check_log()\n
Checks if logging has been enabled. Returns True if logging has been enabled, False otherwise (bool).
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def check_log():\n \"\"\"\n Checks if logging has been enabled.\n :return: True if logging has been enabled, False otherwise.\n :rtype: bool\n \"\"\"\n return os.environ.get(\"LOG_PATH\", None) is not None\n
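In other words, logging is toggled purely by the environment; for example (import path assumed as above):

import os
from kotaemon.agents.io.base import check_log  # assumed import path

os.environ.pop("LOG_PATH", None)
assert check_log() is False

os.environ["LOG_PATH"] = "logs/run.log"  # any value, even "", counts as enabled
assert check_log() is True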
"},{"location":"reference/agents/react/","title":"React","text":""},{"location":"reference/agents/react/#agents.react.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class inheriting from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
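A hedged wiring sketch, not taken from this page: the import paths, the ChatOpenAI wrapper, and the _run_tool hook on BaseTool are assumptions about the wider kotaemon API; only the ReactAgent fields (llm, plugins, max_iterations) and the run() call are grounded in the listing above.

from kotaemon.agents import ReactAgent, BaseTool  # assumed import paths
from kotaemon.llms import ChatOpenAI              # assumed OpenAI-compatible wrapper

class CalculatorTool(BaseTool):
    name: str = "calculator"
    description: str = "Evaluates a plain arithmetic expression."

    def _run_tool(self, tool_input: str) -> str:  # assumed BaseTool hook
        # eval() is acceptable for a demo only; never use it on untrusted input.
        return str(eval(tool_input))

agent = ReactAgent(
    llm=ChatOpenAI(model="gpt-4o-mini"),  # any BaseLLM accepting stop= should work
    plugins=[CalculatorTool()],
    max_iterations=5,
)
output = agent.run("What is 17 * 23?")
print(output.status, output.text)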
"},{"location":"reference/agents/react/#agents.react.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/react/#agents.react.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
instruction: Instruction to run the agent with. Required.
max_iterations: Maximum number of reasoning iterations. Default: None (falls back to the agent's max_iterations).
Returns: AgentOutput object.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
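After run() returns, the full trace is available on the output (and on agent.intermediate_steps); reusing the hypothetical agent from the sketch above:

output = agent.run("What is 12 squared plus 1?", max_iterations=3)

# Each entry pairs an AgentAction (or the final AgentFinish) with the
# trimmed tool observation recorded for that step.
for action_step, observation in output.intermediate_steps:
    print(getattr(action_step, "tool", "<finish>"), "->", observation)

print(output.status)  # "finished", or "stopped" if iterations ran out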
"},{"location":"reference/agents/react/#agents.react.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Stream the agent with the given instruction.
Parameters:
instruction: Instruction to run the agent with. Required.
max_iterations: Maximum number of reasoning iterations. Default: None (falls back to the agent's max_iterations).
Yields: AgentOutput objects.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
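stream() is a generator, so it must be drained; a consumption sketch with the same hypothetical agent:

for chunk in agent.stream("What is 3 + 4, doubled?"):
    if chunk.status == "thinking":
        # intermediate_steps holds the latest (action, observation) pair
        print("step:", chunk.intermediate_steps)
    else:  # "finished" or "stopped"
        print("final:", chunk.text)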
"},{"location":"reference/agents/react/agent/","title":"Agent","text":""},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class inheriting from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
instruction: Instruction to run the agent with. Required.
max_iterations: Maximum number of reasoning iterations. Default: None (falls back to the agent's max_iterations).
Returns: AgentOutput object.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Stream the agent with the given instruction.
Parameters:
instruction: Instruction to run the agent with. Required.
max_iterations: Maximum number of reasoning iterations. Default: None (falls back to the agent's max_iterations).
Yields: AgentOutput objects.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/","title":"Rewoo","text":""},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class inheriting from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).
Source code inlibs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Stream the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/","title":"Agent","text":""},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class, inherited from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Stream the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/planner/","title":"Planner","text":""},{"location":"reference/agents/rewoo/planner/#agents.rewoo.planner.Planner","title":"Planner","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/rewoo/planner.py
class Planner(BaseComponent):\n model: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n examples: Optional[Union[str, List[str]]] = None\n plugins: List[BaseTool]\n\n def _compose_worker_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for worker in self.plugins:\n prompt += f\"{worker.name}[input]: {worker.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _compose_fewshot_prompt(self) -> str:\n if self.examples is None:\n return \"\"\n if isinstance(self.examples, str):\n return self.examples\n else:\n return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n worker_description = self._compose_worker_description()\n fewshot = self._compose_fewshot_prompt()\n if self.prompt_template is not None:\n if \"fewshot\" in self.prompt_template.placeholders:\n return self.prompt_template.populate(\n tool_description=worker_description,\n fewshot=fewshot,\n task=instruction,\n )\n else:\n return self.prompt_template.populate(\n tool_description=worker_description, task=instruction\n )\n else:\n if self.examples is not None:\n return few_shot_planner_prompt.populate(\n tool_description=worker_description,\n fewshot=fewshot,\n task=instruction,\n )\n else:\n return zero_shot_planner_prompt.populate(\n tool_description=worker_description, task=instruction\n )\n\n def run(self, instruction: str, output: BaseScratchPad = BaseScratchPad()) -> Any:\n response = None\n output.info(\"Running Planner\")\n prompt = self._compose_prompt(instruction)\n output.debug(f\"Prompt: {prompt}\")\n try:\n response = self.model(prompt)\n self.log_progress(\".planner\", response=response)\n output.info(\"Planner run successful.\")\n except ValueError as e:\n output.error(\"Planner failed to retrieve response from LLM\")\n raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n return response\n\n def stream(self, instruction: str, output: BaseScratchPad = BaseScratchPad()):\n response = None\n output.info(\"Running Planner\")\n prompt = self._compose_prompt(instruction)\n output.debug(f\"Prompt: {prompt}\")\n\n response = \"\"\n try:\n for text in self.model.stream(prompt):\n response += text\n yield text\n self.log_progress(\".planner\", response=response)\n output.info(\"Planner run successful.\")\n except NotImplementedError:\n print(\"Streaming is not supported, falling back to normal run\")\n response = self.model(prompt)\n yield response\n except ValueError as e:\n output.error(\"Planner failed to retrieve response from LLM\")\n raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n return response\n
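To make the format concrete, here is a hedged example (with hypothetical tool names) of the planner text that RewooAgent._parse_plan_map and RewooAgent._parse_planner_evidences expect:

# Hypothetical planner response in the #Plan / #E format described above.
planner_response = (
    "#Plan1: Find the capital of France.\n"
    "#E1: wikipedia[capital of France]\n"
    "#Plan2: Find the population of that city.\n"
    "#E2: llm[What is the population of #E1?]\n"
)

# _parse_plan_map(planner_response) ->
#   ({"#Plan1": ["#E1"], "#Plan2": ["#E2"]},
#    {"#Plan1": "Find the capital of France.",
#     "#Plan2": "Find the population of that city."})
#
# _parse_planner_evidences(planner_response) ->
#   ({"#E1": "wikipedia[capital of France]",
#     "#E2": "llm[What is the population of #E1?]"},
#    [["#E1"], ["#E2"]])  # #E2 refers to #E1, so it runs in a later level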
"},{"location":"reference/agents/rewoo/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/solver/","title":"Solver","text":""},{"location":"reference/agents/rewoo/solver/#agents.rewoo.solver.Solver","title":"Solver","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/rewoo/solver.py
class Solver(BaseComponent):\n model: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n examples: Optional[Union[str, List[str]]] = None\n output_lang: str = \"English\"\n\n def _compose_fewshot_prompt(self) -> str:\n if self.examples is None:\n return \"\"\n if isinstance(self.examples, str):\n return self.examples\n else:\n return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n def _compose_prompt(self, instruction, plan_evidence, output_lang) -> str:\n \"\"\"\n Compose the prompt from template, plan&evidence, examples and instruction.\n \"\"\"\n fewshot = self._compose_fewshot_prompt()\n if self.prompt_template is not None:\n if \"fewshot\" in self.prompt_template.placeholders:\n return self.prompt_template.populate(\n plan_evidence=plan_evidence,\n fewshot=fewshot,\n task=instruction,\n lang=output_lang,\n )\n else:\n return self.prompt_template.populate(\n plan_evidence=plan_evidence, task=instruction, lang=output_lang\n )\n else:\n if self.examples is not None:\n return few_shot_solver_prompt.populate(\n plan_evidence=plan_evidence,\n fewshot=fewshot,\n task=instruction,\n lang=output_lang,\n )\n else:\n return zero_shot_solver_prompt.populate(\n plan_evidence=plan_evidence,\n task=instruction,\n lang=output_lang,\n )\n\n def run(\n self,\n instruction: str,\n plan_evidence: str,\n output: BaseScratchPad = BaseScratchPad(),\n ) -> Any:\n response = None\n output.info(\"Running Solver\")\n output.debug(f\"Instruction: {instruction}\")\n output.debug(f\"Plan Evidence: {plan_evidence}\")\n prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n output.debug(f\"Prompt: {prompt}\")\n try:\n response = self.model(prompt)\n output.info(\"Solver run successful.\")\n except ValueError:\n output.error(\"Solver failed to retrieve response from LLM\")\n\n return response\n\n def stream(\n self,\n instruction: str,\n plan_evidence: str,\n output: BaseScratchPad = BaseScratchPad(),\n ) -> Any:\n response = \"\"\n output.info(\"Running Solver\")\n output.debug(f\"Instruction: {instruction}\")\n output.debug(f\"Plan Evidence: {plan_evidence}\")\n prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n output.debug(f\"Prompt: {prompt}\")\n try:\n for text in self.model.stream(prompt):\n response += text.text\n yield text\n output.info(\"Solver run successful.\")\n except NotImplementedError:\n response = self.model(prompt).text\n output.info(\"Solver run successful.\")\n except ValueError:\n output.error(\"Solver failed to retrieve response from LLM\")\n\n return response\n
"},{"location":"reference/agents/tools/","title":"Tools","text":""},{"location":"reference/agents/tools/#agents.tools.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
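As a hedged illustration of the contract above, a custom tool only needs name, description, and _run_tool; the tool below is hypothetical and the import path is assumed:

from kotaemon.agents import BaseTool  # assumed import path

class EchoTool(BaseTool):
    """Hypothetical tool that returns its input unchanged."""

    name: str = "echo"
    description: str = "Echoes the input text back. Input: any text."

    def _run_tool(self, query: str) -> str:
        # run() parses and validates tool_input, then dispatches here
        return query

print(EchoTool().run("hello"))  # -> hello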
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description. This will be included in the LLM prompt.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/#agents.tools.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around another BaseComponent to use it as a tool
Parameters:
Name Type Description Default
component
BaseComponent-based component to wrap
required
postprocessor
Optional postprocessor for the component output
required
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
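A hedged usage sketch: retriever below is a hypothetical BaseComponent defined elsewhere in a pipeline, and the import path is assumed.

from kotaemon.agents import ComponentTool  # assumed import path

search_tool = ComponentTool(
    name="doc_search",
    description="Searches indexed documents. Input: a search query.",
    component=retriever,  # placeholder for any BaseComponent
    # optional post-processing applied before the agent sees the output
    postprocessor=lambda docs: "\n".join(doc.text for doc in docs),
)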
"},{"location":"reference/agents/tools/#agents.tools.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
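A quick hedged usage example (the wikipedia package must be installed for the underlying Wiki wrapper to work; the import path is assumed):

from kotaemon.agents import WikipediaTool  # assumed import path

tool = WikipediaTool()              # the Wiki doc store is created lazily
evidence = tool.run("Alan Turing")  # requires `pip install wikipedia`
print(evidence)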
"},{"location":"reference/agents/tools/base/","title":"Base","text":""},{"location":"reference/agents/tools/base/#agents.tools.base.ToolException","title":"ToolException","text":" Bases: Exception
An optional exception that a tool throws when an execution error occurs.
When this exception is thrown, the agent does not stop working; instead, it handles the exception according to the tool's handle_tool_error setting, and the processing result is returned to the agent as an observation and printed in red on the console.
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class ToolException(Exception):\n \"\"\"An optional exception that tool throws when execution error occurs.\n\n When this exception is thrown, the agent will not stop working,\n but will handle the exception according to the handle_tool_error\n variable of the tool, and the processing result will be returned\n to the agent as observation, and printed in red on the console.\n \"\"\"\n
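A hedged sketch of the handling modes described above, using a hypothetical always-failing tool; the import paths are assumed from the source paths shown here:

from kotaemon.agents import BaseTool  # assumed import path
from kotaemon.agents.tools.base import ToolException

class FlakyTool(BaseTool):
    """Hypothetical tool that always fails, to demonstrate error handling."""

    name: str = "flaky"
    description: str = "Always raises a ToolException."

    def _run_tool(self, query: str) -> str:
        raise ToolException("upstream service unavailable")

# FlakyTool().run("x")  # default handle_tool_error=False: re-raises
print(FlakyTool(handle_tool_error=True).run("x"))           # -> upstream service unavailable
print(FlakyTool(handle_tool_error="tool failed").run("x"))  # -> tool failed
print(FlakyTool(handle_tool_error=lambda e: f"error: {e}").run("x"))  # -> error: ...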
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description. This will be included in the LLM prompt.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around another BaseComponent to use it as a tool
Parameters:
Name Type Description Default
component
BaseComponent-based component to wrap
required
postprocessor
Optional postprocessor for the component output
required
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
"},{"location":"reference/agents/tools/google/","title":"Google","text":""},{"location":"reference/agents/tools/llm/","title":"Llm","text":""},{"location":"reference/agents/tools/wikipedia/","title":"Wikipedia","text":""},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki","title":"Wiki","text":"Wrapper around wikipedia API.
Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
class Wiki:\n \"\"\"Wrapper around wikipedia API.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"Check that wikipedia package is installed.\"\"\"\n try:\n import wikipedia # noqa: F401\n except ImportError:\n raise ValueError(\n \"Could not import wikipedia python package. \"\n \"Please install it with `pip install wikipedia`.\"\n )\n\n def search(self, search: str) -> Union[str, Document]:\n \"\"\"Try to search for wiki page.\n\n If page exists, return the page summary, and a PageWithLookups object.\n If page does not exist, return similar entries.\n \"\"\"\n import wikipedia\n\n try:\n page_content = wikipedia.page(search).content\n url = wikipedia.page(search).url\n result: Union[str, Document] = Document(\n text=page_content, metadata={\"page\": url}\n )\n except wikipedia.PageError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n except wikipedia.DisambiguationError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n return result\n
"},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki.search","title":"search","text":"search(search)\n
Try to search for a wiki page.
If the page exists, return its content as a Document (with the page URL in the metadata). If the page does not exist, return a list of similar entries.
Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
def search(self, search: str) -> Union[str, Document]:\n \"\"\"Try to search for wiki page.\n\n If page exists, return the page summary, and a PageWithLookups object.\n If page does not exist, return similar entries.\n \"\"\"\n import wikipedia\n\n try:\n page_content = wikipedia.page(search).content\n url = wikipedia.page(search).url\n result: Union[str, Document] = Document(\n text=page_content, metadata={\"page\": url}\n )\n except wikipedia.PageError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n except wikipedia.DisambiguationError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n return result\n
"},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
"},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#base.BaseComponent","title":"BaseComponent","text":" Bases: Function
A component is a class that can be used to compose a pipeline.
Benefits of component: auto caching and logging; allows deployment.
For each component, the spirit is to tolerate multiple input types (e.g. str, Document, List[str], List[Document]) and to enforce a single output type; hence, the output type of a component should be as generic as possible.
Source code in libs/kotaemon/kotaemon/base/component.py
class BaseComponent(Function):\n \"\"\"A component is a class that can be used to compose a pipeline.\n\n !!! tip \"Benefits of component\"\n - Auto caching, logging\n - Allow deployment\n\n !!! tip \"For each component, the spirit is\"\n - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n - Enforce single output type. Hence, the output type of a component should be\n as generic as possible.\n \"\"\"\n\n inflow = None\n\n def flow(self):\n if self.inflow is None:\n raise ValueError(\"No inflow provided.\")\n\n if not isinstance(self.inflow, BaseComponent):\n raise ValueError(\n f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n )\n\n return self.__call__(self.inflow.flow())\n\n def set_output_queue(self, queue):\n self._queue = queue\n for name in self._ff_nodes:\n node = getattr(self, name)\n if isinstance(node, BaseComponent):\n node.set_output_queue(queue)\n\n def report_output(self, output: Optional[Document]):\n if self._queue is not None:\n self._queue.put_nowait(output)\n\n def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n ...\n\n def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n ...\n\n @abstractmethod\n def run(\n self, *args, **kwargs\n ) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
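To ground the contract, here is a hedged minimal component (only run is mandatory; the import path is assumed):

from kotaemon.base import BaseComponent, Document  # assumed import path

class UpperCase(BaseComponent):
    """Hypothetical component that upper-cases a document's text."""

    def run(self, doc: Document) -> Document:
        return Document(text=doc.text.upper())

print(UpperCase()(Document(text="hello")).text)  # -> HELLO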
"},{"location":"reference/base/#base.BaseComponent.run","title":"run abstractmethod
","text":"run(*args, **kwargs)\n
Run the component.
Source code in libs/kotaemon/kotaemon/base/component.py
@abstractmethod\ndef run(\n self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
"},{"location":"reference/base/#base.Document","title":"Document","text":" Bases: Document
Base document class, mostly inherited from the Document class of llama-index.
This class accepts one positional argument content
of an arbitrary type, which will store the raw content of the document. If specified, the class will use content
to initialize the base llama_index class.
Attributes:
Name Type Description
content
Any
raw content of the document; can be anything
source
Optional[str]
id of the source of the Document. Optional.
channel
Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]
the channel to show the document (optional): chat shows in the chat message; info shows in the information panel; index shows in the index panel; debug shows in the debug panel
Source code in libs/kotaemon/kotaemon/base/schema.py
class Document(BaseDocument):\n \"\"\"\n Base document class, mostly inherited from Document class from llama-index.\n\n This class accept one positional argument `content` of an arbitrary type, which will\n store the raw content of the document. If specified, the class will use\n `content` to initialize the base llama_index class.\n\n Attributes:\n content: raw content of the document, can be anything\n source: id of the source of the Document. Optional.\n channel: the channel to show the document. Optional.:\n - chat: show in chat message\n - info: show in information panel\n - index: show in index panel\n - debug: show in debug panel\n \"\"\"\n\n content: Any = None\n source: Optional[str] = None\n channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n if content is None:\n if kwargs.get(\"text\", None) is not None:\n kwargs[\"content\"] = kwargs[\"text\"]\n elif kwargs.get(\"embedding\", None) is not None:\n kwargs[\"content\"] = kwargs[\"embedding\"]\n # default text indicating this document only contains embedding\n kwargs[\"text\"] = \"<EMBEDDING>\"\n elif isinstance(content, Document):\n # TODO: simplify the Document class\n temp_ = content.dict()\n temp_.update(kwargs)\n kwargs = temp_\n else:\n kwargs[\"content\"] = content\n if content:\n kwargs[\"text\"] = str(content)\n else:\n kwargs[\"text\"] = \"\"\n super().__init__(*args, **kwargs)\n\n def __bool__(self):\n return bool(self.content)\n\n @classmethod\n def example(cls) -> \"Document\":\n document = Document(\n text=SAMPLE_TEXT,\n metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n )\n return document\n\n def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n\n def __str__(self):\n return str(self.content)\n
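A few hedged construction examples that follow the __init__ logic above (import path assumed):

from kotaemon.base import Document  # assumed import path

d1 = Document("raw content")       # positional arg fills both content and text
d2 = Document(text="hello")        # content falls back to the text kwarg
d3 = Document(d1, channel="chat")  # copy another Document, overriding fields

print(str(d1))           # -> raw content
print(bool(Document()))  # -> False: truthiness follows .content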
"},{"location":"reference/base/#base.Document.to_haystack_format","title":"to_haystack_format","text":"to_haystack_format()\n
Convert struct to Haystack document format.
Source code in libs/kotaemon/kotaemon/base/schema.py
def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n
"},{"location":"reference/base/#base.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":" Bases: Document
Subclass of Document that must contain an embedding
Use this if you want to enforce that a component's inputs and outputs must contain an embedding.
Source code in libs/kotaemon/kotaemon/base/schema.py
class DocumentWithEmbedding(Document):\n \"\"\"Subclass of Document which must contains embedding\n\n Use this if you want to enforce component's IOs to must contain embedding.\n \"\"\"\n\n def __init__(self, embedding: list[float], *args, **kwargs):\n kwargs[\"embedding\"] = embedding\n super().__init__(*args, **kwargs)\n
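A hedged example: per the Document constructor above, a document built from an embedding alone receives the placeholder text <EMBEDDING>.

from kotaemon.base import DocumentWithEmbedding  # assumed import path

vec = DocumentWithEmbedding(embedding=[0.1, 0.2, 0.3])
print(vec.text)       # -> <EMBEDDING>
print(vec.embedding)  # -> [0.1, 0.2, 0.3]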
"},{"location":"reference/base/#base.ExtractorOutput","title":"ExtractorOutput","text":" Bases: Document
Represents the output of an extractor.
Source code in libs/kotaemon/kotaemon/base/schema.py
class ExtractorOutput(Document):\n \"\"\"\n Represents the output of an extractor.\n \"\"\"\n\n matches: list[str]\n
"},{"location":"reference/base/#base.RetrievedDocument","title":"RetrievedDocument","text":" Bases: Document
Subclass of Document with retrieval-related information
Attributes:
Name Type Description
score
float
score of the document (from 0.0 to 1.0)
retrieval_metadata
dict
metadata from the retrieval process; it can be used by different components in a retrieval pipeline to communicate with each other
Source code in libs/kotaemon/kotaemon/base/schema.py
class RetrievedDocument(Document):\n \"\"\"Subclass of Document with retrieval-related information\n\n Attributes:\n score (float): score of the document (from 0.0 to 1.0)\n retrieval_metadata (dict): metadata from the retrieval process, can be used\n by different components in a retrieved pipeline to communicate with each\n other\n \"\"\"\n\n score: float = Field(default=0.0)\n retrieval_metadata: dict = Field(default={})\n
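A hedged construction example (the field values are hypothetical):

from kotaemon.base import RetrievedDocument  # assumed import path

hit = RetrievedDocument(
    text="a relevant passage",
    score=0.87,                                # relevance from 0.0 to 1.0
    retrieval_metadata={"retriever": "bm25"},  # free-form pipeline metadata
)
print(hit.score)  # -> 0.87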
"},{"location":"reference/base/component/","title":"Component","text":""},{"location":"reference/base/component/#base.component.BaseComponent","title":"BaseComponent","text":" Bases: Function
A component is a class that can be used to compose a pipeline.
Benefits of component: auto caching and logging; allows deployment.
For each component, the spirit is: tolerate multiple input types (e.g. str, Document, List[str], List[Document]) and enforce a single output type; hence, the output type of a component should be as generic as possible.
Source code in libs/kotaemon/kotaemon/base/component.py
class BaseComponent(Function):\n \"\"\"A component is a class that can be used to compose a pipeline.\n\n !!! tip \"Benefits of component\"\n - Auto caching, logging\n - Allow deployment\n\n !!! tip \"For each component, the spirit is\"\n - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n - Enforce single output type. Hence, the output type of a component should be\n as generic as possible.\n \"\"\"\n\n inflow = None\n\n def flow(self):\n if self.inflow is None:\n raise ValueError(\"No inflow provided.\")\n\n if not isinstance(self.inflow, BaseComponent):\n raise ValueError(\n f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n )\n\n return self.__call__(self.inflow.flow())\n\n def set_output_queue(self, queue):\n self._queue = queue\n for name in self._ff_nodes:\n node = getattr(self, name)\n if isinstance(node, BaseComponent):\n node.set_output_queue(queue)\n\n def report_output(self, output: Optional[Document]):\n if self._queue is not None:\n self._queue.put_nowait(output)\n\n def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n ...\n\n def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n ...\n\n @abstractmethod\n def run(\n self, *args, **kwargs\n ) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
"},{"location":"reference/base/component/#base.component.BaseComponent.run","title":"run abstractmethod
","text":"run(*args, **kwargs)\n
Run the component.
Source code in libs/kotaemon/kotaemon/base/component.py
@abstractmethod\ndef run(\n self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
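Example: a minimal subclass sketch; only run must be implemented. UpperCase is a hypothetical component, and the imports assume BaseComponent and Document are re-exported from kotaemon.base:
from kotaemon.base import BaseComponent, Document\n\nclass UpperCase(BaseComponent):\n    \"\"\"Hypothetical component: upper-case the text of a document.\"\"\"\n\n    def run(self, doc: Document) -> Document:\n        return Document(str(doc.content).upper())\n\nstep = UpperCase()\nprint(step(Document(\"hello\")).text)  # \"HELLO\"; calling the instance dispatches to run\n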
"},{"location":"reference/base/schema/","title":"Schema","text":""},{"location":"reference/base/schema/#base.schema.Document","title":"Document","text":" Bases: Document
Base document class, mostly inherited from Document class from llama-index.
This class accepts one positional argument content
of an arbitrary type, which will store the raw content of the document. If specified, the class will use content
to initialize the base llama_index class.
Attributes:
content (Any): raw content of the document; can be anything
source (Optional[str]): id of the source of the Document. Optional.
channel (Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]): the channel to show the document. Optional. chat: show in chat message; info: show in information panel; index: show in index panel; debug: show in debug panel
Source code in libs/kotaemon/kotaemon/base/schema.py
class Document(BaseDocument):\n \"\"\"\n Base document class, mostly inherited from Document class from llama-index.\n\n This class accept one positional argument `content` of an arbitrary type, which will\n store the raw content of the document. If specified, the class will use\n `content` to initialize the base llama_index class.\n\n Attributes:\n content: raw content of the document, can be anything\n source: id of the source of the Document. Optional.\n channel: the channel to show the document. Optional.:\n - chat: show in chat message\n - info: show in information panel\n - index: show in index panel\n - debug: show in debug panel\n \"\"\"\n\n content: Any = None\n source: Optional[str] = None\n channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n if content is None:\n if kwargs.get(\"text\", None) is not None:\n kwargs[\"content\"] = kwargs[\"text\"]\n elif kwargs.get(\"embedding\", None) is not None:\n kwargs[\"content\"] = kwargs[\"embedding\"]\n # default text indicating this document only contains embedding\n kwargs[\"text\"] = \"<EMBEDDING>\"\n elif isinstance(content, Document):\n # TODO: simplify the Document class\n temp_ = content.dict()\n temp_.update(kwargs)\n kwargs = temp_\n else:\n kwargs[\"content\"] = content\n if content:\n kwargs[\"text\"] = str(content)\n else:\n kwargs[\"text\"] = \"\"\n super().__init__(*args, **kwargs)\n\n def __bool__(self):\n return bool(self.content)\n\n @classmethod\n def example(cls) -> \"Document\":\n document = Document(\n text=SAMPLE_TEXT,\n metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n )\n return document\n\n def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n\n def __str__(self):\n return str(self.content)\n
"},{"location":"reference/base/schema/#base.schema.Document.to_haystack_format","title":"to_haystack_format","text":"to_haystack_format()\n
Convert struct to Haystack document format.
Source code in libs/kotaemon/kotaemon/base/schema.py
def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n
"},{"location":"reference/base/schema/#base.schema.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":" Bases: Document
Subclass of Document which must contain an embedding
Use this if you want to enforce that a component's inputs/outputs must contain an embedding.
Source code in libs/kotaemon/kotaemon/base/schema.py
class DocumentWithEmbedding(Document):\n \"\"\"Subclass of Document which must contains embedding\n\n Use this if you want to enforce component's IOs to must contain embedding.\n \"\"\"\n\n def __init__(self, embedding: list[float], *args, **kwargs):\n kwargs[\"embedding\"] = embedding\n super().__init__(*args, **kwargs)\n
"},{"location":"reference/base/schema/#base.schema.RetrievedDocument","title":"RetrievedDocument","text":" Bases: Document
Subclass of Document with retrieval-related information
Attributes:
score (float): score of the document (from 0.0 to 1.0)
retrieval_metadata (dict): metadata from the retrieval process; it can be used by different components in a retrieval pipeline to communicate with each other
Source code in libs/kotaemon/kotaemon/base/schema.py
class RetrievedDocument(Document):\n \"\"\"Subclass of Document with retrieval-related information\n\n Attributes:\n score (float): score of the document (from 0.0 to 1.0)\n retrieval_metadata (dict): metadata from the retrieval process, can be used\n by different components in a retrieved pipeline to communicate with each\n other\n \"\"\"\n\n score: float = Field(default=0.0)\n retrieval_metadata: dict = Field(default={})\n
"},{"location":"reference/base/schema/#base.schema.ExtractorOutput","title":"ExtractorOutput","text":" Bases: Document
Represents the output of an extractor.
Source code in libs/kotaemon/kotaemon/base/schema.py
class ExtractorOutput(Document):\n \"\"\"\n Represents the output of an extractor.\n \"\"\"\n\n matches: list[str]\n
"},{"location":"reference/chatbot/","title":"Chatbot","text":""},{"location":"reference/chatbot/#chatbot.ChatConversation","title":"ChatConversation","text":" Bases: SessionFunction
Base implementation of a chat bot component
A chatbot component should: handle internal state, including history messages; return output for a given input
Source code in libs/kotaemon/kotaemon/chatbot/base.py
class ChatConversation(SessionFunction):\n \"\"\"Base implementation of a chat bot component\n\n A chatbot component should:\n - handle internal state, including history messages\n - return output for a given input\n \"\"\"\n\n class Config:\n store_result = session_chat_storage\n\n system_message: str = \"\"\n bot: BaseChatBot\n\n def __init__(self, *args, **kwargs):\n self._history: List[BaseMessage] = []\n self._store_result = (\n f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n )\n super().__init__(*args, **kwargs)\n\n def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n\n def start_session(self):\n self._store_result = self.bot.config.store_result\n super().start_session()\n if not self.history and self.system_message:\n system_message = SystemMessage(content=self.system_message)\n self.history.append(system_message)\n\n def end_session(self):\n super().end_session()\n self._history = []\n\n def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n ) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n\n def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n\n @property\n def history(self):\n return self._history\n\n @history.setter\n def history(self, value):\n self._history = value\n self._variablex()\n
"},{"location":"reference/chatbot/#chatbot.ChatConversation.run","title":"run","text":"run(message)\n
Chat, given a message, return a response
Parameters:
message (HumanMessage): The message to respond to. Required.
Returns:
Optional[BaseMessage]: The response to the message. If None, no response is sent.
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n
"},{"location":"reference/chatbot/#chatbot.ChatConversation.check_end","title":"check_end","text":"check_end(\n history=None, user_message=None, bot_message=None\n)\n
Check if a conversation should end
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n
"},{"location":"reference/chatbot/#chatbot.ChatConversation.terminal_session","title":"terminal_session","text":"terminal_session()\n
Create a terminal session
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n
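Example: a usage sketch. SimpleRespondentChatbot stands in for any BaseChatBot, llm is a placeholder for a configured ChatLLM instance, and the import path assumes both classes are re-exported from kotaemon.chatbot:
from kotaemon.chatbot import ChatConversation, SimpleRespondentChatbot\n\nbot = SimpleRespondentChatbot(llm=llm)  # llm: any configured ChatLLM (placeholder)\nconv = ChatConversation(bot=bot, system_message=\"You are concise.\")\n\nconv.start_session()\nreply = conv.run(\"What is kotaemon?\")  # returns an AIMessage, or None\nconv.end_session()\n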
"},{"location":"reference/chatbot/#chatbot.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":" Bases: BaseChatBot
Simple text respondent chatbot that essentially wraps around a chat LLM
Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
class SimpleRespondentChatbot(BaseChatBot):\n \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n llm: ChatLLM\n\n def _get_message(self) -> str:\n return self.llm(self.history).text\n
"},{"location":"reference/chatbot/base/","title":"Base","text":""},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation","title":"ChatConversation","text":" Bases: SessionFunction
Base implementation of a chat bot component
A chatbot component should: handle internal state, including history messages; return output for a given input
Source code in libs/kotaemon/kotaemon/chatbot/base.py
class ChatConversation(SessionFunction):\n \"\"\"Base implementation of a chat bot component\n\n A chatbot component should:\n - handle internal state, including history messages\n - return output for a given input\n \"\"\"\n\n class Config:\n store_result = session_chat_storage\n\n system_message: str = \"\"\n bot: BaseChatBot\n\n def __init__(self, *args, **kwargs):\n self._history: List[BaseMessage] = []\n self._store_result = (\n f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n )\n super().__init__(*args, **kwargs)\n\n def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n\n def start_session(self):\n self._store_result = self.bot.config.store_result\n super().start_session()\n if not self.history and self.system_message:\n system_message = SystemMessage(content=self.system_message)\n self.history.append(system_message)\n\n def end_session(self):\n super().end_session()\n self._history = []\n\n def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n ) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n\n def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n\n @property\n def history(self):\n return self._history\n\n @history.setter\n def history(self, value):\n self._history = value\n self._variablex()\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.run","title":"run","text":"run(message)\n
Chat, given a message, return a response
Parameters:
message (HumanMessage): The message to respond to. Required.
Returns:
Optional[BaseMessage]: The response to the message. If None, no response is sent.
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.check_end","title":"check_end","text":"check_end(\n history=None, user_message=None, bot_message=None\n)\n
Check if a conversation should end
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.terminal_session","title":"terminal_session","text":"terminal_session()\n
Create a terminal session
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n
"},{"location":"reference/chatbot/base/#chatbot.base.session_chat_storage","title":"session_chat_storage","text":"session_chat_storage(obj)\n
Store using the bot location rather than the session location
Source code in libs/kotaemon/kotaemon/chatbot/base.py
def session_chat_storage(obj):\n \"\"\"Store using the bot location rather than the session location\"\"\"\n return obj._store_result\n
"},{"location":"reference/chatbot/simple_respondent/","title":"Simple Respondent","text":""},{"location":"reference/chatbot/simple_respondent/#chatbot.simple_respondent.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":" Bases: BaseChatBot
Simple text respondent chatbot that essentially wraps around a chat LLM
Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
class SimpleRespondentChatbot(BaseChatBot):\n \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n llm: ChatLLM\n\n def _get_message(self) -> str:\n return self.llm(self.history).text\n
"},{"location":"reference/embeddings/","title":"Embeddings","text":""},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings","title":"EndpointEmbeddings","text":" Bases: BaseEmbeddings
An Embeddings component that uses an OpenAI API compatible endpoint.
Attributes:
endpoint_url (str): The url of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
class EndpointEmbeddings(BaseEmbeddings):\n \"\"\"\n An Embeddings component that uses an OpenAI API compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of an OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
"},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings.run","title":"run","text":"run(text)\n
Generate embeddings from text.
Parameters: text (str | list[str] | Document | list[Document]): text to generate embeddings from
Returns: list[DocumentWithEmbedding]: embeddings
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
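Example: a sketch pointing the component at a local OpenAI-compatible embeddings endpoint (the URL is a placeholder; assumes the class is re-exported from kotaemon.embeddings):
from kotaemon.embeddings import EndpointEmbeddings\n\nembedder = EndpointEmbeddings(\n    endpoint_url=\"http://localhost:5000/v1/embeddings\",  # placeholder URL\n)\ndocs = embedder.run([\"first chunk\", \"second chunk\"])\nprint(len(docs), len(docs[0].embedding))\n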
"},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":" Bases: BaseEmbeddings
Utilize fastembed library for embeddings locally without GPU.
Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/ Code: https://github.com/qdrant/fastembed
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
class FastEmbedEmbeddings(BaseEmbeddings):\n \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n Code: https://github.com/qdrant/fastembed\n \"\"\"\n\n model_name: str = Param(\n \"BAAI/bge-small-en-v1.5\",\n help=(\n \"Model name for fastembed. Please refer \"\n \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n \"for the list of supported models.\"\n ),\n required=True,\n )\n batch_size: int = Param(\n 256,\n help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n )\n parallel: Optional[int] = Param(\n None,\n help=(\n \"Number of threads to use for embeddings. \"\n \"If > 1, data-parallel encoding will be used. \"\n \"If 0, use all available CPUs. \"\n \"If None, use default onnxruntime threading. \"\n \"Defaults to None.\"\n ),\n )\n\n @Param.auto()\n def client_(self) -> \"TextEmbedding\":\n try:\n from fastembed import TextEmbedding\n except ImportError:\n raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n return TextEmbedding(model_name=self.model_name)\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n embeddings = self.client_.embed(\n [_.content for _ in input_],\n batch_size=self.batch_size,\n parallel=self.parallel,\n )\n return [\n DocumentWithEmbedding(\n content=doc,\n embedding=list(embedding),\n )\n for doc, embedding in zip(input_, embeddings)\n ]\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
"},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings.ainvoke","title":"ainvoke async
","text":"ainvoke(text, *args, **kwargs)\n
FastEmbed does not support an async API; this method falls back to the synchronous implementation.
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
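Example: a local sketch (requires pip install fastembed; the model name is from the supported list above, and the dimensionality in the comment assumes bge-small):
from kotaemon.embeddings import FastEmbedEmbeddings\n\nembedder = FastEmbedEmbeddings(model_name=\"BAAI/bge-small-en-v1.5\")\nvectors = embedder.invoke([\"hello world\"])\nprint(len(vectors[0].embedding))  # 384 for bge-small-en-v1.5\n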
"},{"location":"reference/embeddings/#embeddings.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n api_version: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment=deployment,\n api_version=api_version,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import AzureOpenAIEmbeddings\n\n return AzureOpenAIEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's Cohere embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n cohere_api_key: str = Param(\n help=\"API key (https://dashboard.cohere.com/api-keys)\",\n default=None,\n required=True,\n )\n model: str = Param(\n help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n default=None,\n required=True,\n )\n user_agent: str = Param(\n help=\"User agent (leave default)\", default=\"default\", required=True\n )\n\n def __init__(\n self,\n model: str = \"embed-english-v2.0\",\n cohere_api_key: Optional[str] = None,\n truncate: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n cohere_api_key=cohere_api_key,\n truncate=truncate,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_cohere import CohereEmbeddings\n except ImportError:\n from langchain.embeddings import CohereEmbeddings\n\n return CohereEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's HuggingFace embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n model_name: str = Param(\n help=(\n \"Model name to use (https://huggingface.co/models?\"\n \"pipeline_tag=sentence-similarity&sort=trending)\"\n ),\n default=None,\n required=True,\n )\n\n def __init__(\n self,\n model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n **params,\n ):\n super().__init__(\n model_name=model_name,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n except ImportError:\n from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n return HuggingFaceBgeEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's OpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model: str = \"text-embedding-ada-002\",\n openai_api_version: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n openai_api_type: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n openai_api_version=openai_api_version,\n openai_api_base=openai_api_base,\n openai_api_type=openai_api_type,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import OpenAIEmbeddings\n\n return OpenAIEmbeddings\n
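Example: the wrappers take the same key parameters as their Langchain counterparts; a sketch with LCOpenAIEmbeddings (the API key is a placeholder, and the import assumes re-export from kotaemon.embeddings):
from kotaemon.embeddings import LCOpenAIEmbeddings\n\nembedder = LCOpenAIEmbeddings(\n    model=\"text-embedding-ada-002\",\n    openai_api_key=\"sk-...\",  # placeholder\n)\n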
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n azure_endpoint: str = Param(\n None,\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n api_version: str = Param(None, help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
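Example: a configuration sketch; the endpoint, deployment name, and API version below are placeholders for your Azure resource:
from kotaemon.embeddings import AzureOpenAIEmbeddings\n\nembedder = AzureOpenAIEmbeddings(\n    azure_endpoint=\"https://<resource>.openai.azure.com/\",  # placeholder\n    azure_deployment=\"text-embedding-3-small\",  # placeholder deployment name\n    api_version=\"2024-02-01\",  # placeholder API version\n    api_key=\"...\",\n)\ndocs = embedder.invoke([\"chunk one\", \"chunk two\"])\n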
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
OpenAI embedding model
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(\n None,\n help=(\n \"ID of the model to use. You can go to [Model overview](https://platform.\"\n \"openai.com/docs/models/overview) to see the available models.\"\n ),\n required=True,\n )\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
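Example: a sketch (the key is a placeholder; dimensions applies only to text-embedding-3 and later models, per the base class):
from kotaemon.embeddings import OpenAIEmbeddings\n\nembedder = OpenAIEmbeddings(\n    api_key=\"sk-...\",  # placeholder\n    model=\"text-embedding-3-small\",\n    dimensions=256,  # optional; text-embedding-3 models only\n)\ndocs = embedder.invoke(\"hello\")\nprint(len(docs[0].embedding))  # 256\n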
"},{"location":"reference/embeddings/base/","title":"Base","text":""},{"location":"reference/embeddings/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings","title":"EndpointEmbeddings","text":" Bases: BaseEmbeddings
An Embeddings component that uses an OpenAI API compatible endpoint.
Attributes:
endpoint_url (str): The url of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
class EndpointEmbeddings(BaseEmbeddings):\n \"\"\"\n An Embeddings component that uses an OpenAI API compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of an OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
"},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings.run","title":"run","text":"run(text)\n
Generate embeddings from text.
Parameters: text (str | list[str] | Document | list[Document]): text to generate embeddings from
Returns: list[DocumentWithEmbedding]: embeddings
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
"},{"location":"reference/embeddings/fastembed/","title":"Fastembed","text":""},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":" Bases: BaseEmbeddings
Utilize fastembed library for embeddings locally without GPU.
Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/ Code: https://github.com/qdrant/fastembed
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
class FastEmbedEmbeddings(BaseEmbeddings):\n \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n Code: https://github.com/qdrant/fastembed\n \"\"\"\n\n model_name: str = Param(\n \"BAAI/bge-small-en-v1.5\",\n help=(\n \"Model name for fastembed. Please refer \"\n \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n \"for the list of supported models.\"\n ),\n required=True,\n )\n batch_size: int = Param(\n 256,\n help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n )\n parallel: Optional[int] = Param(\n None,\n help=(\n \"Number of threads to use for embeddings. \"\n \"If > 1, data-parallel encoding will be used. \"\n \"If 0, use all available CPUs. \"\n \"If None, use default onnxruntime threading. \"\n \"Defaults to None.\"\n ),\n )\n\n @Param.auto()\n def client_(self) -> \"TextEmbedding\":\n try:\n from fastembed import TextEmbedding\n except ImportError:\n raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n return TextEmbedding(model_name=self.model_name)\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n embeddings = self.client_.embed(\n [_.content for _ in input_],\n batch_size=self.batch_size,\n parallel=self.parallel,\n )\n return [\n DocumentWithEmbedding(\n content=doc,\n embedding=list(embedding),\n )\n for doc, embedding in zip(input_, embeddings)\n ]\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
"},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings.ainvoke","title":"ainvoke async
","text":"ainvoke(text, *args, **kwargs)\n
FastEmbed does not support an async API; this method falls back to the synchronous implementation.
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
"},{"location":"reference/embeddings/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's OpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model: str = \"text-embedding-ada-002\",\n openai_api_version: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n openai_api_type: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n openai_api_version=openai_api_version,\n openai_api_base=openai_api_base,\n openai_api_type=openai_api_type,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import OpenAIEmbeddings\n\n return OpenAIEmbeddings\n
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n api_version: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment=deployment,\n api_version=api_version,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import AzureOpenAIEmbeddings\n\n return AzureOpenAIEmbeddings\n
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's Cohere embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n cohere_api_key: str = Param(\n help=\"API key (https://dashboard.cohere.com/api-keys)\",\n default=None,\n required=True,\n )\n model: str = Param(\n help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n default=None,\n required=True,\n )\n user_agent: str = Param(\n help=\"User agent (leave default)\", default=\"default\", required=True\n )\n\n def __init__(\n self,\n model: str = \"embed-english-v2.0\",\n cohere_api_key: Optional[str] = None,\n truncate: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n cohere_api_key=cohere_api_key,\n truncate=truncate,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_cohere import CohereEmbeddings\n except ImportError:\n from langchain.embeddings import CohereEmbeddings\n\n return CohereEmbeddings\n
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's HuggingFace embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n model_name: str = Param(\n help=(\n \"Model name to use (https://huggingface.co/models?\"\n \"pipeline_tag=sentence-similarity&sort=trending)\"\n ),\n default=None,\n required=True,\n )\n\n def __init__(\n self,\n model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n **params,\n ):\n super().__init__(\n model_name=model_name,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n except ImportError:\n from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n return HuggingFaceBgeEmbeddings\n
"},{"location":"reference/embeddings/openai/","title":"Openai","text":""},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings","title":"BaseOpenAIEmbeddings","text":" Bases: BaseEmbeddings
Base interface for OpenAI embedding model, using the openai library.
This class exposes the parameters in resources.Chat. To subclass this class:
- Implement the `prepare_client` method to return the OpenAI client\n- Implement the `openai_response` method to return the OpenAI response\n- Implement the params related to the OpenAI client\n
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class BaseOpenAIEmbeddings(BaseEmbeddings):\n \"\"\"Base interface for OpenAI embedding model, using the openai library.\n\n This class exposes the parameters in resources.Chat. To subclass this class:\n\n - Implement the `prepare_client` method to return the OpenAI client\n - Implement the `openai_response` method to return the OpenAI response\n - Implement the params relate to the OpenAI client\n \"\"\"\n\n _dependencies = [\"openai\"]\n\n api_key: str = Param(None, help=\"API key\", required=True)\n timeout: Optional[float] = Param(None, help=\"Timeout for the API request.\")\n max_retries: Optional[int] = Param(\n None, help=\"Maximum number of retries for the API request.\"\n )\n\n dimensions: Optional[int] = Param(\n None,\n help=(\n \"The number of dimensions the resulting output embeddings should have. \"\n \"Only supported in `text-embedding-3` and later models.\"\n ),\n )\n context_length: Optional[int] = Param(\n None, help=\"The maximum context length of the embedding model\"\n )\n\n @Param.auto(depends_on=[\"max_retries\"])\n def max_retries_(self):\n if self.max_retries is None:\n from openai._constants import DEFAULT_MAX_RETRIES\n\n return DEFAULT_MAX_RETRIES\n return self.max_retries\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_doc = self.prepare_input(text)\n client = self.prepare_client(async_version=False)\n\n input_: list[str | list[int]] = []\n splitted_indices = {}\n for idx, text in enumerate(input_doc):\n if self.context_length:\n chunks = split_text_by_chunk_size(text.text or \" \", self.context_length)\n splitted_indices[idx] = (len(input_), len(input_) + len(chunks))\n input_.extend(chunks)\n else:\n splitted_indices[idx] = (len(input_), len(input_) + 1)\n input_.append(text.text)\n\n resp = self.openai_response(client, input=input_, **kwargs).dict()\n output_ = list(sorted(resp[\"data\"], key=lambda x: x[\"index\"]))\n\n output = []\n for idx, doc in enumerate(input_doc):\n embs = output_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n if len(embs) == 1:\n output.append(\n DocumentWithEmbedding(embedding=embs[0][\"embedding\"], content=doc)\n )\n continue\n\n chunk_lens = [\n len(_)\n for _ in input_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n ]\n vs: list[list[float]] = [_[\"embedding\"] for _ in embs]\n emb = np.average(vs, axis=0, weights=chunk_lens)\n emb = emb / np.linalg.norm(emb)\n output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))\n\n return output\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n client = self.prepare_client(async_version=True)\n resp = await self.openai_response(\n client, input=[_.text if _.text else \" \" for _ in input_], **kwargs\n ).dict()\n output_ = sorted(resp[\"data\"], key=lambda x: x[\"index\"])\n return [\n DocumentWithEmbedding(embedding=o[\"embedding\"], content=i)\n for i, o in zip(input_, output_)\n ]\n
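When context_length is set, invoke splits each text into token chunks, embeds every chunk, then combines them with a token-count-weighted average normalized to unit length. A toy numpy sketch of that combination step (the vectors and counts are made up):
import numpy as np\n\nchunk_embs = np.array([[1.0, 0.0], [0.0, 1.0]])  # toy 2-d chunk embeddings\nchunk_lens = [30, 10]  # tokens per chunk\nemb = np.average(chunk_embs, axis=0, weights=chunk_lens)  # [0.75, 0.25]\nemb = emb / np.linalg.norm(emb)  # unit length: [0.9487..., 0.3162...]\n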
"},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
OpenAI embedding model
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(\n None,\n help=(\n \"ID of the model to use. You can go to [Model overview](https://platform.\"\n \"openai.com/docs/models/overview) to see the available models.\"\n ),\n required=True,\n )\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n azure_endpoint: str = Param(\n None,\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n api_version: str = Param(None, help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.split_text_by_chunk_size","title":"split_text_by_chunk_size","text":"split_text_by_chunk_size(text, chunk_size)\n
Split the text into chunks of a given size
Parameters:

- `text` (`str`): text to split (required)
- `chunk_size` (`int`): size of each chunk (required)

Returns:

- `list[list[int]]`: list of chunks (as tokens)
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def split_text_by_chunk_size(text: str, chunk_size: int) -> list[list[int]]:\n \"\"\"Split the text into chunks of a given size\n\n Args:\n text: text to split\n chunk_size: size of each chunk\n\n Returns:\n list of chunks (as tokens)\n \"\"\"\n encoding = tiktoken.get_encoding(\"cl100k_base\")\n tokens = iter(encoding.encode(text))\n result = []\n while chunk := list(islice(tokens, chunk_size)):\n result.append(chunk)\n return result\n
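A quick illustration of the helper above; the import path is assumed from the source location shown, and `tiktoken` must be installed:

import tiktoken

from kotaemon.embeddings.openai import split_text_by_chunk_size  # path assumed

enc = tiktoken.get_encoding("cl100k_base")
chunks = split_text_by_chunk_size("one two three four five six seven", chunk_size=3)
for chunk in chunks:
    # each chunk is a list of at most 3 token ids and decodes back to text
    print(len(chunk), repr(enc.decode(chunk)))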
"},{"location":"reference/indices/","title":"Indices","text":""},{"location":"reference/indices/#indices.VectorIndexing","title":"VectorIndexing","text":" Bases: BaseIndexing
Ingest the document, run through the embedding, and store the embedding in a vector store.
This pipeline supports the following set of inputs: a list of documents, or a list of texts. Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorIndexing(BaseIndexing):\n \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n vector store.\n\n This pipeline supports the following set of inputs:\n - List of documents\n - List of texts\n \"\"\"\n\n cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n count_: int = 0\n\n def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n\n def to_qa_pipeline(self, *args, **kwargs):\n from .qa import CitationQAPipeline\n\n return TextVectorQA(\n retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n qa_pipeline=CitationQAPipeline(**kwargs),\n )\n\n def write_chunk_to_file(self, docs: list[Document]):\n # save the chunks content into markdown format\n if self.cache_dir:\n file_name = Path(docs[0].metadata[\"file_name\"])\n for i in range(len(docs)):\n markdown_content = \"\"\n if \"page_label\" in docs[i].metadata:\n page_label = str(docs[i].metadata[\"page_label\"])\n markdown_content += f\"Page label: {page_label}\"\n if \"file_name\" in docs[i].metadata:\n filename = docs[i].metadata[\"file_name\"]\n markdown_content += f\"\\nFile name: {filename}\"\n if \"section\" in docs[i].metadata:\n section = docs[i].metadata[\"section\"]\n markdown_content += f\"\\nSection: {section}\"\n if \"type\" in docs[i].metadata:\n if docs[i].metadata[\"type\"] == \"image\":\n image_origin = docs[i].metadata[\"image_origin\"]\n image_origin = f'<p><img src=\"{image_origin}\"></p>'\n markdown_content += f\"\\nImage origin: {image_origin}\"\n if docs[i].text:\n markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n \"w\",\n encoding=\"utf-8\",\n ) as f:\n f.write(markdown_content)\n\n def add_to_docstore(self, docs: list[Document]):\n if self.doc_store:\n print(\"Adding documents to doc store\")\n self.doc_store.add(docs)\n\n def add_to_vectorstore(self, docs: list[Document]):\n # in case we want to skip embedding\n if self.vector_store:\n print(f\"Getting embeddings for {len(docs)} nodes\")\n embeddings = self.embedding(docs)\n print(\"Adding embeddings to vector store\")\n self.vector_store.add(\n embeddings=embeddings,\n ids=[t.doc_id for t in docs],\n )\n\n def run(self, text: str | list[str] | Document | list[Document]):\n input_: list[Document] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in cast(list, text):\n if isinstance(item, str):\n input_.append(Document(text=item, id_=str(uuid.uuid4())))\n elif isinstance(item, Document):\n input_.append(item)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n self.add_to_vectorstore(input_)\n self.add_to_docstore(input_)\n self.write_chunk_to_file(input_)\n self.count_ += len(input_)\n
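A minimal end-to-end indexing sketch under assumed names: the in-memory stores are illustrative stand-ins for whatever `BaseVectorStore`/`BaseDocumentStore` implementations you use, and `my_embedding` is any configured `BaseEmbeddings` instance (e.g. the OpenAI embeddings above):

from kotaemon.base import Document                      # import paths assumed
from kotaemon.indices import VectorIndexing
from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore

index = VectorIndexing(
    vector_store=InMemoryVectorStore(),
    doc_store=InMemoryDocumentStore(),
    embedding=my_embedding,  # any BaseEmbeddings instance
)

# raw strings are wrapped into Documents with fresh UUIDs; Documents pass through as-is
index.run(["kotaemon is a RAG UI", Document(text="it chats with your documents")])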
"},{"location":"reference/indices/#indices.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(*args, **kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n
"},{"location":"reference/indices/#indices.VectorRetrieval","title":"VectorRetrieval","text":" Bases: BaseRetrieval
Retrieve list of documents from vector store
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorRetrieval(BaseRetrieval):\n \"\"\"Retrieve list of documents from vector store\"\"\"\n\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n rerankers: Sequence[BaseReranking] = []\n top_k: int = 5\n first_round_top_k_mult: int = 10\n retrieval_mode: str = \"hybrid\" # vector, text, hybrid\n\n def _filter_docs(\n self, documents: list[RetrievedDocument], top_k: int | None = None\n ):\n if top_k:\n documents = documents[:top_k]\n return documents\n\n def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n ) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = []\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
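Continuing the indexing sketch above, retrieval can be derived straight from the indexing pipeline. Note that the default `retrieval_mode` is "hybrid", which queries the vector store and the full-text doc store on two threads, so a `doc_store` must be present or `run()` raises the `ValueError` shown above:

retriever = index.to_retrieval_pipeline()  # reuses the same stores and embedding
docs = retriever.run("what does kotaemon do?", top_k=3)
for doc in docs:
    # docstore-only hits carry the sentinel score -1.0; vector hits carry real scores
    print(doc.score, doc.text[:60])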
"},{"location":"reference/indices/#indices.VectorRetrieval.run","title":"run","text":"run(text, top_k=None, **kwargs)\n
Retrieve a list of documents from vector store
Parameters:

- `text` (`str | Document`): the text to retrieve similar documents (required)
- `top_k` (`Optional[int]`): number of top similar documents to return. Default: `None`

Returns:

- `list[RetrievedDocument]`: list of retrieved documents
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = []\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
"},{"location":"reference/indices/base/","title":"Base","text":""},{"location":"reference/indices/base/#indices.base.DocTransformer","title":"DocTransformer","text":" Bases: BaseComponent
This is a base class for document transformers
A document transformer transforms a list of documents into another list of documents. Transforming can mean splitting a document into multiple documents, reducing a large list of documents into a smaller list of documents, or adding metadata to each document in a list of documents, etc.
Source code in libs/kotaemon/kotaemon/indices/base.py
class DocTransformer(BaseComponent):\n \"\"\"This is a base class for document transformers\n\n A document transformer transforms a list of documents into another list\n of documents. Transforming can mean splitting a document into multiple documents,\n reducing a large list of documents into a smaller list of documents, or adding\n metadata to each document in a list of documents, etc.\n \"\"\"\n\n @abstractmethod\n def run(\n self,\n documents: list[Document],\n **kwargs,\n ) -> list[Document]:\n ...\n
"},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin","title":"LlamaIndexDocTransformerMixin","text":"Allow automatically wrapping a Llama-index component into kotaemon component
Example:

class TokenSplitter(LlamaIndexMixin, BaseSplitter):
    def _get_li_class(self):
        from llama_index.core.text_splitter import TokenTextSplitter
        return TokenTextSplitter
To use this mixin, please: 1. Use this class as the 1st parent class, so that Python will prefer to use the attributes and methods of this class whenever possible. 2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.
libs/kotaemon/kotaemon/indices/base.py
class LlamaIndexDocTransformerMixin:\n \"\"\"Allow automatically wrapping a Llama-index component into kotaemon component\n\n Example:\n class TokenSplitter(LlamaIndexMixin, BaseSplitter):\n def _get_li_class(self):\n from llama_index.core.text_splitter import TokenTextSplitter\n return TokenTextSplitter\n\n To use this mixin, please:\n 1. Use this class as the 1st parent class, so that Python will prefer to use\n the attributes and methods of this class whenever possible.\n 2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.\n \"\"\"\n\n def _get_li_class(self) -> Type[NodeParser]:\n raise NotImplementedError(\n \"Please return the relevant LlamaIndex class in _get_li_class\"\n )\n\n def __init__(self, **params):\n self._li_cls = self._get_li_class()\n self._obj = self._li_cls(**params)\n self._kwargs = params\n super().__init__()\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name: str, value: Any) -> None:\n if name.startswith(\"_\") or name in self._protected_keywords():\n return super().__setattr__(name, value)\n\n self._kwargs[name] = value\n return setattr(self._obj, name, value)\n\n def __getattr__(self, name: str) -> Any:\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def run(\n self,\n documents: list[Document],\n **kwargs,\n ) -> list[Document]:\n \"\"\"Run Llama-index node parser and convert the output to Document from\n kotaemon\n \"\"\"\n docs = self._obj(documents, **kwargs) # type: ignore\n return [Document.from_dict(doc.to_dict()) for doc in docs]\n
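Following the docstring's own TokenSplitter example, here is a sketch of wrapping a different LlamaIndex node parser; the kotaemon import paths are assumptions based on the source locations above:

from kotaemon.indices.base import LlamaIndexDocTransformerMixin  # path assumed
from kotaemon.indices.splitters import BaseSplitter              # path assumed

class SentenceSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    # the mixin must come first so its __init__/__getattr__ win the MRO lookup
    def _get_li_class(self):
        from llama_index.core.node_parser import SentenceSplitter as LISentenceSplitter
        return LISentenceSplitter

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = splitter.run(my_documents)  # kotaemon Documents in, kotaemon Documents out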
"},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin.run","title":"run","text":"run(documents, **kwargs)\n
Run Llama-index node parser and convert the output to Document from kotaemon
Source code in libs/kotaemon/kotaemon/indices/base.py
def run(\n self,\n documents: list[Document],\n **kwargs,\n) -> list[Document]:\n \"\"\"Run Llama-index node parser and convert the output to Document from\n kotaemon\n \"\"\"\n docs = self._obj(documents, **kwargs) # type: ignore\n return [Document.from_dict(doc.to_dict()) for doc in docs]\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing","title":"BaseIndexing","text":" Bases: BaseComponent
Define the base interface for indexing pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
class BaseIndexing(BaseComponent):\n \"\"\"Define the base interface for indexing pipeline\"\"\"\n\n def to_retrieval_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n raise NotImplementedError\n\n def to_qa_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(**kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
def to_retrieval_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_qa_pipeline","title":"to_qa_pipeline","text":"to_qa_pipeline(**kwargs)\n
Convert the indexing pipeline to a QA pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
def to_qa_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseRetrieval","title":"BaseRetrieval","text":" Bases: BaseComponent
Define the base interface for retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
class BaseRetrieval(BaseComponent):\n \"\"\"Define the base interface for retrieval pipeline\"\"\"\n\n @abstractmethod\n def run(self, *args, **kwargs) -> list[RetrievedDocument]:\n ...\n
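Because the interface is a single abstract `run()` returning `RetrievedDocument` objects, a toy retriever is enough to plug into the rest of the stack. A minimal sketch with illustrative names, following the attribute-declaration style of the other components:

from kotaemon.base import RetrievedDocument      # import paths assumed
from kotaemon.indices.base import BaseRetrieval

class KeywordRetrieval(BaseRetrieval):
    """Hypothetical retriever scoring a small corpus by keyword overlap."""

    corpus: list[str] = []

    def run(self, text: str, top_k: int = 3) -> list[RetrievedDocument]:
        words = set(text.lower().split())
        scored = sorted(
            ((len(words & set(doc.lower().split())), doc) for doc in self.corpus),
            reverse=True,
        )
        return [
            RetrievedDocument(text=doc, score=float(score))
            for score, doc in scored[:top_k]
        ]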
"},{"location":"reference/indices/vectorindex/","title":"Vectorindex","text":""},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing","title":"VectorIndexing","text":" Bases: BaseIndexing
Ingest the document, run through the embedding, and store the embedding in a vector store.
This pipeline supports the following set of inputs: a list of documents, or a list of texts. Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorIndexing(BaseIndexing):\n \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n vector store.\n\n This pipeline supports the following set of inputs:\n - List of documents\n - List of texts\n \"\"\"\n\n cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n count_: int = 0\n\n def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n\n def to_qa_pipeline(self, *args, **kwargs):\n from .qa import CitationQAPipeline\n\n return TextVectorQA(\n retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n qa_pipeline=CitationQAPipeline(**kwargs),\n )\n\n def write_chunk_to_file(self, docs: list[Document]):\n # save the chunks content into markdown format\n if self.cache_dir:\n file_name = Path(docs[0].metadata[\"file_name\"])\n for i in range(len(docs)):\n markdown_content = \"\"\n if \"page_label\" in docs[i].metadata:\n page_label = str(docs[i].metadata[\"page_label\"])\n markdown_content += f\"Page label: {page_label}\"\n if \"file_name\" in docs[i].metadata:\n filename = docs[i].metadata[\"file_name\"]\n markdown_content += f\"\\nFile name: {filename}\"\n if \"section\" in docs[i].metadata:\n section = docs[i].metadata[\"section\"]\n markdown_content += f\"\\nSection: {section}\"\n if \"type\" in docs[i].metadata:\n if docs[i].metadata[\"type\"] == \"image\":\n image_origin = docs[i].metadata[\"image_origin\"]\n image_origin = f'<p><img src=\"{image_origin}\"></p>'\n markdown_content += f\"\\nImage origin: {image_origin}\"\n if docs[i].text:\n markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n \"w\",\n encoding=\"utf-8\",\n ) as f:\n f.write(markdown_content)\n\n def add_to_docstore(self, docs: list[Document]):\n if self.doc_store:\n print(\"Adding documents to doc store\")\n self.doc_store.add(docs)\n\n def add_to_vectorstore(self, docs: list[Document]):\n # in case we want to skip embedding\n if self.vector_store:\n print(f\"Getting embeddings for {len(docs)} nodes\")\n embeddings = self.embedding(docs)\n print(\"Adding embeddings to vector store\")\n self.vector_store.add(\n embeddings=embeddings,\n ids=[t.doc_id for t in docs],\n )\n\n def run(self, text: str | list[str] | Document | list[Document]):\n input_: list[Document] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in cast(list, text):\n if isinstance(item, str):\n input_.append(Document(text=item, id_=str(uuid.uuid4())))\n elif isinstance(item, Document):\n input_.append(item)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n self.add_to_vectorstore(input_)\n self.add_to_docstore(input_)\n self.write_chunk_to_file(input_)\n self.count_ += len(input_)\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(*args, **kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval","title":"VectorRetrieval","text":" Bases: BaseRetrieval
Retrieve list of documents from vector store
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorRetrieval(BaseRetrieval):\n \"\"\"Retrieve list of documents from vector store\"\"\"\n\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n rerankers: Sequence[BaseReranking] = []\n top_k: int = 5\n first_round_top_k_mult: int = 10\n retrieval_mode: str = \"hybrid\" # vector, text, hybrid\n\n def _filter_docs(\n self, documents: list[RetrievedDocument], top_k: int | None = None\n ):\n if top_k:\n documents = documents[:top_k]\n return documents\n\n def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n ) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = []\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval.run","title":"run","text":"run(text, top_k=None, **kwargs)\n
Retrieve a list of documents from vector store
Parameters:

- `text` (`str | Document`): the text to retrieve similar documents (required)
- `top_k` (`Optional[int]`): number of top similar documents to return. Default: `None`

Returns:

- `list[RetrievedDocument]`: list of retrieved documents
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = []\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
"},{"location":"reference/indices/extractors/","title":"Extractors","text":""},{"location":"reference/indices/extractors/doc_parsers/","title":"Doc Parsers","text":""},{"location":"reference/indices/ingests/","title":"Ingests","text":""},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor","title":"DocumentIngestor","text":" Bases: BaseComponent
Ingest common office document types into Document for indexing
Document types: pdf; xlsx, xls; docx, doc

Parameters:

- `pdf_mode`: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\" (normal: parse pdf text; mathpix: parse pdf text using mathpix; ocr: parse pdf image using flax) (required)
- `doc_parsers`: list of document parsers to parse the document (required)
- `text_splitter`: splitter to split the document into text nodes (required)
- `override_file_extractors`: override file extractors for specific file extensions. The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS` (required)

Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
class DocumentIngestor(BaseComponent):\n \"\"\"Ingest common office document types into Document for indexing\n\n Document types:\n - pdf\n - xlsx, xls\n - docx, doc\n\n Args:\n pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n - normal: parse pdf text\n - mathpix: parse pdf text using mathpix\n - ocr: parse pdf image using flax\n doc_parsers: list of document parsers to parse the document\n text_splitter: splitter to split the document into text nodes\n override_file_extractors: override file extractors for specific file extensions\n The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n \"\"\"\n\n pdf_mode: str = \"normal\" # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n text_splitter: BaseSplitter = TokenSplitter.withx(\n chunk_size=1024,\n chunk_overlap=256,\n separator=\"\\n\\n\",\n backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n )\n override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n def _get_reader(self, input_files: list[str | Path]):\n \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n file_extractors: dict[str, BaseReader] = {\n ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n }\n for ext, cls in self.override_file_extractors.items():\n file_extractors[ext] = cls()\n\n if self.pdf_mode == \"normal\":\n file_extractors[\".pdf\"] = PDFReader()\n elif self.pdf_mode == \"ocr\":\n file_extractors[\".pdf\"] = OCRReader()\n elif self.pdf_mode == \"multimodal\":\n file_extractors[\".pdf\"] = AdobeReader()\n else:\n file_extractors[\".pdf\"] = MathpixPDFReader()\n\n main_reader = DirectoryReader(\n input_files=input_files,\n file_extractor=file_extractors,\n )\n\n return main_reader\n\n def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
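A usage sketch (the file path is a placeholder; pdf_mode=\"ocr\" selects the OCRReader branch shown in _get_reader above):

ingestor = DocumentIngestor(pdf_mode="ocr")
nodes = ingestor.run("path/to/report.pdf")  # placeholder path; a list of paths also works
print(f"{len(nodes)} text nodes ready for indexing")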
"},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor.run","title":"run","text":"run(file_paths)\n
Ingest the file paths into Document
Parameters:

- `file_paths` (`list[str | Path] | str | Path`): list of file paths or a single file path (required)

Returns:

- `list[Document]`: list of parsed Documents
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/ingests/files/","title":"Files","text":""},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor","title":"DocumentIngestor","text":" Bases: BaseComponent
Ingest common office document types into Document for indexing
Document types: pdf; xlsx, xls; docx, doc

Parameters:

- `pdf_mode`: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\" (normal: parse pdf text; mathpix: parse pdf text using mathpix; ocr: parse pdf image using flax) (required)
- `doc_parsers`: list of document parsers to parse the document (required)
- `text_splitter`: splitter to split the document into text nodes (required)
- `override_file_extractors`: override file extractors for specific file extensions. The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS` (required)

Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
class DocumentIngestor(BaseComponent):\n \"\"\"Ingest common office document types into Document for indexing\n\n Document types:\n - pdf\n - xlsx, xls\n - docx, doc\n\n Args:\n pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n - normal: parse pdf text\n - mathpix: parse pdf text using mathpix\n - ocr: parse pdf image using flax\n doc_parsers: list of document parsers to parse the document\n text_splitter: splitter to split the document into text nodes\n override_file_extractors: override file extractors for specific file extensions\n The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n \"\"\"\n\n pdf_mode: str = \"normal\" # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n text_splitter: BaseSplitter = TokenSplitter.withx(\n chunk_size=1024,\n chunk_overlap=256,\n separator=\"\\n\\n\",\n backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n )\n override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n def _get_reader(self, input_files: list[str | Path]):\n \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n file_extractors: dict[str, BaseReader] = {\n ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n }\n for ext, cls in self.override_file_extractors.items():\n file_extractors[ext] = cls()\n\n if self.pdf_mode == \"normal\":\n file_extractors[\".pdf\"] = PDFReader()\n elif self.pdf_mode == \"ocr\":\n file_extractors[\".pdf\"] = OCRReader()\n elif self.pdf_mode == \"multimodal\":\n file_extractors[\".pdf\"] = AdobeReader()\n else:\n file_extractors[\".pdf\"] = MathpixPDFReader()\n\n main_reader = DirectoryReader(\n input_files=input_files,\n file_extractor=file_extractors,\n )\n\n return main_reader\n\n def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor.run","title":"run","text":"run(file_paths)\n
Ingest the file paths into Document
Parameters:

- `file_paths` (`list[str | Path] | str | Path`): list of file paths or a single file path (required)

Returns:

- `list[Document]`: list of parsed Documents
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/qa/","title":"Qa","text":""},{"location":"reference/indices/qa/#indices.qa.CitationPipeline","title":"CitationPipeline","text":" Bases: BaseComponent
Citation pipeline to extract cited evidences from source (based on input question)
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CitationPipeline(BaseComponent):\n \"\"\"Citation pipeline to extract cited evidences from source\n (based on input question)\"\"\"\n\n llm: BaseLLM\n\n def run(self, context: str, question: str):\n return self.invoke(context, question)\n\n def prepare_llm(self, context: str, question: str):\n schema = CiteEvidence.schema()\n function = {\n \"name\": schema[\"title\"],\n \"description\": schema[\"description\"],\n \"parameters\": schema,\n }\n llm_kwargs = {\n \"tools\": [{\"type\": \"function\", \"function\": function}],\n \"tool_choice\": \"required\",\n \"tools_pydantic\": [CiteEvidence],\n }\n messages = [\n SystemMessage(\n content=(\n \"You are a world class algorithm to answer \"\n \"questions with correct and exact citations.\"\n )\n ),\n HumanMessage(\n content=(\n \"Answer question using the following context. \"\n \"Use the provided function CiteEvidence() to cite your sources.\"\n )\n ),\n HumanMessage(content=context),\n HumanMessage(content=f\"Question: {question}\"),\n HumanMessage(\n content=(\n \"Tips: Make sure to cite your sources, \"\n \"and use the exact words from the context.\"\n )\n ),\n ]\n return messages, llm_kwargs\n\n def invoke(self, context: str, question: str):\n messages, llm_kwargs = self.prepare_llm(context, question)\n try:\n print(\"CitationPipeline: invoking LLM\")\n llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n print(\"CitationPipeline: finish invoking LLM\")\n if not llm_output.additional_kwargs.get(\"tool_calls\"):\n return None\n\n first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n if \"function\" in first_func:\n # openai and cohere format\n function_output = first_func[\"function\"][\"arguments\"]\n else:\n # anthropic format\n function_output = first_func[\"args\"]\n\n print(\"CitationPipeline:\", function_output)\n\n if isinstance(function_output, str):\n output = CiteEvidence.parse_raw(function_output)\n else:\n output = CiteEvidence.parse_obj(function_output)\n except Exception as e:\n print(e)\n return None\n\n return output\n\n async def ainvoke(self, context: str, question: str):\n raise NotImplementedError()\n
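A hedged invocation sketch; `my_llm` stands for any configured kotaemon `BaseLLM` that supports tool/function calls, which this pipeline relies on:

citation_pipeline = CitationPipeline(llm=my_llm)  # my_llm: tool-calling BaseLLM, assumed configured
evidence = citation_pipeline.run(
    context="Kotaemon is an open-source tool for chatting with your documents.",
    question="What is kotaemon?",
)
# invoke() returns None when the LLM makes no tool call or parsing fails
if evidence is not None:
    print(evidence.evidences)  # direct quotes lifted from the context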
"},{"location":"reference/indices/qa/#indices.qa.CitationQAPipeline","title":"CitationQAPipeline","text":" Bases: BaseComponent
Answering question from a text corpus with citation
Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
class CitationQAPipeline(BaseComponent):\n \"\"\"Answering question from a text corpus with citation\"\"\"\n\n qa_prompt_template: PromptTemplate = PromptTemplate(\n 'Answer the following question: \"{question}\". '\n \"The context is: \\n{context}\\nAnswer: \"\n )\n llm: BaseLLM = LCAzureChatOpenAI.withx(\n azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n openai_api_version=\"2023-07-01-preview\",\n deployment_name=\"dummy-q2-16k\",\n temperature=0,\n request_timeout=60,\n )\n citation_pipeline: CitationPipeline = Node(\n default_callback=lambda self: CitationPipeline(llm=self.llm)\n )\n\n def _format_doc_text(self, text: str) -> str:\n \"\"\"Format the text of each document\"\"\"\n return text.replace(\"\\n\", \" \")\n\n def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n \"\"\"Format the texts between all documents\"\"\"\n matched_texts: list[str] = [\n self._format_doc_text(doc.text) for doc in documents\n ]\n return \"\\n\\n\".join(matched_texts)\n\n def run(\n self,\n question: str,\n documents: list[RetrievedDocument],\n use_citation: bool = False,\n **kwargs\n ) -> Document:\n # retrieve relevant documents as context\n context = self._format_retrieved_context(documents)\n self.log_progress(\".context\", context=context)\n\n # generate the answer\n prompt = self.qa_prompt_template.populate(\n context=context,\n question=question,\n )\n self.log_progress(\".prompt\", prompt=prompt)\n answer_text = self.llm(prompt).text\n if use_citation:\n citation = self.citation_pipeline(context=context, question=question)\n else:\n citation = None\n\n answer = Document(text=answer_text, metadata={\"citation\": citation})\n return answer\n
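A sketch of running the QA step over documents you have already retrieved. Note the class ships a placeholder Azure LLM default (the "bleh-dummy" endpoint above), so in practice you pass your own `llm`:

qa = CitationQAPipeline(llm=my_llm)  # override the dummy Azure default
answer = qa.run(
    question="What is kotaemon?",
    documents=retrieved_docs,   # list[RetrievedDocument] from a retrieval pipeline
    use_citation=True,
)
print(answer.text)
print(answer.metadata["citation"])  # a CiteEvidence object, or None when disabled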
"},{"location":"reference/indices/qa/citation/","title":"Citation","text":""},{"location":"reference/indices/qa/citation/#indices.qa.citation.CiteEvidence","title":"CiteEvidence","text":" Bases: BaseModel
List of evidences (maximum 5) to support the answer.
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CiteEvidence(BaseModel):\n \"\"\"List of evidences (maximum 5) to support the answer.\"\"\"\n\n evidences: List[str] = Field(\n ...,\n description=(\n \"Each source should be a direct quote from the context, \"\n \"as a substring of the original content (max 15 words).\"\n ),\n )\n
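For reference, `prepare_llm` above turns this Pydantic model into an OpenAI-style function definition via `.schema()`; a quick way to inspect what the LLM is handed:

schema = CiteEvidence.schema()  # pydantic v1-style schema, exactly what prepare_llm reads
function = {
    "name": schema["title"],             # "CiteEvidence"
    "description": schema["description"],
    "parameters": schema,
}
print(function["name"], "->", list(schema["properties"]))  # ['evidences']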
"},{"location":"reference/indices/qa/citation/#indices.qa.citation.CitationPipeline","title":"CitationPipeline","text":" Bases: BaseComponent
Citation pipeline to extract cited evidences from source (based on input question)
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CitationPipeline(BaseComponent):\n \"\"\"Citation pipeline to extract cited evidences from source\n (based on input question)\"\"\"\n\n llm: BaseLLM\n\n def run(self, context: str, question: str):\n return self.invoke(context, question)\n\n def prepare_llm(self, context: str, question: str):\n schema = CiteEvidence.schema()\n function = {\n \"name\": schema[\"title\"],\n \"description\": schema[\"description\"],\n \"parameters\": schema,\n }\n llm_kwargs = {\n \"tools\": [{\"type\": \"function\", \"function\": function}],\n \"tool_choice\": \"required\",\n \"tools_pydantic\": [CiteEvidence],\n }\n messages = [\n SystemMessage(\n content=(\n \"You are a world class algorithm to answer \"\n \"questions with correct and exact citations.\"\n )\n ),\n HumanMessage(\n content=(\n \"Answer question using the following context. \"\n \"Use the provided function CiteEvidence() to cite your sources.\"\n )\n ),\n HumanMessage(content=context),\n HumanMessage(content=f\"Question: {question}\"),\n HumanMessage(\n content=(\n \"Tips: Make sure to cite your sources, \"\n \"and use the exact words from the context.\"\n )\n ),\n ]\n return messages, llm_kwargs\n\n def invoke(self, context: str, question: str):\n messages, llm_kwargs = self.prepare_llm(context, question)\n try:\n print(\"CitationPipeline: invoking LLM\")\n llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n print(\"CitationPipeline: finish invoking LLM\")\n if not llm_output.additional_kwargs.get(\"tool_calls\"):\n return None\n\n first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n if \"function\" in first_func:\n # openai and cohere format\n function_output = first_func[\"function\"][\"arguments\"]\n else:\n # anthropic format\n function_output = first_func[\"args\"]\n\n print(\"CitationPipeline:\", function_output)\n\n if isinstance(function_output, str):\n output = CiteEvidence.parse_raw(function_output)\n else:\n output = CiteEvidence.parse_obj(function_output)\n except Exception as e:\n print(e)\n return None\n\n return output\n\n async def ainvoke(self, context: str, question: str):\n raise NotImplementedError()\n
"},{"location":"reference/indices/qa/text_based/","title":"Text Based","text":""},{"location":"reference/indices/qa/text_based/#indices.qa.text_based.CitationQAPipeline","title":"CitationQAPipeline","text":" Bases: BaseComponent
Answering question from a text corpus with citation
Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
class CitationQAPipeline(BaseComponent):\n \"\"\"Answering question from a text corpus with citation\"\"\"\n\n qa_prompt_template: PromptTemplate = PromptTemplate(\n 'Answer the following question: \"{question}\". '\n \"The context is: \\n{context}\\nAnswer: \"\n )\n llm: BaseLLM = LCAzureChatOpenAI.withx(\n azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n openai_api_version=\"2023-07-01-preview\",\n deployment_name=\"dummy-q2-16k\",\n temperature=0,\n request_timeout=60,\n )\n citation_pipeline: CitationPipeline = Node(\n default_callback=lambda self: CitationPipeline(llm=self.llm)\n )\n\n def _format_doc_text(self, text: str) -> str:\n \"\"\"Format the text of each document\"\"\"\n return text.replace(\"\\n\", \" \")\n\n def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n \"\"\"Format the texts between all documents\"\"\"\n matched_texts: list[str] = [\n self._format_doc_text(doc.text) for doc in documents\n ]\n return \"\\n\\n\".join(matched_texts)\n\n def run(\n self,\n question: str,\n documents: list[RetrievedDocument],\n use_citation: bool = False,\n **kwargs\n ) -> Document:\n # retrieve relevant documents as context\n context = self._format_retrieved_context(documents)\n self.log_progress(\".context\", context=context)\n\n # generate the answer\n prompt = self.qa_prompt_template.populate(\n context=context,\n question=question,\n )\n self.log_progress(\".prompt\", prompt=prompt)\n answer_text = self.llm(prompt).text\n if use_citation:\n citation = self.citation_pipeline(context=context, question=question)\n else:\n citation = None\n\n answer = Document(text=answer_text, metadata={\"citation\": citation})\n return answer\n
"},{"location":"reference/indices/rankings/","title":"Rankings","text":""},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking","title":"BaseReranking","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
class BaseReranking(BaseComponent):\n @abstractmethod\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking.run","title":"run abstractmethod
","text":"run(documents, query)\n
Main method to transform list of documents (re-ranking, filtering, etc)
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
@abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
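To implement the interface, subclass BaseReranking and provide run. A toy illustration (not part of the library; it assumes Document is importable from kotaemon.base):

from kotaemon.base import Document  # assumed import path
from kotaemon.indices.rankings import BaseReranking

class KeywordOverlapReranking(BaseReranking):
    """Toy reranker: sort documents by how many terms they share with the query."""

    def run(self, documents: list[Document], query: str) -> list[Document]:
        terms = set(query.lower().split())
        return sorted(
            documents,
            key=lambda doc: len(terms & set(doc.get_content().lower().split())),
            reverse=True,
        )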
"},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking","title":"CohereReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
class CohereReranking(BaseReranking):\n model_name: str = \"rerank-multilingual-v2.0\"\n cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n use_key_from_ktem: bool = False\n\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking.run","title":"run","text":"run(documents, query)\n
Use Cohere Reranker model to re-order documents with their relevance score
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
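A short usage sketch, reusing the docs list from the QA example above (the query string is illustrative):

from kotaemon.indices.rankings import CohereReranking

reranker = CohereReranking(model_name="rerank-multilingual-v2.0")
# the API key is read from COHERE_API_KEY; with no key, documents pass through unranked
ranked = reranker(documents=docs, query="When is payment due?")
for doc in ranked:
    print(doc.metadata["cohere_reranking_score"], doc.content)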
"},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking","title":"LLMReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
class LLMReranking(BaseReranking):\n llm: BaseLLM\n prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n top_k: int = 3\n concurrent: bool = True\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
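A hedged usage sketch, reusing llm and docs from the sketches above:

from kotaemon.indices.rankings import LLMReranking

reranker = LLMReranking(llm=llm, top_k=3, concurrent=True)
kept = reranker(documents=docs, query="When is payment due?")
# if the LLM keeps nothing, the first top_k documents are returned as a fallback

One subtlety in the concurrent branch shown above: executor.submit(lambda: self.llm(_prompt).text) closes over the loop variable _prompt, so a late-starting worker can observe a later iteration's prompt. Passing the argument explicitly, as in executor.submit(self.llm, _prompt), pins each prompt to its own future.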
"},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring","title":"LLMScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
class LLMScoring(LLMReranking):\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
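Unlike LLMReranking, this variant keeps every document and attaches a probability-style score derived from the token logprobs (excluded documents get 1 - score). A sketch, assuming your LLM wrapper populates logprobs on its outputs:

from kotaemon.indices.rankings import LLMScoring

scorer = LLMScoring(llm=llm)  # the llm must return outputs with .logprobs set
scored = scorer(documents=docs, query="When is payment due?")
for doc in scored:
    print(doc.metadata["llm_reranking_score"])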
"},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring","title":"LLMTrulensScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
class LLMTrulensScoring(LLMReranking):\n llm: BaseLLM\n system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n concurrent: bool = True\n normalize: float = 10\n trim_func: TokenSplitter = TokenSplitter.withx(\n chunk_size=MAX_CONTEXT_LEN,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
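A usage sketch; the component asks the LLM for a 0-10 relevance rating per document and divides by normalize, so scores land in 0.0-1.0 and documents come back sorted best first:

from kotaemon.indices.rankings import LLMTrulensScoring

scorer = LLMTrulensScoring(llm=llm, normalize=10)
ranked = scorer(documents=docs, query="When is payment due?")
print([doc.metadata["llm_trulens_score"] for doc in ranked])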
"},{"location":"reference/indices/rankings/base/","title":"Base","text":""},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking","title":"BaseReranking","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
class BaseReranking(BaseComponent):\n @abstractmethod\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking.run","title":"run abstractmethod
","text":"run(documents, query)\n
Main method to transform list of documents (re-ranking, filtering, etc)
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
@abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/cohere/","title":"Cohere","text":""},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking","title":"CohereReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
class CohereReranking(BaseReranking):\n model_name: str = \"rerank-multilingual-v2.0\"\n cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n use_key_from_ktem: bool = False\n\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking.run","title":"run","text":"run(documents, query)\n
Use Cohere Reranker model to re-order documents with their relevance score
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/llm/","title":"Llm","text":""},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking","title":"LLMReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
class LLMReranking(BaseReranking):\n llm: BaseLLM\n prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n top_k: int = 3\n concurrent: bool = True\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_scoring/","title":"Llm Scoring","text":""},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring","title":"LLMScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
class LLMScoring(LLMReranking):\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/","title":"Llm Trulens","text":""},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.PATTERN_INTEGER","title":"PATTERN_INTEGER module-attribute
","text":"PATTERN_INTEGER = compile('([+-]?[1-9][0-9]*|0)')\n
Regex that matches integers.
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring","title":"LLMTrulensScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
class LLMTrulensScoring(LLMReranking):\n llm: BaseLLM\n system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n concurrent: bool = True\n normalize: float = 10\n trim_func: TokenSplitter = TokenSplitter.withx(\n chunk_size=MAX_CONTEXT_LEN,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.validate_rating","title":"validate_rating","text":"validate_rating(rating)\n
Validate a rating is between 0 and 10.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def validate_rating(rating) -> int:\n \"\"\"Validate a rating is between 0 and 10.\"\"\"\n\n if not 0 <= rating <= 10:\n raise ValueError(\"Rating must be between 0 and 10\")\n\n return rating\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.re_0_10_rating","title":"re_0_10_rating","text":"re_0_10_rating(s)\n
Extract a 0-10 rating from a string.
If the string does not match an integer or matches an integer outside the 0-10 range, raises an error instead. If multiple numbers are found within the expected 0-10 range, the smallest is returned.
Parameters:
Name Type Description Default
s
str
String to extract rating from.
required
Returns:
Name Type Description
int
int
Extracted rating.
Raises:
Type Description
ParseError
If no integers between 0 and 10 are found in the string.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def re_0_10_rating(s: str) -> int:\n \"\"\"Extract a 0-10 rating from a string.\n\n If the string does not match an integer or matches an integer outside the\n 0-10 range, raises an error instead. If multiple numbers are found within\n the expected 0-10 range, the smallest is returned.\n\n Args:\n s: String to extract rating from.\n\n Returns:\n int: Extracted rating.\n\n Raises:\n ParseError: If no integers between 0 and 10 are found in the string.\n \"\"\"\n\n matches = PATTERN_INTEGER.findall(s)\n if not matches:\n raise AssertionError\n\n vals = set()\n for match in matches:\n try:\n vals.add(validate_rating(int(match)))\n except ValueError:\n pass\n\n if not vals:\n raise AssertionError\n\n # Min to handle cases like \"The rating is 8 out of 10.\"\n return min(vals)\n
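Worked examples of the parsing rules above (behavior follows directly from the source):

re_0_10_rating("The rating is 8 out of 10.")  # -> 8: both 8 and 10 match, min() keeps 8
re_0_10_rating("42, so I'd say 7/10")         # -> 7: 42 fails validate_rating and is dropped
re_0_10_rating("no digits here")              # raises AssertionError (documented as ParseError)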
"},{"location":"reference/indices/splitters/","title":"Splitters","text":""},{"location":"reference/indices/splitters/#indices.splitters.BaseSplitter","title":"BaseSplitter","text":" Bases: DocTransformer
Represent base splitter class
Source code in libs/kotaemon/kotaemon/indices/splitters/__init__.py
class BaseSplitter(DocTransformer):\n \"\"\"Represent base splitter class\"\"\"\n\n ...\n
"},{"location":"reference/llms/","title":"LLMs","text":""},{"location":"reference/llms/#llms.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":" Bases: SimpleBranchingPipeline
A simple gated branching pipeline for executing multiple branches based on a condition.
This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.
Attributes:
Name Type Description
branches
List[BaseComponent]
The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class GatedBranchingPipeline(SimpleBranchingPipeline):\n \"\"\"\n A simple gated branching pipeline for executing multiple branches based on a\n condition.\n\n This class extends the SimpleBranchingPipeline class and adds the ability to execute\n the branches until a branch returns a non-empty output based on a condition.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = GatedBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n ```\n \"\"\"\n\n def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.GatedBranchingPipeline.run","title":"run","text":"run(*, condition_text=None, **prompt_kwargs)\n
Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.
Parameters:
Name Type Description Default
condition_text
str
The condition text to evaluate for each branch. Default to None.
None
**prompt_kwargs
Keyword arguments for the branches.
{}
Returns:
Type Description
Union[OutputType, None]: The output of the first branch that satisfies the
condition, or None if no branch satisfies the condition.
Raises:
Type Description
ValueError
If condition_text is None
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":" Bases: BaseComponent
A simple branching pipeline for executing multiple branches.
Attributes:
Name Type Description
branches
List[BaseComponent]
The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class SimpleBranchingPipeline(BaseComponent):\n \"\"\"\n A simple branching pipeline for executing multiple branches.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = SimpleBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n print(pipeline(condition_text=\"12\"))\n ```\n \"\"\"\n\n branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n\n def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"add_branch(component)\n
Add a new branch to the pipeline.
Parameters:
Name Type Description Default
component
BaseComponent
The branch component to be added.
required
Source code in libs/kotaemon/kotaemon/llms/branching.py
def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline.run","title":"run","text":"run(**prompt_kwargs)\n
Execute the pipeline by running each branch and return the outputs as a list.
Parameters:
Name Type Description Default**prompt_kwargs
Keyword arguments for the branches.
{}
Returns:
Name Type Description
List
The outputs of each branch as a list.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/#llms.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/#llms.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/#llms.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
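A construction sketch; the deployment name and API version below are placeholders for whatever your Azure resource exposes:

from kotaemon.llms import AzureChatOpenAI

llm = AzureChatOpenAI(
    api_key="...",
    azure_endpoint="https://<your-resource>.openai.azure.com/",
    azure_deployment="<your-deployment>",
    api_version="2024-02-01",  # assumed; use a version your resource supports
)
print(llm("Hello!").text)  # the call dispatches to the chat completions API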
"},{"location":"reference/llms/#llms.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/#llms.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Default
async_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/#llms.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
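A construction sketch; base_url is optional and only needed when targeting an OpenAI-compatible server instead of api.openai.com:

from kotaemon.llms import ChatOpenAI

llm = ChatOpenAI(
    api_key="sk-...",
    model="gpt-4o-mini",
    # base_url="http://localhost:8000/v1/",  # hypothetical OpenAI-compatible server
)
print(llm("Hello!").text)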
"},{"location":"reference/llms/#llms.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.
Attributes:
Name Type Description
endpoint_url
str
The URL of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of a OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
"},{"location":"reference/llms/#llms.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate response from messages.
Args:
messages (str | BaseMessage | list[BaseMessage]): history of messages to generate response from.
**kwargs: additional arguments to pass to the OpenAI API.
Returns:
LLMInterface: generated response.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/#llms.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
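A sketch, assuming the class is exported from kotaemon.llms and that a chat-completions server is listening on the (hypothetical) URL below:

from kotaemon.llms import EndpointChatLLM  # assumed export

llm = EndpointChatLLM(endpoint_url="http://localhost:8000/v1/chat/completions")  # hypothetical URL
print(llm("Hello!").content)  # run() returns an LLMInterface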
"},{"location":"reference/llms/#llms.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around the llama-cpp-python's Llama model
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/#llms.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
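For orientation, a minimal usage sketch (untested, and assuming LlamaCppChat is exported from kotaemon.llms; the GGUF path and chat format below are placeholders to replace with a model you actually have):

from kotaemon.llms import LlamaCppChat

llm = LlamaCppChat(
    model_path="/path/to/model.gguf",  # placeholder; or pass repo_id= and filename= instead
    chat_format="llama-2",             # must match the model family
    n_ctx=2048,
    n_gpu_layers=-1,                   # offload all layers if built with GPU support
)

# Blocking call: returns an LLMInterface whose .content holds the reply
print(llm.invoke("What is retrieval-augmented generation?").content)

# Streaming call: yields LLMInterface chunks as tokens arrive
for chunk in llm.stream("What is retrieval-augmented generation?"):
    print(chunk.content, end="", flush=True)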
"},{"location":"reference/llms/#llms.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
"},{"location":"reference/llms/#llms.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
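A hedged construction sketch for this wrapper (untested; the model path is a placeholder). Like the other Langchain-based completion classes on this page, the component can be called directly with a prompt string and returns an object whose .text holds the completion:

from kotaemon.llms import LlamaCpp

llm = LlamaCpp(
    model_path="/path/to/model.gguf",  # placeholder: any local GGUF weight
    n_ctx=2048,
    n_gpu_layers=-1,  # offload all layers if llama-cpp-python was built with GPU support
)
print(llm("Q: What is a vector store?\nA:").text)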
"},{"location":"reference/llms/#llms.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
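And an equivalent sketch for the OpenAI wrapper (untested; the key and model name are placeholders). Note that the text-davinci-003 default shown above has been retired by OpenAI, so a current completion-style model should be passed explicitly:

from kotaemon.llms import OpenAI

llm = OpenAI(
    openai_api_key="sk-...",              # placeholder
    model_name="gpt-3.5-turbo-instruct",  # a completion-style (non-chat) model
    temperature=0.7,
    max_tokens=256,
)
print(llm("Write a one-sentence summary of RAG:").text)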
"},{"location":"reference/llms/#llms.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":" Bases: BaseComponent
Perform sequential chain-of-thought with manual pre-defined prompts
This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought
. Please refer to that section for details on Thought. This section is about chaining thoughts together.
Usage:
Create and run a chain of thought without the \"+\" operator:
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>> prompt=\"Word {word} in {language} is \",\n>>> post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>> prompt=\"Translate {translated} to Japanese\",\n>>> post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Create and run a chain of thought with the \"+\" operator: Please refer to the kotaemon.pipelines.cot.Thought
section for examples.
This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
Source code in libs/kotaemon/kotaemon/llms/cot.py
class ManualSequentialChainOfThought(BaseComponent):\n \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n This method supports variable number of steps. Each step corresponds to a\n `kotaemon.pipelines.cot.Thought`. Please refer that section for\n Thought's detail. This section is about chaining thought together.\n\n _**Usage:**_\n\n **Create and run a chain of thought without \"+\" operator:**\n\n ```pycon\n >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n >>> llm = LCAzureChatOpenAI(...)\n >>> thought1 = Thought(\n >>> prompt=\"Word {word} in {language} is \",\n >>> post_process=lambda string: {\"translated\": string},\n >>> )\n >>> thought2 = Thought(\n >>> prompt=\"Translate {translated} to Japanese\",\n >>> post_process=lambda string: {\"output\": string},\n >>> )\n >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n >>> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n **Create and run a chain of thought without \"+\" operator:** Please refer the\n `kotaemon.pipelines.cot.Thought` section for examples.\n\n This chain-of-thought optionally takes a termination check callback function.\n This function will be called after each thought is executed. It takes in a\n dictionary of all thought outputs so far, and it returns True or False. If\n True, the chain-of-thought will terminate. If unset, the default callback always\n returns False.\n \"\"\"\n\n thoughts: List[Thought] = Param(\n default_callback=lambda *_: [], help=\"List of Thought\"\n )\n llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n terminate: Callable = Param(\n default=lambda _: False,\n help=\"Callback on terminate condition. Default to always return False\",\n )\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n\n def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=self.thoughts + [next_thought], llm=self.llm\n )\n
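To make the termination check concrete, here is a small sketch (untested; llm, thought1, and thought2 are assumed to be set up as in the example above, and stop_when_translated is a hypothetical callback). The chain stops as soon as the accumulated outputs contain a non-empty "translated" key:

def stop_when_translated(outputs: dict) -> bool:
    # Receives all thought outputs gathered so far; returning True stops the chain
    return bool(outputs.get("translated"))

chain = ManualSequentialChainOfThought(
    thoughts=[thought1, thought2],
    llm=llm,
    terminate=stop_when_translated,
)
# thought2 never runs: thought1 already sets "translated"
result = chain(word="hello", language="French")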
"},{"location":"reference/llms/#llms.ManualSequentialChainOfThought.run","title":"run","text":"run(**kwargs)\n
Run the manual chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n
"},{"location":"reference/llms/#llms.Thought","title":"Thought","text":" Bases: BaseComponent
A thought in the chain of thought
Input: **kwargs
pairs, where each key is a placeholder in the prompt and each value is the value to substitute. Output: an output dictionary. Usage:
Create and run a thought:
>> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n
Basically, when a thought is run, it will:
Populate the prompt template with the input **kwargs
. Run the LLM model with the populated prompt. Post-process the LLM output with the post-processor. This Thought
allows chaining sequentially with the + operator. For example:
>> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n>> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Under the hood, when the +
operator is used, a ManualSequentialChainOfThought
is created.
Source code in libs/kotaemon/kotaemon/llms/cot.py
class Thought(BaseComponent):\n \"\"\"A thought in the chain of thought\n\n - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n value is the value.\n - Output: an output dictionary\n\n _**Usage:**_\n\n Create and run a thought:\n\n ```python\n >> from kotaemon.pipelines.cot import Thought\n >> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n >> output = thought(action=\"install\", object=\"python\")\n >> print(output)\n {'tutorial': 'As an AI language model,...'}\n ```\n\n Basically, when a thought is run, it will:\n\n 1. Populate the prompt template with the input `**kwargs`.\n 2. Run the LLM model with the populated prompt.\n 3. Post-process the LLM output with the post-processor.\n\n This `Thought` allows chaining sequentially with the + operator. For example:\n\n ```python\n >> llm = LCAzureChatOpenAI(...)\n >> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n >> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n >> thought = thought1 + thought2\n >> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n is created.\n \"\"\"\n\n prompt: str = Param(\n help=(\n \"The prompt template string. This prompt template has Python-like variable\"\n \" placeholders, that then will be substituted with real values when this\"\n \" component is executed\"\n )\n )\n llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n post_process: Function = Node(\n help=(\n \"The function post-processor that post-processes LLM output prediction .\"\n \"It should take a string as input (this is the LLM output text) and return \"\n \"a dictionary, where the key should\"\n )\n )\n\n @Node.auto(depends_on=\"prompt\")\n def prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n\n def get_variables(self) -> List[str]:\n return []\n\n def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=[self, next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/#llms.Thought.prompt_template","title":"prompt_template","text":"prompt_template()\n
Automatically wrap around param prompt. Can ignore
Source code in libs/kotaemon/kotaemon/llms/cot.py
@Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n
"},{"location":"reference/llms/#llms.Thought.run","title":"run","text":"run(**kwargs)\n
Run the chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n
"},{"location":"reference/llms/#llms.GatedLinearPipeline","title":"GatedLinearPipeline","text":" Bases: SimpleLinearPipeline
A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.
Attributes:
Name | Type | Description
condition | Callable[[IO_Type], Any] | A callable function that represents the condition.
Usage Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class GatedLinearPipeline(SimpleLinearPipeline):\n \"\"\"\n A pipeline that extends the SimpleLinearPipeline class and adds a condition\n attribute.\n\n Attributes:\n condition (Callable[[IO_Type], Any]): A callable function that represents the\n condition.\n\n Usage:\n ```{.py3 title=\"Example Usage\"}\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n ```\n \"\"\"\n\n condition: Callable[[IO_Type], Any]\n\n def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.GatedLinearPipeline.run","title":"run","text":"run(\n *,\n condition_text=None,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the pipeline with the given arguments and return the final output as a Document object.
Parameters:
Name | Type | Description | Default
condition_text | str | The condition text to evaluate. Default to None. | None
llm_kwargs | dict | Additional keyword arguments for the language model call. | {}
post_processor_kwargs | dict | Additional keyword arguments for the post-processor. | {}
**prompt_kwargs | | Keyword arguments for populating the prompt. | {}
Returns:
Type | Description
Document | The final output of the pipeline as a Document object.
Raises:
Type | Description
ValueError | If condition_text is None
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":" Bases: BaseComponent
A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.
Attributes:
Name | Type | Description
prompt | BasePromptComponent | The prompt component used to generate the initial input.
llm | Union[ChatLLM, LLM] | The language model component used to generate the output.
post_processor | Union[BaseComponent, Callable[[IO_Type], IO_Type]] | An optional post-processor component or function.
Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n return x\n\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class SimpleLinearPipeline(BaseComponent):\n \"\"\"\n A simple pipeline for running a function with a prompt, a language model, and an\n optional post-processor.\n\n Attributes:\n prompt (BasePromptComponent): The prompt component used to generate the initial\n input.\n llm (Union[ChatLLM, LLM]): The language model component used to generate the\n output.\n post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n post-processor component or function.\n\n Example Usage:\n ```python\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(word=\"lone\"))\n ```\n \"\"\"\n\n prompt: BasePromptComponent\n llm: Union[ChatLLM, LLM]\n post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/#llms.SimpleLinearPipeline.run","title":"run","text":"run(\n *,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the function with the given arguments and return the final output as a Document object.
Parameters:
Name | Type | Description | Default
llm_kwargs | dict | Keyword arguments for the llm call. | {}
post_processor_kwargs | dict | Keyword arguments for the post_processor. | {}
**prompt_kwargs | | Keyword arguments for populating the prompt. | {}
Returns:
Type | Description
Document | The final output of the function as a Document object.
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/#llms.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
Name | Type | Description | Default
template | PromptTemplate | The prompt template. | required
**kwargs | | Any additional keyword arguments that will be used to populate the given template. | {}
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Raises:\n ValueError: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the 
object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
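A minimal usage sketch grounded in the source above (untested): placeholders are filled at call time, callables are invoked to produce their value, and the populated prompt comes back as a Document:

from datetime import date
from kotaemon.llms import BasePromptComponent

prompt = BasePromptComponent(
    template="Today is {today}. Translate {word} to {language}."
)

# Values may be str, int, Document, or a callable evaluated at run time
doc = prompt(word="hello", language="French", today=lambda: str(date.today()))
print(doc.text)      # e.g. "Today is 2024-01-01. Translate hello to French."
print(doc.metadata)  # {'origin': 'PromptComponent'}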
"},{"location":"reference/llms/#llms.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set
but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
Name | Type | Description | Default
kwargs | dict | A dictionary with the attribute names as keys and the new values as values. | {}
Returns:
Type | Description
None
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/#llms.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
Name | Type | Description | Default
**kwargs | | The keyword arguments to pass to the function. | {}
Returns:
The result of calling the populate method of the template object with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
"},{"location":"reference/llms/#llms.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the 
concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
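A short behavioral sketch based on the source above (untested; assumes PromptTemplate is importable from kotaemon.llms):

from kotaemon.llms import PromptTemplate

tpl = PromptTemplate("Translate {text} into {language}.")
print(tpl.placeholders)  # {'text', 'language'}

print(tpl.populate(text="hi", language="French"))  # strict: every key required
print(tpl.partial_populate(text="hi"))             # the {language} placeholder stays unfilled

# "+" concatenates two templates with a newline in between
combined = tpl + PromptTemplate("Use a {tone} tone.")
print(combined.populate(text="hi", language="French", tone="formal"))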
"},{"location":"reference/llms/#llms.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks if all the expected placeholders in the template are set as attributes of the object. If any placeholders are missing, a ValueError
is raised with the names of the missing keys.
Returns:
Type | Description
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/#llms.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check whether any of the provided keyword arguments are redundant.
This function checks whether every provided keyword argument corresponds to a placeholder in the template. If any keys are not in the template, a UserWarning
is emitted listing the redundant keys.
Returns:
Type | Description
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n
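For illustration, a hedged sketch of the warning path described above (untested):

import warnings

from kotaemon.llms import PromptTemplate

tpl = PromptTemplate("Hello {name}")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tpl.check_redundant_kwargs(name="Alice", extra="ignored")
print(caught[0].message)  # Keys provided but not in template: extra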
"},{"location":"reference/llms/#llms.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
Name | Type | Description | Default
**kwargs | | The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. | {}
Returns:
Type | Description
str | The populated template.
Raises:
Type | Description
ValueError | If any template placeholder is missing from the keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/#llms.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
Name | Type | Description | Default
**kwargs | | The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. | {}
Returns:
Type | Description
str | The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
"},{"location":"reference/llms/base/","title":"Base","text":""},{"location":"reference/llms/branching/","title":"Branching","text":""},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":" Bases: BaseComponent
A simple branching pipeline for executing multiple branches.
Attributes:
Name | Type | Description
branches | List[BaseComponent] | The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class SimpleBranchingPipeline(BaseComponent):\n \"\"\"\n A simple branching pipeline for executing multiple branches.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = SimpleBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n print(pipeline(condition_text=\"12\"))\n ```\n \"\"\"\n\n branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n\n def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"add_branch(component)\n
Add a new branch to the pipeline.
Parameters:
Name | Type | Description | Default
component | BaseComponent | The branch component to be added. | required
Source code in libs/kotaemon/kotaemon/llms/branching.py
def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n
"},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.run","title":"run","text":"run(**prompt_kwargs)\n
Execute the pipeline by running each branch and return the outputs as a list.
Parameters:
Name | Type | Description | Default
**prompt_kwargs | | Keyword arguments for the branches. | {}
Returns:
Type | Description
List | The outputs of each branch as a list.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":" Bases: SimpleBranchingPipeline
A simple gated branching pipeline for executing multiple branches based on a condition.
This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.
Attributes:
Name | Type | Description
branches | List[BaseComponent] | The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class GatedBranchingPipeline(SimpleBranchingPipeline):\n \"\"\"\n A simple gated branching pipeline for executing multiple branches based on a\n condition.\n\n This class extends the SimpleBranchingPipeline class and adds the ability to execute\n the branches until a branch returns a non-empty output based on a condition.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = GatedBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n ```\n \"\"\"\n\n def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline.run","title":"run","text":"run(*, condition_text=None, **prompt_kwargs)\n
Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.
Parameters:
Name | Type | Description | Default
condition_text | str | The condition text to evaluate for each branch. Default to None. | None
**prompt_kwargs | | Keyword arguments for the branches. | {}
Returns:
Type | Description
Union[OutputType, None] | The output of the first branch that satisfies the condition, or None if no branch satisfies the condition.
Raises:
Type | Description
ValueError | If condition_text is None
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/cot/","title":"Cot","text":""},{"location":"reference/llms/cot/#llms.cot.Thought","title":"Thought","text":" Bases: BaseComponent
A thought in the chain of thought
Input: **kwargs
pairs, where each key is a placeholder in the prompt and each value is the value to substitute. Output: an output dictionary. Usage:
Create and run a thought:
>> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n
Basically, when a thought is run, it will:
Populate the prompt template with the input **kwargs
. Run the LLM model with the populated prompt. Post-process the LLM output with the post-processor. This Thought
allows chaining sequentially with the + operator. For example:
>> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n>> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Under the hood, when the +
operator is used, a ManualSequentialChainOfThought
is created.
Source code in libs/kotaemon/kotaemon/llms/cot.py
class Thought(BaseComponent):\n \"\"\"A thought in the chain of thought\n\n - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n value is the value.\n - Output: an output dictionary\n\n _**Usage:**_\n\n Create and run a thought:\n\n ```python\n >> from kotaemon.pipelines.cot import Thought\n >> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n >> output = thought(action=\"install\", object=\"python\")\n >> print(output)\n {'tutorial': 'As an AI language model,...'}\n ```\n\n Basically, when a thought is run, it will:\n\n 1. Populate the prompt template with the input `**kwargs`.\n 2. Run the LLM model with the populated prompt.\n 3. Post-process the LLM output with the post-processor.\n\n This `Thought` allows chaining sequentially with the + operator. For example:\n\n ```python\n >> llm = LCAzureChatOpenAI(...)\n >> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n >> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n >> thought = thought1 + thought2\n >> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n is created.\n \"\"\"\n\n prompt: str = Param(\n help=(\n \"The prompt template string. This prompt template has Python-like variable\"\n \" placeholders, that then will be substituted with real values when this\"\n \" component is executed\"\n )\n )\n llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n post_process: Function = Node(\n help=(\n \"The function post-processor that post-processes LLM output prediction .\"\n \"It should take a string as input (this is the LLM output text) and return \"\n \"a dictionary, where the key should\"\n )\n )\n\n @Node.auto(depends_on=\"prompt\")\n def prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n\n def get_variables(self) -> List[str]:\n return []\n\n def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=[self, next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/cot/#llms.cot.Thought.prompt_template","title":"prompt_template","text":"prompt_template()\n
Automatically wrap around param prompt. Can ignore
Source code in libs/kotaemon/kotaemon/llms/cot.py
@Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n
"},{"location":"reference/llms/cot/#llms.cot.Thought.run","title":"run","text":"run(**kwargs)\n
Run the chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n
"},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":" Bases: BaseComponent
Perform sequential chain-of-thought with manual pre-defined prompts
This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought
. Please refer to that section for details on Thought. This section is about chaining thoughts together.
Usage:
Create and run a chain of thought without the \"+\" operator:
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>> prompt=\"Word {word} in {language} is \",\n>>> post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>> prompt=\"Translate {translated} to Japanese\",\n>>> post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Create and run a chain of thought with the \"+\" operator: Please refer to the kotaemon.pipelines.cot.Thought
section for examples.
This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
Source code in libs/kotaemon/kotaemon/llms/cot.py
class ManualSequentialChainOfThought(BaseComponent):\n \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n This method supports variable number of steps. Each step corresponds to a\n `kotaemon.pipelines.cot.Thought`. Please refer that section for\n Thought's detail. This section is about chaining thought together.\n\n _**Usage:**_\n\n **Create and run a chain of thought without \"+\" operator:**\n\n ```pycon\n >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n >>> llm = LCAzureChatOpenAI(...)\n >>> thought1 = Thought(\n >>> prompt=\"Word {word} in {language} is \",\n >>> post_process=lambda string: {\"translated\": string},\n >>> )\n >>> thought2 = Thought(\n >>> prompt=\"Translate {translated} to Japanese\",\n >>> post_process=lambda string: {\"output\": string},\n >>> )\n >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n >>> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n **Create and run a chain of thought without \"+\" operator:** Please refer the\n `kotaemon.pipelines.cot.Thought` section for examples.\n\n This chain-of-thought optionally takes a termination check callback function.\n This function will be called after each thought is executed. It takes in a\n dictionary of all thought outputs so far, and it returns True or False. If\n True, the chain-of-thought will terminate. If unset, the default callback always\n returns False.\n \"\"\"\n\n thoughts: List[Thought] = Param(\n default_callback=lambda *_: [], help=\"List of Thought\"\n )\n llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n terminate: Callable = Param(\n default=lambda _: False,\n help=\"Callback on terminate condition. Default to always return False\",\n )\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n\n def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=self.thoughts + [next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought.run","title":"run","text":"run(**kwargs)\n
Run the manual chain of thought
Source code inlibs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n
"},{"location":"reference/llms/linear/","title":"Linear","text":""},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":" Bases: BaseComponent
A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.
Attributes:
Name Type Descriptionprompt
BasePromptComponent
The prompt component used to generate the initial input.
llm
Union[ChatLLM, LLM]
The language model component used to generate the output.
post_processor
Union[BaseComponent, Callable[[IO_Type], IO_Type]]
An optional post-processor component or function.
Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n return x\n\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class SimpleLinearPipeline(BaseComponent):\n \"\"\"\n A simple pipeline for running a function with a prompt, a language model, and an\n optional post-processor.\n\n Attributes:\n prompt (BasePromptComponent): The prompt component used to generate the initial\n input.\n llm (Union[ChatLLM, LLM]): The language model component used to generate the\n output.\n post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n post-processor component or function.\n\n Example Usage:\n ```python\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(word=\"lone\"))\n ```\n \"\"\"\n\n prompt: BasePromptComponent\n llm: Union[ChatLLM, LLM]\n post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline.run","title":"run","text":"run(\n *,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the function with the given arguments and return the final output as a Document object.
Parameters:
Name Type Description Defaultllm_kwargs
dict
Keyword arguments for the llm call.
{}
post_processor_kwargs
dict
Keyword arguments for the post_processor.
{}
**prompt_kwargs
Keyword arguments for populating the prompt.
{}
Returns:
Name Type DescriptionDocument
The final output of the function as a Document object.
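For instance, per-call overrides can be threaded through these keyword groups; a sketch reusing the pipeline built in the example above, and assuming keyword arguments pass through the component call to run:
output = pipeline(\n    llm_kwargs={\"temperature\": 0.2},  # forwarded to the llm call\n    post_processor_kwargs={},  # forwarded to the post_processor\n    word=\"cat\",  # fills the {word} slot in the prompt\n)\nprint(output)\n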
Source code inlibs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline","title":"GatedLinearPipeline","text":" Bases: SimpleLinearPipeline
A pipeline that extends the SimpleLinearPipeline class with a condition attribute, which gates whether the underlying pipeline runs for a given input.
Attributes:
Name Type Descriptioncondition
Callable[[IO_Type], Any]
A callable function that represents the condition.
Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class GatedLinearPipeline(SimpleLinearPipeline):\n \"\"\"\n A pipeline that extends the SimpleLinearPipeline class and adds a condition\n attribute.\n\n Attributes:\n condition (Callable[[IO_Type], Any]): A callable function that represents the\n condition.\n\n Usage:\n ```{.py3 title=\"Example Usage\"}\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n ```\n \"\"\"\n\n condition: Callable[[IO_Type], Any]\n\n def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline.run","title":"run","text":"run(\n *,\n condition_text=None,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the pipeline with the given arguments and return the final output as a Document object.
Parameters:
Name Type Description Defaultcondition_text
str
The condition text to evaluate. Defaults to None.
None
llm_kwargs
dict
Additional keyword arguments for the language model call.
{}
post_processor_kwargs
dict
Additional keyword arguments for the post-processor.
{}
**prompt_kwargs
Keyword arguments for populating the prompt.
{}
Returns:
Name Type DescriptionDocument
Document
The final output of the pipeline as a Document object.
Raises:
Type DescriptionValueError
If condition_text is None
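Because a failed condition yields Document(None), a caller can branch on the gated result; a small sketch, assuming the wrapped value is exposed as .content:
result = pipeline(condition_text=\"other pattern\", word=\"lone\")\nif result.content is None:\n    # the condition did not match, so the prompt and llm were never called\n    print(\"gated: pipeline skipped\")\nelse:\n    print(result.content)\n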
Source code inlibs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/chats/","title":"Chats","text":""},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that generates responses by calling a remote endpoint, which is expected to be OpenAI API-compatible.
Attributes:
Name Type Descriptionendpoint_url
str
The URL of an OpenAI API-compatible endpoint.
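A minimal usage sketch (the URL is a placeholder, and the import path assumes EndpointChatLLM is re-exported from kotaemon.llms):
from kotaemon.llms import EndpointChatLLM\n\nllm = EndpointChatLLM(\n    endpoint_url=\"http://localhost:8000/v1/chat/completions\",  # placeholder\n)\nresponse = llm(\"What is 2 + 2?\")  # calling the component delegates to run()\nprint(response.content)\n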
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of a OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
"},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate a response from messages. Args: messages (str | BaseMessage | list[BaseMessage]): history of messages to generate the response from. **kwargs: additional arguments to pass to the OpenAI API. Returns: LLMInterface: the generated response.
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
"},{"location":"reference/llms/chats/#llms.chats.LCChatMixin","title":"LCChatMixin","text":"Mixin for langchain based chat models
Source code inlibs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n \"\"\"Mixin for langchain based chat models\"\"\"\n\n def _get_lc_class(self):\n raise NotImplementedError(\n \"Please return the relevant Langchain class in in _get_lc_class\"\n )\n\n def _get_tool_call_kwargs(self):\n return {}\n\n def __init__(self, stream: bool = False, **params):\n self._lc_class = self._get_lc_class()\n self._obj = self._lc_class(**params)\n self._kwargs: dict = params\n self._stream = stream\n\n super().__init__()\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n if self._stream:\n return self.stream(messages, **kwargs) # type: ignore\n return self.invoke(messages, **kwargs)\n\n def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n return input_\n\n def prepare_response(self, pred):\n all_text = [each.text for each in pred.generations[0]]\n all_messages = [each.message for each in pred.generations[0]]\n\n completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n try:\n if pred.llm_output is not None:\n completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n except Exception:\n pass\n\n return LLMInterface(\n text=all_text[0] if len(all_text) > 0 else \"\",\n candidates=all_text,\n completion_tokens=completion_tokens,\n total_tokens=total_tokens,\n prompt_tokens=prompt_tokens,\n messages=all_messages,\n logits=[],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n input_ = self.prepare_message(messages)\n pred = await self._obj.agenerate(messages=[input_], **kwargs)\n return self.prepare_response(pred)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n for response in self._obj.stream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n async for response in self._obj.astream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n def to_langchain_format(self):\n return self._obj\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n 
kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name, value):\n if name == \"_lc_class\":\n return super().__setattr__(name, value)\n\n if name in self._lc_class.__fields__:\n self._kwargs[name] = value\n self._obj = self._lc_class(**self._kwargs)\n else:\n super().__setattr__(name, value)\n\n def __getattr__(self, name):\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def specs(self, path: str):\n path = path.strip(\".\")\n if \".\" in path:\n raise ValueError(\"path should not contain '.'\")\n\n if path in self._lc_class.__fields__:\n return {\n \"__type__\": \"theflow.base.ParamAttr\",\n \"refresh_on_set\": True,\n \"strict_type\": True,\n }\n\n raise ValueError(f\"Invalid param {path}\")\n
"},{"location":"reference/llms/chats/#llms.chats.LCChatMixin.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Generate response from messages
Parameters:
Name Type Description Defaultmessages
str | BaseMessage | list[BaseMessage]
history of messages to generate response from
required**kwargs
additional arguments to pass to the langchain chat model
{}
Returns:
Name Type DescriptionLLMInterface
LLMInterface
generated response
Source code inlibs/kotaemon/kotaemon/llms/chats/langchain_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n
"},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around llama-cpp-python's Llama model
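Loading a local GGUF file might look like this (the path is a placeholder, the import assumes a re-export from kotaemon.llms, and chat_format must match the model family):
from kotaemon.llms import LlamaCppChat  # assumed re-export\n\nllm = LlamaCppChat(\n    model_path=\"/path/to/model.gguf\",  # placeholder path\n    chat_format=\"llama-2\",  # must match the model family\n    n_ctx=2048,\n    n_gpu_layers=-1,  # offload all layers to GPU if available\n)\nprint(llm.invoke(\"Hello!\").content)\n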
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n model_path: Optional[str] = Param(\n help=\"Path to the model file. This is required to load the model.\",\n )\n repo_id: Optional[str] = Param(\n help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n )\n filename: Optional[str] = Param(\n help=\"A filename or glob pattern to match the model file in the repo.\"\n )\n chat_format: str = Param(\n help=(\n \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n \"list of supported formats. If blank, the chat format will be auto-\"\n \"inferred.\"\n ),\n required=True,\n )\n lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n n_gpu_layers: Optional[int] = Param(\n 0,\n help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n )\n use_mmap: Optional[bool] = Param(\n True,\n help=(),\n )\n vocab_only: Optional[bool] = Param(\n False,\n help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n )\n\n _role_mapper: dict[str, str] = {\n \"human\": \"user\",\n \"system\": \"system\",\n \"ai\": \"assistant\",\n }\n\n @Param.auto()\n def client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. 
\"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[dict]:\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n output_ = [\n {\"role\": self._role_mapper[each.type], \"content\": each.content}\n for each in input_\n ]\n\n return output_\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n\n pred: \"CCCR\" = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=False,\n )\n\n return LLMInterface(\n content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n candidates=[\n c[\"message\"][\"content\"]\n for c in pred[\"choices\"]\n if c[\"message\"][\"content\"]\n ],\n completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n total_tokens=pred[\"usage\"][\"total_tokens\"],\n prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n )\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n pred = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=True,\n )\n for chunk in pred:\n if not chunk[\"choices\"]:\n continue\n\n if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n continue\n\n yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
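A construction sketch with placeholder credentials (the import assumes a re-export from kotaemon.llms):
from kotaemon.llms import AzureChatOpenAI  # assumed re-export\n\nllm = AzureChatOpenAI(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",  # placeholder\n    azure_deployment=\"<deployment-name>\",  # placeholder\n    api_version=\"2024-02-01\",\n    api_key=\"<api-key>\",  # placeholder\n)\nprint(llm.invoke(\"Hello!\").content)\n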
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/base/","title":"Base","text":""},{"location":"reference/llms/chats/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that generates responses by calling a remote endpoint, which is expected to be OpenAI API-compatible.
Attributes:
Name Type Descriptionendpoint_url
str
The URL of an OpenAI API-compatible endpoint.
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of a OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
"},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate a response from messages. Args: messages (str | BaseMessage | list[BaseMessage]): history of messages to generate the response from. **kwargs: additional arguments to pass to the OpenAI API. Returns: LLMInterface: the generated response.
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code inlibs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
"},{"location":"reference/llms/chats/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin","title":"LCChatMixin","text":"Mixin for langchain based chat models
Source code inlibs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n \"\"\"Mixin for langchain based chat models\"\"\"\n\n def _get_lc_class(self):\n raise NotImplementedError(\n \"Please return the relevant Langchain class in in _get_lc_class\"\n )\n\n def _get_tool_call_kwargs(self):\n return {}\n\n def __init__(self, stream: bool = False, **params):\n self._lc_class = self._get_lc_class()\n self._obj = self._lc_class(**params)\n self._kwargs: dict = params\n self._stream = stream\n\n super().__init__()\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n if self._stream:\n return self.stream(messages, **kwargs) # type: ignore\n return self.invoke(messages, **kwargs)\n\n def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n return input_\n\n def prepare_response(self, pred):\n all_text = [each.text for each in pred.generations[0]]\n all_messages = [each.message for each in pred.generations[0]]\n\n completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n try:\n if pred.llm_output is not None:\n completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n except Exception:\n pass\n\n return LLMInterface(\n text=all_text[0] if len(all_text) > 0 else \"\",\n candidates=all_text,\n completion_tokens=completion_tokens,\n total_tokens=total_tokens,\n prompt_tokens=prompt_tokens,\n messages=all_messages,\n logits=[],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n input_ = self.prepare_message(messages)\n pred = await self._obj.agenerate(messages=[input_], **kwargs)\n return self.prepare_response(pred)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n for response in self._obj.stream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n async for response in self._obj.astream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n def to_langchain_format(self):\n return self._obj\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n 
kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name, value):\n if name == \"_lc_class\":\n return super().__setattr__(name, value)\n\n if name in self._lc_class.__fields__:\n self._kwargs[name] = value\n self._obj = self._lc_class(**self._kwargs)\n else:\n super().__setattr__(name, value)\n\n def __getattr__(self, name):\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def specs(self, path: str):\n path = path.strip(\".\")\n if \".\" in path:\n raise ValueError(\"path should not contain '.'\")\n\n if path in self._lc_class.__fields__:\n return {\n \"__type__\": \"theflow.base.ParamAttr\",\n \"refresh_on_set\": True,\n \"strict_type\": True,\n }\n\n raise ValueError(f\"Invalid param {path}\")\n
"},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Generate response from messages
Parameters:
Name Type Description Defaultmessages
str | BaseMessage | list[BaseMessage]
history of messages to generate response from
required**kwargs
additional arguments to pass to the langchain chat model
{}
Returns:
Name Type DescriptionLLMInterface
LLMInterface
generated response
Source code inlibs/kotaemon/kotaemon/llms/chats/langchain_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n
"},{"location":"reference/llms/chats/llamacpp/","title":"Llamacpp","text":""},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around llama-cpp-python's Llama model
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n model_path: Optional[str] = Param(\n help=\"Path to the model file. This is required to load the model.\",\n )\n repo_id: Optional[str] = Param(\n help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n )\n filename: Optional[str] = Param(\n help=\"A filename or glob pattern to match the model file in the repo.\"\n )\n chat_format: str = Param(\n help=(\n \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n \"list of supported formats. If blank, the chat format will be auto-\"\n \"inferred.\"\n ),\n required=True,\n )\n lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n n_gpu_layers: Optional[int] = Param(\n 0,\n help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n )\n use_mmap: Optional[bool] = Param(\n True,\n help=(),\n )\n vocab_only: Optional[bool] = Param(\n False,\n help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n )\n\n _role_mapper: dict[str, str] = {\n \"human\": \"user\",\n \"system\": \"system\",\n \"ai\": \"assistant\",\n }\n\n @Param.auto()\n def client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. 
\"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[dict]:\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n output_ = [\n {\"role\": self._role_mapper[each.type], \"content\": each.content}\n for each in input_\n ]\n\n return output_\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n\n pred: \"CCCR\" = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=False,\n )\n\n return LLMInterface(\n content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n candidates=[\n c[\"message\"][\"content\"]\n for c in pred[\"choices\"]\n if c[\"message\"][\"content\"]\n ],\n completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n total_tokens=pred[\"usage\"][\"total_tokens\"],\n prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n )\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n pred = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=True,\n )\n for chunk in pred:\n if not chunk[\"choices\"]:\n continue\n\n if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n continue\n\n yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
"},{"location":"reference/llms/chats/openai/","title":"Openai","text":""},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI","title":"BaseChatOpenAI","text":" Bases: ChatLLM
Base interface for OpenAI chat models, using the openai library
This class exposes the parameters in resources.Chat. To subclass this class:
- Implement the `prepare_client` method to return the OpenAI client\n- Implement the `openai_response` method to return the OpenAI response\n- Implement the params related to the OpenAI client\n
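As a sketch, a subclass for a self-hosted OpenAI-compatible server might look like the following (the class, its parameters, and the import paths are illustrative, not part of kotaemon):
from kotaemon.base import Param  # assumed path\nfrom kotaemon.llms.chats.openai import BaseChatOpenAI  # assumed path\n\nclass MyServerChat(BaseChatOpenAI):  # hypothetical subclass\n    base_url: str = Param(help=\"Server URL\", required=True)\n    model: str = Param(help=\"Model name\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        from openai import AsyncOpenAI, OpenAI\n\n        params = {\"api_key\": self.api_key, \"base_url\": self.base_url}\n        return AsyncOpenAI(**params) if async_version else OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        params = {\"model\": self.model, \"temperature\": self.temperature}\n        params = {k: v for k, v in params.items() if v is not None}\n        params.update(kwargs)\n        return client.chat.completions.create(**params)\n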
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class BaseChatOpenAI(ChatLLM):\n \"\"\"Base interface for OpenAI chat model, using the openai library\n\n This class exposes the parameters in resources.Chat. To subclass this class:\n\n - Implement the `prepare_client` method to return the OpenAI client\n - Implement the `openai_response` method to return the OpenAI response\n - Implement the params relate to the OpenAI client\n \"\"\"\n\n _dependencies = [\"openai\"]\n _capabilities = [\"chat\", \"text\"] # consider as mixin\n\n api_key: str = Param(help=\"API key\", required=True)\n timeout: Optional[float] = Param(None, help=\"Timeout for the API request\")\n max_retries: Optional[int] = Param(\n None, help=\"Maximum number of retries for the API request\"\n )\n\n temperature: Optional[float] = Param(\n None,\n help=(\n \"Number between 0 and 2 that controls the randomness of the generated \"\n \"tokens. Lower values make the model more deterministic, while higher \"\n \"values make the model more random.\"\n ),\n )\n max_tokens: Optional[int] = Param(\n None,\n help=(\n \"Maximum number of tokens to generate. The total length of input tokens \"\n \"and generated tokens is limited by the model's context length.\"\n ),\n )\n n: int = Param(\n 1,\n help=(\n \"Number of completions to generate. The API will generate n completion \"\n \"for each prompt.\"\n ),\n )\n stop: Optional[str | list[str]] = Param(\n None,\n help=(\n \"Stop sequence. If a stop sequence is detected, generation will stop \"\n \"at that point. If not specified, generation will continue until the \"\n \"maximum token length is reached.\"\n ),\n )\n frequency_penalty: Optional[float] = Param(\n None,\n help=(\n \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n \"based on their existing frequency in the text so far, decrearsing the \"\n \"model's likelihood of repeating the same text.\"\n ),\n )\n presence_penalty: Optional[float] = Param(\n None,\n help=(\n \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n \"based on their existing presence in the text so far, decrearsing the \"\n \"model's likelihood of repeating the same text.\"\n ),\n )\n tool_choice: Optional[str] = Param(\n None,\n help=(\n \"Choice of tool to use for the completion. Available choices are: \"\n \"auto, default.\"\n ),\n )\n tools: Optional[list[str]] = Param(\n None,\n help=\"List of tools to use for the completion.\",\n )\n logprobs: Optional[bool] = Param(\n None,\n help=(\n \"Include log probabilities on the logprobs most likely tokens, \"\n \"as well as the chosen token.\"\n ),\n )\n logit_bias: Optional[dict] = Param(\n None,\n help=(\n \"Dictionary of logit bias values to add to the logits of the tokens \"\n \"in the vocabulary.\"\n ),\n )\n top_logprobs: Optional[int] = Param(\n None,\n help=(\n \"An integer between 0 and 5 specifying the number of most likely tokens \"\n \"to return at each token position, each with an associated log \"\n \"probability. `logprobs` must also be set to `true` if this parameter \"\n \"is used.\"\n ),\n )\n top_p: Optional[float] = Param(\n None,\n help=(\n \"An alternative to sampling with temperature, called nucleus sampling, \"\n \"where the model considers the results of the token with top_p \"\n \"probability mass. 
So 0.1 means that only the tokens comprising the \"\n \"top 10% probability mass are considered.\"\n ),\n )\n\n @Param.auto(depends_on=[\"max_retries\"])\n def max_retries_(self):\n if self.max_retries is None:\n from openai._constants import DEFAULT_MAX_RETRIES\n\n return DEFAULT_MAX_RETRIES\n return self.max_retries\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[\"ChatCompletionMessageParam\"]:\n \"\"\"Prepare the message into OpenAI format\n\n Returns:\n list[dict]: List of messages in OpenAI format\n \"\"\"\n input_: list[BaseMessage] = []\n output_: list[\"ChatCompletionMessageParam\"] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n for message in input_:\n output_.append(message.to_openai_format())\n\n return output_\n\n def prepare_output(self, resp: dict) -> LLMInterface:\n \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n additional_kwargs = {}\n if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n \"tool_calls\"\n ]\n\n if resp[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n logprobs = (\n [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n )\n\n output = LLMInterface(\n candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n total_tokens=resp[\"usage\"][\"total_tokens\"],\n prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n additional_kwargs=additional_kwargs,\n messages=[\n AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n for _ in resp[\"choices\"]\n ],\n logprobs=logprobs,\n )\n\n return output\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> LLMInterface:\n client = self.prepare_client(async_version=False)\n input_messages = self.prepare_message(messages)\n resp = self.openai_response(\n client, messages=input_messages, stream=False, **kwargs\n ).dict()\n return self.prepare_output(resp)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> LLMInterface:\n client = self.prepare_client(async_version=True)\n input_messages = self.prepare_message(messages)\n resp = await self.openai_response(\n client, messages=input_messages, stream=False, **kwargs\n ).dict()\n\n return self.prepare_output(resp)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> Iterator[LLMInterface]:\n client = self.prepare_client(async_version=False)\n input_messages = self.prepare_message(messages)\n resp = self.openai_response(\n client, messages=input_messages, stream=True, **kwargs\n )\n\n for c in resp:\n chunk = c.dict()\n if not chunk[\"choices\"]:\n continue\n if chunk[\"choices\"][0][\"delta\"][\"content\"] is not None:\n if chunk[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n logprobs = [\n 
logprob[\"logprob\"]\n for logprob in chunk[\"choices\"][0][\"logprobs\"].get(\n \"content\", []\n )\n ]\n\n yield LLMInterface(\n content=chunk[\"choices\"][0][\"delta\"][\"content\"], logprobs=logprobs\n )\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n client = self.prepare_client(async_version=True)\n input_messages = self.prepare_message(messages)\n resp = self.openai_response(\n client, messages=input_messages, stream=True, **kwargs\n )\n\n async for chunk in resp:\n if not chunk.choices:\n continue\n if chunk.choices[0].delta.content is not None:\n yield LLMInterface(content=chunk.choices[0].delta.content)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_message","title":"prepare_message","text":"prepare_message(messages)\n
Prepare the input messages in OpenAI format
Returns:
`list[ChatCompletionMessageParam]`: List of messages in OpenAI format
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n) -> list[\"ChatCompletionMessageParam\"]:\n \"\"\"Prepare the message into OpenAI format\n\n Returns:\n list[dict]: List of messages in OpenAI format\n \"\"\"\n input_: list[BaseMessage] = []\n output_: list[\"ChatCompletionMessageParam\"] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n for message in input_:\n output_.append(message.to_openai_format())\n\n return output_\n
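To make the normalization concrete, a minimal sketch (assuming `HumanMessage` and `ChatOpenAI` are exported from `kotaemon.base` and `kotaemon.llms` respectively; `prepare_message` itself makes no network call):

```python
from kotaemon.base import HumanMessage
from kotaemon.llms import ChatOpenAI

llm = ChatOpenAI(api_key="dummy", model="any")  # placeholders; nothing is called remotely

# All three input shapes normalize to the same OpenAI-format payload,
# e.g. [{"role": "user", "content": "Hello"}]
print(llm.prepare_message("Hello"))
print(llm.prepare_message(HumanMessage(content="Hello")))
print(llm.prepare_message([HumanMessage(content="Hello")]))
```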
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_output","title":"prepare_output","text":"prepare_output(resp)\n
Convert the OpenAI response into LLMInterface
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_output(self, resp: dict) -> LLMInterface:\n \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n additional_kwargs = {}\n if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n \"tool_calls\"\n ]\n\n if resp[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n logprobs = (\n [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n )\n\n output = LLMInterface(\n candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n total_tokens=resp[\"usage\"][\"total_tokens\"],\n prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n additional_kwargs=additional_kwargs,\n messages=[\n AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n for _ in resp[\"choices\"]\n ],\n logprobs=logprobs,\n )\n\n return output\n
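A hedged sketch of the mapping, using a hand-built response dict in the shape the method expects (only the keys it actually reads are shown):

```python
from kotaemon.llms import ChatOpenAI  # assuming the package-level export

llm = ChatOpenAI(api_key="dummy", model="any")  # no network call below

# Hypothetical chat-completion response, already converted from the SDK object
resp = {
    "choices": [{"message": {"content": "Hi there"}, "logprobs": None}],
    "usage": {"total_tokens": 12, "prompt_tokens": 8, "completion_tokens": 4},
}
out = llm.prepare_output(resp)
print(out.content)            # "Hi there"
print(out.completion_tokens)  # 4
print(out.logprobs)           # [] (logprobs was None)
```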
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
`async_version` (`bool`, default `False`): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the OpenAI response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
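A hedged usage sketch; the API key and model name are placeholders, and `base_url` can point at any OpenAI-compatible server:

```python
from kotaemon.llms import ChatOpenAI  # assuming the package-level export

llm = ChatOpenAI(
    api_key="sk-...",       # placeholder
    model="gpt-4o-mini",    # any model your endpoint serves
    temperature=0,
)
print(llm.invoke("Say hello").content)

# Streaming yields incremental LLMInterface chunks
for chunk in llm.stream("Count to three"):
    print(chunk.content, end="")
```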
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
`async_version` (`bool`, default `False`): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the OpenAI response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
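A similar hedged sketch for Azure; the endpoint, deployment name, and API version below are placeholders for your own resource:

```python
from kotaemon.llms import AzureChatOpenAI  # assuming the package-level export

llm = AzureChatOpenAI(
    azure_endpoint="https://<resource>.openai.azure.com/",  # placeholder
    azure_deployment="<deployment-name>",                   # placeholder
    api_version="2024-02-01",                               # any supported version
    api_key="...",                                          # placeholder
)
print(llm.invoke("Ping?").content)
```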
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
`async_version` (`bool`, default `False`): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the OpenAI response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/completions/","title":"Completions","text":""},{"location":"reference/llms/completions/#llms.completions.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
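A hedged construction sketch (all values are placeholders; the accepted parameters mirror the `__init__` signature above, and the underlying `langchain_openai` package must be installed):

```python
from kotaemon.llms import AzureOpenAI  # assuming the package-level export

llm = AzureOpenAI(
    azure_endpoint="https://<resource>.openai.azure.com/",  # placeholder
    deployment_name="<completion-deployment>",              # placeholder
    openai_api_version="2024-02-01",
    openai_api_key="...",                                   # placeholder
    max_tokens=256,
)
```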
"},{"location":"reference/llms/completions/#llms.completions.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
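A hedged construction sketch; the model path is a placeholder, and `n_gpu_layers=-1` follows the llama-cpp convention of offloading all layers to the GPU:

```python
from kotaemon.llms import LlamaCpp  # assuming the package-level export

llm = LlamaCpp(
    model_path="path/to/model.gguf",  # placeholder: local GGUF weights
    n_ctx=2048,                       # context window
    n_gpu_layers=-1,                  # offload all layers when a GPU is available
)
```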
"},{"location":"reference/llms/completions/#llms.completions.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
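The plain OpenAI counterpart, again a sketch with placeholder values:

```python
from kotaemon.llms import OpenAI  # assuming the package-level export

llm = OpenAI(
    openai_api_key="sk-...",              # placeholder
    model_name="gpt-3.5-turbo-instruct",  # any completion-capable model
    temperature=0.7,
    max_tokens=256,
)
```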
"},{"location":"reference/llms/completions/base/","title":"Base","text":""},{"location":"reference/llms/completions/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
"},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
"},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
"},{"location":"reference/llms/prompts/","title":"Prompts","text":""},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
`template` (`PromptTemplate`, required): The prompt template.
`**kwargs` (default `{}`): Any additional keyword arguments that will be used to populate the given template.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Raises:\n ValueError: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the 
object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
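A hedged usage sketch: placeholder values are supplied as keyword arguments, and `run` returns a `Document` whose text is the populated template:

```python
from kotaemon.llms import BasePromptComponent  # assuming the package-level export

prompt = BasePromptComponent(
    template="Hello {name}, you have {count} new messages."
)
# Note: per the source above, passing `template` through __init__ may emit a
# harmless redundant-keys UserWarning, since `template` is not a placeholder.
doc = prompt.run(name="Alice", count=3)
print(doc.text)  # "Hello Alice, you have 3 new messages."
```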
"},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set
but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
`kwargs` (`dict`, default `{}`): A dictionary with the attribute names as keys and the new values as values.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to pass to the function.
Returns:
The result of calling the `populate` method of the `template` object with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the 
concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
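A short sketch of the core API (assuming `PromptTemplate` is exported at the package level):

```python
from kotaemon.llms import PromptTemplate  # assuming the package-level export

tpl = PromptTemplate("Translate '{text}' into {language}.")
print(tpl.placeholders)  # {'text', 'language'} (a set; order may vary)
print(tpl.populate(text="good morning", language="French"))
# populate() raises ValueError if any placeholder is left unset
```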
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks that every placeholder in the template has a matching keyword argument. If any placeholders are missing, a `ValueError` is raised with the names of the missing keys.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check for provided keyword arguments that do not match any placeholder in the template.
This function compares the given keyword arguments against the template's placeholders. If any redundant keys are found, a `UserWarning` is emitted listing their names.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for provided keyword arguments that do not match any placeholder\n in the template.\n\n If any redundant keys are found, a `UserWarning` is emitted listing\n their names.\n\n Parameters:\n **kwargs: The keyword arguments to check against the template.\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
Returns:
`str`: The populated template.
Raises:
`ValueError`: If a placeholder in the template is missing from the keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
Returns:
`str`: The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
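A sketch of partial population; an unset placeholder is re-emitted rather than raising, carrying the empty format spec that the stdlib `Formatter.parse` reports:

```python
from kotaemon.llms import PromptTemplate  # assuming the package-level export

tpl = PromptTemplate("{subject} and {body}")
print(tpl.partial_populate(subject="Hello"))
# -> "Hello and {body:}" (the unset placeholder survives; the empty
#    format spec reported by Formatter.parse appears after the colon)
```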
"},{"location":"reference/llms/prompts/base/","title":"Base","text":""},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
`template` (`PromptTemplate`, required): The prompt template.
`**kwargs` (default `{}`): Any additional keyword arguments that will be used to populate the given template.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Raises:\n ValueError: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the 
object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
"},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set
but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
`kwargs` (`dict`, default `{}`): A dictionary with the attribute names as keys and the new values as values.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to pass to the function.
Returns:
The result of calling the `populate` method of the `template` object with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
"},{"location":"reference/llms/prompts/template/","title":"Template","text":""},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the 
concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks that every placeholder in the template has a matching keyword argument. If any placeholders are missing, a `ValueError` is raised with the names of the missing keys.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check for provided keyword arguments that do not match any placeholder in the template.
This function compares the given keyword arguments against the template's placeholders. If any redundant keys are found, a `UserWarning` is emitted listing their names.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for provided keyword arguments that do not match any placeholder\n in the template.\n\n If any redundant keys are found, a `UserWarning` is emitted listing\n their names.\n\n Parameters:\n **kwargs: The keyword arguments to check against the template.\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
Returns:
`str`: The populated template.
Raises:
`ValueError`: If a placeholder in the template is missing from the keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
`**kwargs` (default `{}`): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
Returns:
`str`: The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
"},{"location":"reference/loaders/","title":"Loaders","text":""},{"location":"reference/loaders/#loaders.AdobeReader","title":"AdobeReader","text":" Bases: BaseReader
Read PDFs using Adobe's PDF Services. It can extract text, tables, and figures with high accuracy.
Example:\n>> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Args: endpoint: URL to the Vision Language Model endpoint. If not provided, the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT is used.
max_figures_to_caption: an int that decides how many figures will be captioned.\nThe rest will be ignored (indexed without captions).\n
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n \"\"\"Read PDF using the Adobe's PDF Services.\n Be able to extract text, table, and figure with high accuracy\n\n Example:\n ```python\n >> from kotaemon.loaders import AdobeReader\n >> reader = AdobeReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n Args:\n endpoint: URL to the Vision Language Model endpoint. If not provided,\n will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n max_figures_to_caption: an int decides how many figured will be captioned.\n The rest will be ignored (are indexed without captions).\n \"\"\"\n\n def __init__(\n self,\n vlm_endpoint: Optional[str] = None,\n max_figures_to_caption: int = 100,\n *args: Any,\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params\"\"\"\n super().__init__(*args)\n self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n self.max_figures_to_caption = max_figures_to_caption\n\n def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/#loaders.AdobeReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, **kwargs)\n
Load data by calling Adobe's API
Parameters:
`file` (`Path`, required): Path to the PDF file
Returns:
`List[Document]`: list of documents extracted from the PDF file; includes 3 types: text, table, and image
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":" Bases: BaseReader
Utilize Azure AI Document Intelligence to parse documents
As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx and html.
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n    \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n    heif, docx, xlsx, pptx and html.\n    \"\"\"\n\n    _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n    endpoint: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n        help=\"Endpoint of Azure AI Document Intelligence\",\n    )\n    credential: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n        help=\"Credential of Azure AI Document Intelligence\",\n    )\n    model: str = Param(\n        \"prebuilt-layout\",\n        help=(\n            \"Model to use for document analysis. Default is prebuilt-layout. \"\n            \"As of April 24, you can view the supported models [here]\"\n            \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n            \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n            \"#model-analysis-features)\"\n        ),\n    )\n    output_content_format: str = Param(\n        \"markdown\",\n        help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n    )\n    vlm_endpoint: str = Param(\n        help=(\n            \"Default VLM endpoint for figure captioning. If not provided, will not \"\n            \"caption the figures\"\n        )\n    )\n    figure_friendly_filetypes: list[str] = Param(\n        [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n        help=(\n            \"File types that we can reliably open and extract figures. \"\n            \"For files like .docx or .html, the visual layout may be different \"\n            \"when viewed from different tools, hence we cannot use Azure DI \"\n            \"location to extract figures.\"\n        ),\n    )\n    cache_dir: str = Param(\n        None,\n        help=\"Directory to cache the downloaded files. Default is None\",\n    )\n\n    @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n    def client_(self):\n        try:\n            from azure.ai.documentintelligence import DocumentIntelligenceClient\n            from azure.core.credentials import AzureKeyCredential\n        except ImportError:\n            raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n        return DocumentIntelligenceClient(\n            self.endpoint, AzureKeyCredential(self.credential)\n        )\n\n    def run(\n        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n        metadata = extra_info or {}\n        file_name = Path(file_path)\n        with open(file_path, \"rb\") as fi:\n            poller = self.client_.begin_analyze_document(\n                self.model,\n                analyze_request=fi,\n                content_type=\"application/octet-stream\",\n                output_content_format=self.output_content_format,\n            )\n            result = poller.result()\n\n        # the total text content of the document in `output_content_format` format\n        text_content = result.content\n        removed_spans: list[dict] = []\n\n        # extract the figures\n        figures = []\n        for figure_desc in result.get(\"figures\", []):\n            if not self.vlm_endpoint:\n                continue\n            if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n                continue\n\n            # read & crop the image\n            page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n            page_width = result.pages[page_number - 1][\"width\"]\n            page_height = result.pages[page_number - 1][\"height\"]\n            polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n            xs = [polygon[i] for i in range(0, len(polygon), 2)]\n            ys = [polygon[i] for i in range(1, len(polygon), 2)]\n            bbox = [\n                min(xs) / page_width,\n                min(ys) / page_height,\n                max(xs) / page_width,\n                max(ys) / page_height,\n            ]\n            img = crop_image(file_path, bbox, page_number - 1)\n\n            # convert the image into base64\n            img_bytes = BytesIO()\n            img.save(img_bytes, format=\"PNG\")\n            img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n            img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n            # caption the image\n            caption = generate_single_figure_caption(\n                figure=img_base64, vlm_endpoint=self.vlm_endpoint\n            )\n\n            # store the image into document\n            figure_metadata = {\n                \"image_origin\": img_base64,\n                \"type\": \"image\",\n                \"page_label\": page_number,\n            }\n            figure_metadata.update(metadata)\n\n            figures.append(\n                Document(\n                    text=caption,\n                    metadata=figure_metadata,\n                )\n            )\n            removed_spans += figure_desc[\"spans\"]\n\n        # extract the tables\n        tables = []\n        for table_desc in result.get(\"tables\", []):\n            if not table_desc[\"spans\"]:\n                continue\n\n            # convert the tables into markdown format\n            boundingRegions = table_desc[\"boundingRegions\"]\n            if boundingRegions:\n                page_number = boundingRegions[0][\"pageNumber\"]\n            else:\n                page_number = 1\n\n            # store the tables into document\n            offset = table_desc[\"spans\"][0][\"offset\"]\n            length = table_desc[\"spans\"][0][\"length\"]\n            table_metadata = {\n                \"type\": \"table\",\n                \"page_label\": page_number,\n                \"table_origin\": text_content[offset : offset + length],\n            }\n            table_metadata.update(metadata)\n\n            tables.append(\n                Document(\n                    text=text_content[offset : offset + length],\n                    metadata=table_metadata,\n                )\n            )\n            removed_spans += table_desc[\"spans\"]\n        # save the text content into markdown format\n        if self.cache_dir is not None:\n            with open(\n                Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n            ) as f:\n                f.write(text_content)\n\n        removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n        for span in removed_spans:\n            text_content = (\n                text_content[: span[\"offset\"]]\n                + text_content[span[\"offset\"] + span[\"length\"] :]\n            )\n\n        return [Document(content=text_content, metadata=metadata)] + figures + tables\n
"},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Extract the input file, allowing multi-modal extraction
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = [polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
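A hedged usage sketch; it assumes the loader is exported from `kotaemon.loaders` and that `Param` fields can be set as constructor keywords, as is usual for kotaemon components (endpoint and key values are placeholders):

```python
from kotaemon.loaders import AzureAIDocumentIntelligenceLoader  # assumed export

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<api-key>",                                      # placeholder
    output_content_format="markdown",
    cache_dir="./di_cache",  # optional: also writes <stem>.md here
    # vlm_endpoint="...",    # optional: enables figure captioning
)
docs = loader.run("contract.pdf")
full_text = docs[0]  # document text with figure/table spans removed
extras = docs[1:]    # figure and table Documents, when any were extracted
```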
"},{"location":"reference/loaders/#loaders.AutoReader","title":"AutoReader","text":" Bases: BaseReader
General auto reader for a variety of files (based on llama-hub)
Source code in libs/kotaemon/kotaemon/loaders/base.py
class AutoReader(BaseReader):\n \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n if isinstance(reader_type, str):\n from llama_index.core import download_loader\n\n self._reader = download_loader(reader_type)()\n else:\n self._reader = reader_type()\n super().__init__()\n\n def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(file=file, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n return self.load_data(file=file, **kwargs)\n
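A hedged sketch of both construction modes; the loader name string is illustrative, and any llama-hub reader identifier or llama-index reader class should work:

```python
from kotaemon.loaders import AutoReader  # assumed export

# 1) by llama-hub string identifier (downloaded on first use)
reader = AutoReader("UnstructuredReader")  # illustrative identifier
docs = reader.run("notes.pdf")

# 2) by passing a llama-index reader class directly
from llama_index.readers.file import PDFReader  # requires llama-index-readers-file

docs = AutoReader(PDFReader).run("notes.pdf")
```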
"},{"location":"reference/loaders/#loaders.BaseReader","title":"BaseReader","text":" Bases: BaseComponent
The base class for all readers
Source code in libs/kotaemon/kotaemon/loaders/base.py
class BaseReader(BaseComponent):\n \"\"\"The base class for all readers\"\"\"\n\n ...\n
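Since `BaseReader` only pins down the component interface, a custom loader is a small subclass. A hedged sketch follows; the `Document` import path and constructor usage mirror the loaders below and are assumptions:

```python
from pathlib import Path
from typing import Any, List

from kotaemon.base import Document       # assumed import path
from kotaemon.loaders import BaseReader  # assumed export

class TxtReader(BaseReader):
    """Hypothetical plain-text reader: one Document per file."""

    def load_data(self, file: Path, **kwargs: Any) -> List[Document]:
        text = Path(file).read_text(encoding="utf-8")
        return [Document(text=text, metadata={"file_name": Path(file).name})]

    def run(self, file: Path, **kwargs: Any) -> List[Document]:
        return self.load_data(file, **kwargs)
```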
"},{"location":"reference/loaders/#loaders.DirectoryReader","title":"DirectoryReader","text":" Bases: LIReaderMixin
, BaseReader
Wrapper around llama-index `SimpleDirectoryReader`
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_dir` | `str` | Path to the directory. | required |
| `input_files` | `List` | List of file paths to read (Optional; overrides `input_dir`, `exclude`) | required |
| `exclude` | `List` | glob of python file paths to exclude (Optional) | required |
| `exclude_hidden` | `bool` | Whether to exclude hidden files (dotfiles). | required |
| `encoding` | `str` | Encoding of the files. Default is utf-8. | required |
| `errors` | `str` | how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open | required |
| `recursive` | `bool` | Whether to recursively search in subdirectories. False by default. | required |
| `filename_as_id` | `bool` | Whether to use the filename as the document id. False by default. | required |
| `required_exts` | `Optional[List[str]]` | List of required extensions. Default is None. | required |
| `file_extractor` | `Optional[Dict[str, BaseReader]]` | A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS. | required |
| `num_files_limit` | `Optional[int]` | Maximum number of files to read. Default is None. | required |
| `file_metadata` | `Optional[Callable[str, Dict]]` | A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. | required |

Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
class DirectoryReader(LIReaderMixin, BaseReader):\n \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n Args:\n input_dir (str): Path to the directory.\n input_files (List): List of file paths to read\n (Optional; overrides input_dir, exclude)\n exclude (List): glob of python file paths to exclude (Optional)\n exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n encoding (str): Encoding of the files.\n Default is utf-8.\n errors (str): how encoding and decoding errors are to be handled,\n see https://docs.python.org/3/library/functions.html#open\n recursive (bool): Whether to recursively search in subdirectories.\n False by default.\n filename_as_id (bool): Whether to use the filename as the document id.\n False by default.\n required_exts (Optional[List[str]]): List of required extensions.\n Default is None.\n file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n extension to a BaseReader class that specifies how to convert that file\n to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n num_files_limit (Optional[int]): Maximum number of files to read.\n Default is None.\n file_metadata (Optional[Callable[str, Dict]]): A function that takes\n in a filename and returns a Dict of metadata for the Document.\n Default is None.\n \"\"\"\n\n input_dir: Optional[str] = None\n input_files: Optional[List] = None\n exclude: Optional[List] = None\n exclude_hidden: bool = True\n errors: str = \"ignore\"\n recursive: bool = False\n encoding: str = \"utf-8\"\n filename_as_id: bool = False\n required_exts: Optional[list[str]] = None\n file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n num_files_limit: Optional[int] = None\n file_metadata: Optional[Callable[[str], dict]] = None\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n from llama_index.core import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n
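A hedged usage sketch, assuming the `LIReaderMixin` forwards `run()` to the wrapped `SimpleDirectoryReader`:

```python
from kotaemon.loaders import DirectoryReader  # assumed export

reader = DirectoryReader(
    input_dir="./docs",
    recursive=True,
    required_exts=[".md", ".txt"],
    exclude_hidden=True,
)
docs = reader.run()  # one Document per loaded file/page
```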
"},{"location":"reference/loaders/#loaders.DocxReader","title":"DocxReader","text":" Bases: BaseReader
Read Docx files, respecting tables, using the python-docx library

Reader behavior:
- All paragraphs are extracted as a Document
- Each table is extracted as a Document, rendered as a CSV string
- The output is a list of Documents, concatenating the above (tables + paragraphs)

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
class DocxReader(BaseReader):\n \"\"\"Read Docx files that respect table, using python-docx library\n\n Reader behavior:\n - All paragraphs are extracted as a Document\n - Each table is extracted as a Document, rendered as a CSV string\n - The output is a list of Documents, concatenating the above\n (tables + paragraphs)\n \"\"\"\n\n def __init__(self, *args, **kwargs):\n try:\n import docx # noqa\n except ImportError:\n raise ImportError(\n \"docx is not installed. \"\n \"Please install it using `pip install python-docx`\"\n )\n\n def _load_single_table(self, table) -> List[List[str]]:\n \"\"\"Extract content from tables. Return a list of columns: list[str]\n Some merged cells will share duplicated content.\n \"\"\"\n n_row = len(table.rows)\n n_col = len(table.columns)\n\n arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n for i, row in enumerate(table.rows):\n for j, cell in enumerate(row.cells):\n arrays[j][i] = cell.text\n\n return arrays\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.DocxReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Docx reader
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path` | Path to .docx file | required |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | list of documents extracted from the .docx file |

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
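A minimal sketch, assuming the reader is exported from `kotaemon.loaders` (requires `pip install python-docx`):

```python
from pathlib import Path

from kotaemon.loaders import DocxReader  # assumed export

docs = DocxReader().load_data(Path("spec.docx"))

# tables arrive as CSV-rendered Documents, the remaining text as one Document
tables = [d for d in docs if d.metadata.get("type") == "table"]
body = [d for d in docs if d.metadata.get("type") != "table"]
```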
"},{"location":"reference/loaders/#loaders.ExcelReader","title":"ExcelReader","text":" Bases: BaseReader
Spreadsheet reader that respects multiple worksheets

Parses Excel files with the Pandas `read_excel` function. If special parameters are required, use the `pandas_config` dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class ExcelReader(BaseReader):\n r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
"},{"location":"reference/loaders/#loaders.ExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=True,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the Excel file and extract values from its worksheets.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Path` | The path to the Excel file to read. | required |
| `include_sheetname` | `bool` | Whether to include the sheet name in the output. | `True` |
| `sheet_name` | `Union[str, int, None]` | The specific sheet to read from, default is None which reads all sheets. | `None` |
Returns:
| Type | Description |
| --- | --- |
| `List[Document]` | A list of `Document` objects containing the values extracted from the Excel file |

Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
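A hedged sketch showing the per-sheet output (`pandas_config` is forwarded to `pandas.read_excel`):

```python
from pathlib import Path

from kotaemon.loaders import ExcelReader  # assumed export

reader = ExcelReader(pandas_config={"header": 0})
docs = reader.load_data(Path("metrics.xlsx"), include_sheetname=True)

for doc in docs:  # one Document per worksheet
    print(doc.metadata["sheet_name"], doc.metadata["page_label"])
```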
"},{"location":"reference/loaders/#loaders.PandasExcelReader","title":"PandasExcelReader","text":" Bases: BaseReader
Pandas-based Excel parser.

Parses Excel files with the Pandas `read_excel` function. If special parameters are required, use the `pandas_config` dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class PandasExcelReader(BaseReader):\n r\"\"\"Pandas-based CSV parser.\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
"},{"location":"reference/loaders/#loaders.PandasExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=False,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the Excel file and extract values from its worksheets.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Path` | The path to the Excel file to read. | required |
| `include_sheetname` | `bool` | Whether to include the sheet name in the output. | `False` |
| `sheet_name` | `Union[str, int, None]` | The specific sheet to read from, default is None which reads all sheets. | `None` |
Returns:
| Type | Description |
| --- | --- |
| `List[Document]` | A list of `Document` objects containing the values extracted from the Excel file |

Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
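In contrast to `ExcelReader` above, this reader flattens all selected sheets into a single `Document`; a hedged sketch:

```python
from pathlib import Path

from kotaemon.loaders import PandasExcelReader  # assumed export

reader = PandasExcelReader(row_joiner="\n", col_joiner=" ")
docs = reader.load_data(Path("metrics.xlsx"), sheet_name="Q1")  # sheet name illustrative
assert len(docs) == 1  # all rows of the selected sheet(s) joined into one Document
```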
"},{"location":"reference/loaders/#loaders.HtmlReader","title":"HtmlReader","text":" Bases: BaseReader
Read HTML using html2text

Reader behavior:
- HTML is read with html2text.
- All of the texts will be split by `page_break_pattern`
- Each page is extracted as a Document
- The output is a list of Documents
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `page_break_pattern` | `str` | Pattern to split the HTML into pages | `None` |
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class HtmlReader(BaseReader):\n \"\"\"Reader HTML usimg html2text\n\n Reader behavior:\n - HTML is read with html2text.\n - All of the texts will be split by `page_break_pattern`\n - Each page is extracted as a Document\n - The output is a list of Documents\n\n Args:\n page_break_pattern (str): Pattern to split the HTML into pages\n \"\"\"\n\n def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n try:\n import html2text # noqa\n except ImportError:\n raise ImportError(\n \"html2text is not installed. \"\n \"Please install it using `pip install html2text`\"\n )\n\n self._page_break_pattern: Optional[str] = page_break_pattern\n super().__init__()\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
"},{"location":"reference/loaders/#loaders.HtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Html reader
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path \| str` | path to HTML file | required |
| `extra_info` | `Optional[dict]` | extra information passed to this reader during extracting data | `None` |
Returns:
| Type | Description |
| --- | --- |
| `list[Document]` | list of documents extracted from the HTML file |

Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
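A hedged sketch (requires `pip install html2text`); the break pattern is illustrative and is matched against the converted text, not the raw HTML:

```python
from kotaemon.loaders import HtmlReader  # assumed export

reader = HtmlReader(page_break_pattern="\n---\n")  # illustrative pattern
docs = reader.load_data("manual.html")
print([d.metadata["page_label"] for d in docs])  # 1, 2, ... per split page
```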
"},{"location":"reference/loaders/#loaders.MhtmlReader","title":"MhtmlReader","text":" Bases: BaseReader
Parse `MHTML` files with `BeautifulSoup`.

Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class MhtmlReader(BaseReader):\n \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n def __init__(\n self,\n cache_dir: Optional[str] = getattr(\n flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n ),\n open_encoding: Optional[str] = None,\n bs_kwargs: Optional[dict] = None,\n get_text_separator: str = \"\",\n ) -> None:\n \"\"\"initialize with path, and optionally, file encoding to use, and any kwargs\n to pass to the BeautifulSoup object.\n\n Args:\n cache_dir: Path for markdwon format.\n file_path: Path to file to load.\n open_encoding: The encoding to use when opening the file.\n bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n get_text_separator: The separator to use when getting the text\n from the soup.\n \"\"\"\n try:\n import bs4 # noqa:F401\n except ImportError:\n raise ImportError(\n \"beautifulsoup4 package not found, please install it with \"\n \"`pip install beautifulsoup4`\"\n )\n\n self.cache_dir = cache_dir\n self.open_encoding = open_encoding\n if bs_kwargs is None:\n bs_kwargs = {\"features\": \"lxml\"}\n self.bs_kwargs = bs_kwargs\n self.get_text_separator = get_text_separator\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
"},{"location":"reference/loaders/#loaders.MhtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load MHTML document into document objects.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
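A hedged sketch (requires `pip install beautifulsoup4`); when `cache_dir` is not given it falls back to `KH_MARKDOWN_OUTPUT_DIR` from flowsettings:

```python
from kotaemon.loaders import MhtmlReader  # assumed export

reader = MhtmlReader(cache_dir="./markdown_cache")  # also writes <stem>.md here
docs = reader.load_data("snapshot.mhtml")
print(docs[0].metadata["title"], docs[0].metadata["source"])
```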
"},{"location":"reference/loaders/#loaders.MathpixPDFReader","title":"MathpixPDFReader","text":" Bases: BaseReader
Load `PDF` files using `Mathpix` service.

Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n    \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n    def __init__(\n        self,\n        processed_file_format: str = \"md\",\n        max_wait_time_seconds: int = 500,\n        should_clean_pdf: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize with a file path.\n\n        Args:\n            processed_file_format: a format of the processed file. Default is \"mmd\".\n            max_wait_time_seconds: a maximum time to wait for the response from\n                the server. Default is 500.\n            should_clean_pdf: a flag to clean the PDF file. Default is False.\n            **kwargs: additional keyword arguments.\n        \"\"\"\n        self.mathpix_api_key = get_from_dict_or_env(\n            kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n        )\n        self.mathpix_api_id = get_from_dict_or_env(\n            kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n        )\n        self.processed_file_format = processed_file_format\n        self.max_wait_time_seconds = max_wait_time_seconds\n        self.should_clean_pdf = should_clean_pdf\n        super().__init__()\n\n    @property\n    def _mathpix_headers(self) -> Dict[str, str]:\n        return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n    @property\n    def url(self) -> str:\n        return \"https://api.mathpix.com/v3/pdf\"\n\n    @property\n    def data(self) -> dict:\n        options = {\n            \"conversion_formats\": {self.processed_file_format: True},\n            \"enable_tables_fallback\": True,\n        }\n        return {\"options_json\": json.dumps(options)}\n\n    def send_pdf(self, file_path) -> str:\n        with open(file_path, \"rb\") as f:\n            files = {\"file\": f}\n            response = requests.post(\n                self.url, headers=self._mathpix_headers, files=files, data=self.data\n            )\n        response_data = response.json()\n        if \"pdf_id\" in response_data:\n            pdf_id = response_data[\"pdf_id\"]\n            return pdf_id\n        else:\n            raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n    def wait_for_processing(self, pdf_id: str) -> None:\n        \"\"\"Wait for processing to complete.\n\n        Args:\n            pdf_id: a PDF id.\n\n        Returns: None\n        \"\"\"\n        url = self.url + \"/\" + pdf_id\n        for _ in range(0, self.max_wait_time_seconds, 5):\n            response = requests.get(url, headers=self._mathpix_headers)\n            response_data = response.json()\n            status = response_data.get(\"status\", None)\n\n            if status == \"completed\":\n                return\n            elif status == \"error\":\n                raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n            else:\n                print(response_data)\n                print(url)\n                time.sleep(5)\n        raise TimeoutError\n\n    def get_processed_pdf(self, pdf_id: str) -> str:\n        self.wait_for_processing(pdf_id)\n        url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n        response = requests.get(url, headers=self._mathpix_headers)\n        return response.content.decode(\"utf-8\")\n\n    def clean_pdf(self, contents: str) -> str:\n        \"\"\"Clean the PDF file.\n\n        Args:\n            contents: a PDF file contents.\n\n        Returns:\n\n        \"\"\"\n        contents = \"\\n\".join(\n            [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n        )\n        # replace \\section{Title} with # Title\n        contents = contents.replace(\"\\\\section{\", \"# \")\n        # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n        # http:// or https:// followed by anything but a closing paren\n        url_regex = \"http[s]?://[^)]+\"\n        markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n        contents = (\n            contents.replace(r\"\\$\", \"$\")\n            .replace(r\"\\%\", \"%\")\n            .replace(r\"\\(\", \"(\")\n            .replace(r\"\\)\", \")\")\n            .replace(\"$\\\\begin{array}\", \"\")\n            .replace(\"\\\\end{array}$\", \"\")\n            .replace(\"\\\\\\\\\", \"\")\n            .replace(\"\\\\text\", \"\")\n            .replace(\"}\", \"\")\n            .replace(\"{\", \"\")\n            .replace(\"\\\\mathrm\", \"\")\n        )\n        contents = re.sub(markup_regex, \"\", contents)\n        return contents\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            content = kwargs[\"response_content\"]\n        else:\n            # call original API\n            pdf_id = self.send_pdf(file_path)\n            content = self.get_processed_pdf(pdf_id)\n\n        if self.should_clean_pdf:\n            content = self.clean_pdf(content)\n        tables, texts = parse_markdown_text_to_tables(content)\n        documents = []\n        for table in tables:\n            text = strip_special_chars_markdown(table)\n            metadata = {\n                \"table_origin\": table,\n                \"type\": \"table\",\n            }\n            if extra_info:\n                metadata.update(extra_info)\n            documents.append(\n                Document(\n                    text=text,\n                    metadata=metadata,\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        for text in texts:\n            metadata = {\"source\": file_path.name, \"type\": \"text\"}\n            documents.append(Document(text=text, metadata=metadata))\n\n        return documents\n
"},{"location":"reference/loaders/#loaders.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"wait_for_processing(pdf_id)\n
Wait for processing to complete.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pdf_id` | `str` | a PDF id. | required |

Returns: None

Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n
"},{"location":"reference/loaders/#loaders.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"clean_pdf(contents)\n
Clean the PDF file.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `contents` | `str` | a PDF file contents. | required |

Returns: the cleaned contents (`str`)

Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n
"},{"location":"reference/loaders/#loaders.ImageReader","title":"ImageReader","text":" Bases: BaseReader
Read PDF using OCR, with a strong focus on table extraction
Example>> from knowledgehub.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `endpoint` | `Optional[str]` | URL to FullOCR endpoint. If not provided, will look for environment variable `OCR_READER_ENDPOINT` or use the default `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT` (http://127.0.0.1:8000/v2/ai/infer/) | `None` |
| `use_ocr` |  | whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. | required |

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class ImageReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from knowledgehub.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/#loaders.ImageReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path` | Path to PDF file | required |
| `debug_path` | `Path` | Path to store debug image output | required |
| `artifact_path` | `Path` | Path to OCR endpoints artifacts directory | required |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | list of documents extracted from the PDF file |

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/#loaders.OCRReader","title":"OCRReader","text":" Bases: BaseReader
Read PDF using OCR, with a strong focus on table extraction
Example>> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `endpoint` | `Optional[str]` | URL to FullOCR endpoint. If not provided, will look for environment variable `OCR_READER_ENDPOINT` or use the default `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT` (http://127.0.0.1:8000/v2/ai/infer/) | `None` |
| `use_ocr` |  | whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. | `True` |
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class OCRReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from kotaemon.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n self.use_ocr = use_ocr\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.OCRReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path` | Path to PDF file | required |
| `debug_path` | `Path` | Path to store debug image output | required |
| `artifact_path` | `Path` | Path to OCR endpoints artifacts directory | required |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | list of documents extracted from the PDF file |

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
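A hedged sketch, including the `response_content` override that both OCR readers honor for replaying a saved API response:

```python
from pathlib import Path

from kotaemon.loaders import OCRReader

reader = OCRReader(endpoint="http://127.0.0.1:8000/v2/ai/infer/", use_ocr=True)
docs = reader.load_data(Path("scan.pdf"))

# replay a previously saved resp.json()["result"] to skip the HTTP call
saved_result: list = []  # placeholder for a cached result payload
docs = reader.load_data(Path("scan.pdf"), response_content=saved_result)
```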
"},{"location":"reference/loaders/#loaders.PDFThumbnailReader","title":"PDFThumbnailReader","text":" Bases: PDFReader
PDF parser that also generates a thumbnail for each page.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
class PDFThumbnailReader(PDFReader):\n \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"\n Initialize PDFReader.\n \"\"\"\n super().__init__(return_full_document=False)\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n ) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.PDFThumbnailReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, fs=None)\n
Parse file.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
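A hedged sketch of separating page text from the generated thumbnails:

```python
from pathlib import Path

from kotaemon.loaders import PDFThumbnailReader  # assumed export

docs = PDFThumbnailReader().load_data(Path("slides.pdf"))
pages = [d for d in docs if d.metadata.get("type") != "thumbnail"]
thumbs = [d for d in docs if d.metadata.get("type") == "thumbnail"]
# each thumbnail keeps the rendered page image in metadata["image_origin"]
```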
"},{"location":"reference/loaders/#loaders.UnstructuredReader","title":"UnstructuredReader","text":" Bases: BaseReader
General unstructured text reader for a variety of files.
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n def __init__(self, *args: Any, **kwargs: Any) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args) # not passing kwargs to parent bc it cannot accept it\n\n self.api = False # we default to local\n if \"url\" in kwargs:\n self.server_url = str(kwargs[\"url\"])\n self.api = True # is url was set, switch to api\n else:\n self.server_url = \"http://localhost:8000\"\n\n if \"api\" in kwargs:\n self.api = kwargs[\"api\"]\n\n self.api_key = \"\"\n if \"api_key\" in kwargs:\n self.api_key = kwargs[\"api_key\"]\n\n \"\"\" Loads data using Unstructured.io\n\n Depending on the construction if url is set or api = True\n it'll parse file using API call, else parse it locally\n additional_metadata is extended by the returned metadata if\n split_documents is True\n\n Returns list of documents\n \"\"\"\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n ) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
"},{"location":"reference/loaders/#loaders.UnstructuredReader.load_data","title":"load_data","text":"load_data(\n file, extra_info=None, split_documents=False, **kwargs\n)\n
If api is set, parse through api
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
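The constructor kwargs above decide between local and hosted parsing. A sketch of both modes; the URL and key are placeholders.

```python
from kotaemon.loaders import UnstructuredReader

# local parsing (default): unstructured's partition() runs on this machine
reader = UnstructuredReader()
docs = reader.load_data("notes.pdf", split_documents=True)  # placeholder file

# hosted parsing: passing url switches the reader into API mode
api_reader = UnstructuredReader(url="http://localhost:8000", api_key="<key>")
api_docs = api_reader.load_data("notes.pdf")
```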
"},{"location":"reference/loaders/adobe_loader/","title":"Adobe Loader","text":""},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader","title":"AdobeReader","text":" Bases: BaseReader
Read PDF files using Adobe's PDF Services. It can extract text, tables, and figures with high accuracy
Example>> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Args: vlm_endpoint: URL of the Vision Language Model endpoint. If not provided, the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT is used
max_figures_to_caption: an int that decides how many figures will be captioned.
The rest will be ignored (indexed without captions).
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n \"\"\"Read PDF using the Adobe's PDF Services.\n Be able to extract text, table, and figure with high accuracy\n\n Example:\n ```python\n >> from kotaemon.loaders import AdobeReader\n >> reader = AdobeReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n Args:\n endpoint: URL to the Vision Language Model endpoint. If not provided,\n will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n max_figures_to_caption: an int decides how many figured will be captioned.\n The rest will be ignored (are indexed without captions).\n \"\"\"\n\n def __init__(\n self,\n vlm_endpoint: Optional[str] = None,\n max_figures_to_caption: int = 100,\n *args: Any,\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params\"\"\"\n super().__init__(*args)\n self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n self.max_figures_to_caption = max_figures_to_caption\n\n def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
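Beyond the docstring example, the two constructor knobs can be set explicitly. A sketch with placeholder values:

```python
from kotaemon.loaders import AdobeReader

# placeholder endpoint; falls back to DEFAULT_VLM_ENDPOINT when omitted
reader = AdobeReader(
    vlm_endpoint="http://localhost:8001/caption",
    max_figures_to_caption=10,  # remaining figures are indexed without captions
)
docs = reader.load_data("paper.pdf")  # placeholder file
figures = [d for d in docs if d.metadata.get("type") == "image"]
```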
"},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, **kwargs)\n
Load data by calling Adobe's API
Parameters:
Name Type Description Defaultfile
Path
Path to the PDF file
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the PDF file, including 3 types: text, table, and image
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/azureai_document_intelligence_loader/","title":"Azureai Document Intelligence Loader","text":""},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":" Bases: BaseReader
Utilizes Azure AI Document Intelligence to parse documents
As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx, and html.
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n heif, docx, xlsx, pptx and html.\n \"\"\"\n\n _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n endpoint: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n help=\"Endpoint of Azure AI Document Intelligence\",\n )\n credential: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n help=\"Credential of Azure AI Document Intelligence\",\n )\n model: str = Param(\n \"prebuilt-layout\",\n help=(\n \"Model to use for document analysis. Default is prebuilt-layout. \"\n \"As of April 24, you can view the supported models [here]\"\n \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n \"#model-analysis-features)\"\n ),\n )\n output_content_format: str = Param(\n \"markdown\",\n help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n )\n vlm_endpoint: str = Param(\n help=(\n \"Default VLM endpoint for figure captioning. If not provided, will not \"\n \"caption the figures\"\n )\n )\n figure_friendly_filetypes: list[str] = Param(\n [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n help=(\n \"File types that we can reliably open and extract figures. \"\n \"For files like .docx or .html, the visual layout may be different \"\n \"when viewed from different tools, hence we cannot use Azure DI \"\n \"location to extract figures.\"\n ),\n )\n cache_dir: str = Param(\n None,\n help=\"Directory to cache the downloaded files. Default is None\",\n )\n\n @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n def client_(self):\n try:\n from azure.ai.documentintelligence import DocumentIntelligenceClient\n from azure.core.credentials import AzureKeyCredential\n except ImportError:\n raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n return DocumentIntelligenceClient(\n self.endpoint, AzureKeyCredential(self.credential)\n )\n\n def run(\n self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = 
[polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
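A configuration sketch, assuming the loader accepts its Param fields as keyword arguments like other kotaemon components; endpoint and credential values are placeholders (they can also come from the AZUREAI_DOCUMENT_INTELLIGENT_* environment variables shown above).

```python
from pathlib import Path

from kotaemon.loaders import AzureAIDocumentIntelligenceLoader

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<api-key>",  # placeholder
    output_content_format="markdown",
)
docs = loader.load_data(Path("contract.pdf"))  # placeholder file
```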
"},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Extract the input file, allowing multi-modal extraction
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = [polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
"},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.crop_image","title":"crop_image","text":"crop_image(file_path, bbox, page_number=0)\n
Crop the image based on the bounding box
Parameters:
Name Type Description Defaultfile_path
Path
path to the image file
requiredbbox
list[float]
bounding box of the image (in percentage [x0, y0, x1, y1])
requiredpage_number
int
page number of the image. Defaults to 0.
0
Returns:
Type DescriptionImage
Image.Image: cropped image
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:\n \"\"\"Crop the image based on the bounding box\n\n Args:\n file_path (Path): path to the image file\n bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1])\n page_number (int, optional): page number of the image. Defaults to 0.\n\n Returns:\n Image.Image: cropped image\n \"\"\"\n left, upper, right, lower = bbox\n\n img: Image.Image\n suffix = file_path.suffix.lower()\n if suffix == \".pdf\":\n try:\n import fitz\n except ImportError:\n raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n doc = fitz.open(file_path)\n page = doc.load_page(page_number)\n pm = page.get_pixmap(dpi=150)\n img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n elif suffix in [\".tif\", \".tiff\"]:\n img = Image.open(file_path)\n img.seek(page_number)\n else:\n img = Image.open(file_path)\n\n return img.crop(\n (\n int(left * img.width),\n int(upper * img.height),\n int(right * img.width),\n int(lower * img.height),\n )\n )\n
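A usage sketch of the helper above: the bbox coordinates are fractions of the page size and page_number is zero-based; the file names are placeholders.

```python
from pathlib import Path

# crop the central quarter of the third page of a PDF
img = crop_image(Path("scan.pdf"), bbox=[0.25, 0.25, 0.75, 0.75], page_number=2)
img.save("figure.png")
```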
"},{"location":"reference/loaders/base/","title":"Base","text":""},{"location":"reference/loaders/base/#loaders.base.BaseReader","title":"BaseReader","text":" Bases: BaseComponent
The base class for all readers
Source code in libs/kotaemon/kotaemon/loaders/base.py
class BaseReader(BaseComponent):\n \"\"\"The base class for all readers\"\"\"\n\n ...\n
"},{"location":"reference/loaders/base/#loaders.base.AutoReader","title":"AutoReader","text":" Bases: BaseReader
General auto reader for a variety of files (based on llama-hub).
Source code in libs/kotaemon/kotaemon/loaders/base.py
class AutoReader(BaseReader):\n \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n if isinstance(reader_type, str):\n from llama_index.core import download_loader\n\n self._reader = download_loader(reader_type)()\n else:\n self._reader = reader_type()\n super().__init__()\n\n def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(file=file, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n return self.load_data(file=file, **kwargs)\n
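A sketch of the string-identifier construction style; "PDFReader" stands in for any llama-hub loader identifier, and the file name is a placeholder.

```python
from kotaemon.loaders import AutoReader

# build by llama-hub string identifier (downloaded on first use),
# or pass a reader class directly instead of a string
reader = AutoReader("PDFReader")
docs = reader.load_data(file="sample.pdf")  # placeholder file
```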
"},{"location":"reference/loaders/base/#loaders.base.LIReaderMixin","title":"LIReaderMixin","text":" Bases: BaseComponent
Base wrapper around llama-index reader
To use the LIBaseReader, you need to implement the _get_wrapped_class method to return the relevant llama-index reader class that you want to wrap.
Example:
```python\nclass DirectoryReader(LIBaseReader):\n def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n from llama_index import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n```\n
Source code in libs/kotaemon/kotaemon/loaders/base.py
class LIReaderMixin(BaseComponent):\n \"\"\"Base wrapper around llama-index reader\n\n To use the LIBaseReader, you need to implement the _get_wrapped_class method to\n return the relevant llama-index reader class that you want to wrap.\n\n Example:\n\n ```python\n class DirectoryReader(LIBaseReader):\n def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n from llama_index import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n ```\n \"\"\"\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n raise NotImplementedError(\n \"Please return the relevant llama-index class in in _get_wrapped_class\"\n )\n\n def __init__(self, *args, **kwargs):\n self._reader_class = self._get_wrapped_class()\n self._reader = self._reader_class(*args, **kwargs)\n super().__init__()\n\n def __setattr__(self, name: str, value: Any) -> None:\n if name.startswith(\"_\"):\n return super().__setattr__(name, value)\n\n return setattr(self._reader, name, value)\n\n def __getattr__(self, name: str) -> Any:\n return getattr(self._reader, name)\n\n def load_data(self, *args, **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(*args, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, *args, **kwargs: Any) -> List[Document]:\n return self.load_data(*args, **kwargs)\n
"},{"location":"reference/loaders/composite_loader/","title":"Composite Loader","text":""},{"location":"reference/loaders/composite_loader/#loaders.composite_loader.DirectoryReader","title":"DirectoryReader","text":" Bases: LIReaderMixin
, BaseReader
Wrap around llama-index SimpleDirectoryReader
Parameters:
Name Type Description Defaultinput_dir
str
Path to the directory.
requiredinput_files
List
List of file paths to read (Optional; overrides input_dir, exclude)
requiredexclude
List
glob of python file paths to exclude (Optional)
requiredexclude_hidden
bool
Whether to exclude hidden files (dotfiles).
requiredencoding
str
Encoding of the files. Default is utf-8.
requirederrors
str
how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open
requiredrecursive
bool
Whether to recursively search in subdirectories. False by default.
requiredfilename_as_id
bool
Whether to use the filename as the document id. False by default.
requiredrequired_exts
Optional[List[str]]
List of required extensions. Default is None.
requiredfile_extractor
Optional[Dict[str, BaseReader]]
A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
requirednum_files_limit
Optional[int]
Maximum number of files to read. Default is None.
requiredfile_metadata
Optional[Callable[str, Dict]]
A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.
required Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
class DirectoryReader(LIReaderMixin, BaseReader):\n \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n Args:\n input_dir (str): Path to the directory.\n input_files (List): List of file paths to read\n (Optional; overrides input_dir, exclude)\n exclude (List): glob of python file paths to exclude (Optional)\n exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n encoding (str): Encoding of the files.\n Default is utf-8.\n errors (str): how encoding and decoding errors are to be handled,\n see https://docs.python.org/3/library/functions.html#open\n recursive (bool): Whether to recursively search in subdirectories.\n False by default.\n filename_as_id (bool): Whether to use the filename as the document id.\n False by default.\n required_exts (Optional[List[str]]): List of required extensions.\n Default is None.\n file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n extension to a BaseReader class that specifies how to convert that file\n to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n num_files_limit (Optional[int]): Maximum number of files to read.\n Default is None.\n file_metadata (Optional[Callable[str, Dict]]): A function that takes\n in a filename and returns a Dict of metadata for the Document.\n Default is None.\n \"\"\"\n\n input_dir: Optional[str] = None\n input_files: Optional[List] = None\n exclude: Optional[List] = None\n exclude_hidden: bool = True\n errors: str = \"ignore\"\n recursive: bool = False\n encoding: str = \"utf-8\"\n filename_as_id: bool = False\n required_exts: Optional[list[str]] = None\n file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n num_files_limit: Optional[int] = None\n file_metadata: Optional[Callable[[str], dict]] = None\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n from llama_index.core import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n
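Since the fields above are forwarded to llama-index's SimpleDirectoryReader, usage mirrors it directly. A sketch with placeholder paths:

```python
from kotaemon.loaders import DirectoryReader

reader = DirectoryReader(
    input_dir="docs/",  # placeholder directory
    recursive=True,
    required_exts=[".md", ".txt"],
)
documents = reader.load_data()
```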
"},{"location":"reference/loaders/docx_loader/","title":"Docx Loader","text":""},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader","title":"DocxReader","text":" Bases: BaseReader
Read .docx files with table support, using the python-docx library
Reader behavior: all paragraphs are extracted as one Document; each table is extracted as its own Document, rendered as a CSV string; the output is the concatenation of the two (tables + paragraphs). Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
class DocxReader(BaseReader):\n \"\"\"Read Docx files that respect table, using python-docx library\n\n Reader behavior:\n - All paragraphs are extracted as a Document\n - Each table is extracted as a Document, rendered as a CSV string\n - The output is a list of Documents, concatenating the above\n (tables + paragraphs)\n \"\"\"\n\n def __init__(self, *args, **kwargs):\n try:\n import docx # noqa\n except ImportError:\n raise ImportError(\n \"docx is not installed. \"\n \"Please install it using `pip install python-docx`\"\n )\n\n def _load_single_table(self, table) -> List[List[str]]:\n \"\"\"Extract content from tables. Return a list of columns: list[str]\n Some merged cells will share duplicated content.\n \"\"\"\n n_row = len(table.rows)\n n_col = len(table.columns)\n\n arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n for i, row in enumerate(table.rows):\n for j, cell in enumerate(row.cells):\n arrays[j][i] = cell.text\n\n return arrays\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
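A minimal usage sketch, assuming DocxReader is exported from kotaemon.loaders; the file name is a placeholder.

```python
from pathlib import Path

from kotaemon.loaders import DocxReader

reader = DocxReader()
docs = reader.load_data(Path("report.docx"))  # placeholder file

# tables come back as CSV-rendered Documents, followed by the paragraph text
tables = [d for d in docs if d.metadata.get("type") == "table"]
```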
"},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Docx reader
Parameters:
Name Type Description Defaultfile_path
Path
Path to .docx file
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the .docx file
Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/excel_loader/","title":"Excel Loader","text":"Pandas Excel reader.
Pandas parser for .xlsx files.
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader","title":"PandasExcelReader","text":" Bases: BaseReader
Pandas-based Excel parser. Parses Excel files via pandas' read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class PandasExcelReader(BaseReader):\n r\"\"\"Pandas-based CSV parser.\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
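A usage sketch; the file name is a placeholder, and pandas_config is passed straight through to pandas.read_excel.

```python
from kotaemon.loaders import PandasExcelReader

# e.g. treat the first row of each sheet as the header
reader = PandasExcelReader(pandas_config={"header": 0})
docs = reader.load_data("data.xlsx", include_sheetname=True)  # placeholder file
```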
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=False,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse an Excel file and extract the values from its sheets.
Parameters:
Name Type Description Defaultfile
Path
The path to the Excel file to read.
requiredinclude_sheetname
bool
Whether to include the sheet name in the output.
False
sheet_name
Union[str, int, None]
The specific sheet to read from, default is None which reads all sheets.
None
Returns:
Type DescriptionList[Document]
List[Document]: A list of Document objects containing the values extracted from the Excel file.
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader","title":"ExcelReader","text":" Bases: BaseReader
Spreadsheet reader that respects multiple worksheets. Parses Excel files via pandas' read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class ExcelReader(BaseReader):\n r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
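A usage sketch highlighting the difference from PandasExcelReader; the file name is a placeholder.

```python
from kotaemon.loaders import ExcelReader

reader = ExcelReader()
docs = reader.load_data("data.xlsx")  # placeholder file

# unlike PandasExcelReader, each worksheet becomes its own Document
for d in docs:
    print(d.metadata["sheet_name"], d.metadata["page_label"])
```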
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=True,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse an Excel file and extract the values from its sheets.
Parameters:
Name Type Description Defaultfile
Path
The path to the Excel file to read.
requiredinclude_sheetname
bool
Whether to include the sheet name in the output.
True
sheet_name
Union[str, int, None]
The specific sheet to read from, default is None which reads all sheets.
None
Returns:
Type DescriptionList[Document]
List[Document]: A list of Document objects containing the values extracted from the Excel file.
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
"},{"location":"reference/loaders/html_loader/","title":"Html Loader","text":""},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader","title":"HtmlReader","text":" Bases: BaseReader
Read HTML using html2text
Reader behavior: HTML is read with html2text; the resulting text is split by page_break_pattern; each page is extracted as a Document; the output is a list of Documents.
Parameters:
Name Type Description Defaultpage_break_pattern
str
Pattern to split the HTML into pages
None
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class HtmlReader(BaseReader):\n \"\"\"Reader HTML usimg html2text\n\n Reader behavior:\n - HTML is read with html2text.\n - All of the texts will be split by `page_break_pattern`\n - Each page is extracted as a Document\n - The output is a list of Documents\n\n Args:\n page_break_pattern (str): Pattern to split the HTML into pages\n \"\"\"\n\n def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n try:\n import html2text # noqa\n except ImportError:\n raise ImportError(\n \"html2text is not installed. \"\n \"Please install it using `pip install html2text`\"\n )\n\n self._page_break_pattern: Optional[str] = page_break_pattern\n super().__init__()\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
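A usage sketch; note that the pattern is applied to the text after html2text conversion, so "=== PAGE ===" below is a placeholder marker assumed to survive that conversion.

```python
from kotaemon.loaders import HtmlReader

reader = HtmlReader(page_break_pattern="=== PAGE ===")
docs = reader.load_data("export.html")  # placeholder file
```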
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Html reader
Parameters:
Name Type Description Defaultfile_path
Path | str
path to HTML file
requiredextra_info
Optional[dict]
extra information passed to this reader during extracting data
None
Returns:
Type Descriptionlist[Document]
list[Document]: list of documents extracted from the HTML file
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader","title":"MhtmlReader","text":" Bases: BaseReader
Parse MHTML files with BeautifulSoup.
libs/kotaemon/kotaemon/loaders/html_loader.py
class MhtmlReader(BaseReader):\n \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n def __init__(\n self,\n cache_dir: Optional[str] = getattr(\n flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n ),\n open_encoding: Optional[str] = None,\n bs_kwargs: Optional[dict] = None,\n get_text_separator: str = \"\",\n ) -> None:\n \"\"\"initialize with path, and optionally, file encoding to use, and any kwargs\n to pass to the BeautifulSoup object.\n\n Args:\n cache_dir: Path for markdwon format.\n file_path: Path to file to load.\n open_encoding: The encoding to use when opening the file.\n bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n get_text_separator: The separator to use when getting the text\n from the soup.\n \"\"\"\n try:\n import bs4 # noqa:F401\n except ImportError:\n raise ImportError(\n \"beautifulsoup4 package not found, please install it with \"\n \"`pip install beautifulsoup4`\"\n )\n\n self.cache_dir = cache_dir\n self.open_encoding = open_encoding\n if bs_kwargs is None:\n bs_kwargs = {\"features\": \"lxml\"}\n self.bs_kwargs = bs_kwargs\n self.get_text_separator = get_text_separator\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
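A usage sketch; the paths are placeholders, and cache_dir (optional) is where the markdown rendering is written.

```python
from kotaemon.loaders import MhtmlReader

reader = MhtmlReader(cache_dir="cache/")  # placeholder directory
docs = reader.load_data("page.mht")  # placeholder file
```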
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load MHTML document into document objects.
Source code inlibs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
"},{"location":"reference/loaders/mathpix_loader/","title":"Mathpix Loader","text":""},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader","title":"MathpixPDFReader","text":" Bases: BaseReader
Load PDF files using the Mathpix service.
libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n def __init__(\n self,\n processed_file_format: str = \"md\",\n max_wait_time_seconds: int = 500,\n should_clean_pdf: bool = True,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize with a file path.\n\n Args:\n processed_file_format: a format of the processed file. Default is \"mmd\".\n max_wait_time_seconds: a maximum time to wait for the response from\n the server. Default is 500.\n should_clean_pdf: a flag to clean the PDF file. Default is False.\n **kwargs: additional keyword arguments.\n \"\"\"\n self.mathpix_api_key = get_from_dict_or_env(\n kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n )\n self.mathpix_api_id = get_from_dict_or_env(\n kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n )\n self.processed_file_format = processed_file_format\n self.max_wait_time_seconds = max_wait_time_seconds\n self.should_clean_pdf = should_clean_pdf\n super().__init__()\n\n @property\n def _mathpix_headers(self) -> Dict[str, str]:\n return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n @property\n def url(self) -> str:\n return \"https://api.mathpix.com/v3/pdf\"\n\n @property\n def data(self) -> dict:\n options = {\n \"conversion_formats\": {self.processed_file_format: True},\n \"enable_tables_fallback\": True,\n }\n return {\"options_json\": json.dumps(options)}\n\n def send_pdf(self, file_path) -> str:\n with open(file_path, \"rb\") as f:\n files = {\"file\": f}\n response = requests.post(\n self.url, headers=self._mathpix_headers, files=files, data=self.data\n )\n response_data = response.json()\n if \"pdf_id\" in response_data:\n pdf_id = response_data[\"pdf_id\"]\n return pdf_id\n else:\n raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n\n def get_processed_pdf(self, pdf_id: str) -> str:\n self.wait_for_processing(pdf_id)\n url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n response = requests.get(url, headers=self._mathpix_headers)\n return response.content.decode(\"utf-8\")\n\n def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", 
\"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n if \"response_content\" in kwargs:\n # overriding response content if specified\n content = kwargs[\"response_content\"]\n else:\n # call original API\n pdf_id = self.send_pdf(file_path)\n content = self.get_processed_pdf(pdf_id)\n\n if self.should_clean_pdf:\n content = self.clean_pdf(content)\n tables, texts = parse_markdown_text_to_tables(content)\n documents = []\n for table in tables:\n text = strip_special_chars_markdown(table)\n metadata = {\n \"table_origin\": table,\n \"type\": \"table\",\n }\n if extra_info:\n metadata.update(extra_info)\n documents.append(\n Document(\n text=text,\n metadata=metadata,\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n for text in texts:\n metadata = {\"source\": file_path.name, \"type\": \"text\"}\n documents.append(Document(text=text, metadata=metadata))\n\n return documents\n
"},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"wait_for_processing(pdf_id)\n
Wait for processing to complete.
Parameters:
Name Type Description Defaultpdf_id
str
a PDF id.
requiredReturns: None
Source code inlibs/kotaemon/kotaemon/loaders/mathpix_loader.py
def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n
"},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"clean_pdf(contents)\n
Clean the processed PDF contents.
Parameters:
Name Type Description Defaultcontents
str
the processed PDF contents (Mathpix markdown).
requiredReturns: the cleaned contents.
Source code inlibs/kotaemon/kotaemon/loaders/mathpix_loader.py
def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the processed PDF contents.\n\n Args:\n contents: the processed PDF contents (Mathpix markdown).\n\n Returns:\n the cleaned contents.\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n
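A minimal usage sketch (illustrative only: the path is a placeholder, MATHPIX_API_ID/MATHPIX_API_KEY are assumed to be set in the environment or passed as keyword arguments, and MathpixPDFReader is assumed to be exported from kotaemon.loaders like the other readers):
from pathlib import Path\n\nfrom kotaemon.loaders import MathpixPDFReader\n\n# credentials come from the MATHPIX_API_ID / MATHPIX_API_KEY env vars here\nreader = MathpixPDFReader(should_clean_pdf=True)\ndocuments = reader.load_data(Path(\"path/to/file.pdf\"))\nfor doc in documents:\n print(doc.metadata[\"type\"]) # \"table\" or \"text\"\n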
"},{"location":"reference/loaders/ocr_loader/","title":"Ocr Loader","text":""},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader","title":"OCRReader","text":" Bases: BaseReader
Read PDF using OCR, with high focus on table extraction
Example>> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
Name Type Description Defaultendpoint
Optional[str]
URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT
or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT
(http://127.0.0.1:8000/v2/ai/infer/)
None
use_ocr
whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted.
True
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class OCRReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from kotaemon.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n self.use_ocr = use_ocr\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
Name Type Description Defaultfile_path
Path
Path to PDF file
requireddebug_path
Path
Path to store debug image output
requiredartifact_path
Path
Path to OCR endpoints artifacts directory
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the PDF file
Source code inlibs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
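A minimal usage sketch (the endpoint and path are illustrative; a FullOCR service is assumed to be reachable):
from pathlib import Path\n\nfrom kotaemon.loaders import OCRReader\n\n# the endpoint is illustrative; omit it to fall back to OCR_READER_ENDPOINT or the default\nreader = OCRReader(endpoint=\"http://127.0.0.1:8000/v2/ai/infer/\", use_ocr=True)\ndocuments = reader.load_data(Path(\"path/to/file.pdf\"))\ntables = [d for d in documents if d.metadata.get(\"type\") == \"table\"]\n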
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader","title":"ImageReader","text":" Bases: BaseReader
Read image files using OCR via the FullOCR endpoint
Example>> from kotaemon.loaders import ImageReader\n>> reader = ImageReader()\n>> documents = reader.load_data(\"path/to/image\")\n
Parameters:
Name Type Description Defaultendpoint
Optional[str]
URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT
or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT
(http://127.0.0.1:8000/v2/ai/infer/)
None
Source code inlibs/kotaemon/kotaemon/loaders/ocr_loader.py
class ImageReader(BaseReader):\n \"\"\"Read image files using OCR via the FullOCR endpoint\n\n Example:\n ```python\n >> from kotaemon.loaders import ImageReader\n >> reader = ImageReader()\n >> documents = reader.load_data(\"path/to/image\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to image file\n\n Returns:\n List[Document]: list of documents extracted from the image file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
Name Type Description Defaultfile_path
Path
Path to image file
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the image file
Source code inlibs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to image file\n\n Returns:\n List[Document]: list of documents extracted from the image file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
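A minimal usage sketch (illustrative path; the same FullOCR service is assumed, and ImageReader is assumed to be exported from kotaemon.loaders):
from pathlib import Path\n\nfrom kotaemon.loaders import ImageReader\n\nreader = ImageReader() # falls back to OCR_READER_ENDPOINT or the default endpoint\ndocuments = reader.load_data(Path(\"path/to/image.png\"))\nprint(len(documents)) # one Document per OCR result\n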
"},{"location":"reference/loaders/pdf_loader/","title":"Pdf Loader","text":""},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader","title":"PDFThumbnailReader","text":" Bases: PDFReader
PDF parser with thumbnail for each page.
Source code inlibs/kotaemon/kotaemon/loaders/pdf_loader.py
class PDFThumbnailReader(PDFReader):\n \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"\n Initialize PDFReader.\n \"\"\"\n super().__init__(return_full_document=False)\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n ) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, fs=None)\n
Parse file.
Source code inlibs/kotaemon/kotaemon/loaders/pdf_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
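A minimal usage sketch (illustrative path; PyMuPDF must be installed for thumbnail rendering, and PDFThumbnailReader is assumed to be exported from kotaemon.loaders):
from pathlib import Path\n\nfrom kotaemon.loaders import PDFThumbnailReader\n\nreader = PDFThumbnailReader()\ndocuments = reader.load_data(Path(\"path/to/file.pdf\"))\nthumbnails = [d for d in documents if d.metadata.get(\"type\") == \"thumbnail\"]\n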
"},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.get_page_thumbnails","title":"get_page_thumbnails","text":"get_page_thumbnails(file_path, pages, dpi=80)\n
Get image thumbnails of the pages in the PDF file.
Parameters:
Name Type Description Defaultfile_path
Path
path to the PDF file
requiredpages
list[int]
list of page numbers to extract
requiredReturns:
Type DescriptionList[Image]
list of page thumbnails as base64-encoded images
Source code inlibs/kotaemon/kotaemon/loaders/pdf_loader.py
def get_page_thumbnails(\n file_path: Path, pages: list[int], dpi: int = 80\n) -> List[Image.Image]:\n \"\"\"Get image thumbnails of the pages in the PDF file.\n\n Args:\n file_path (Path): path to the PDF file\n pages (list[int]): list of page numbers to extract\n\n Returns:\n list of page thumbnails as base64-encoded images\n \"\"\"\n\n img: Image.Image\n suffix = file_path.suffix.lower()\n assert suffix == \".pdf\", \"This function only supports PDF files.\"\n try:\n import fitz\n except ImportError:\n raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n doc = fitz.open(file_path)\n\n output_imgs = []\n for page_number in pages:\n page = doc.load_page(page_number)\n pm = page.get_pixmap(dpi=dpi)\n img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n output_imgs.append(convert_image_to_base64(img))\n\n return output_imgs\n
"},{"location":"reference/loaders/txt_loader/","title":"Txt Loader","text":""},{"location":"reference/loaders/unstructured_loader/","title":"Unstructured Loader","text":"Unstructured file reader.
A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.
To use the .doc and .xls parsers, install the system dependencies and the xlrd package:
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
pip install xlrd
"},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader","title":"UnstructuredReader","text":" Bases: BaseReader
General unstructured text reader for a variety of files.
Source code inlibs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n def __init__(self, *args: Any, **kwargs: Any) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args) # not passing kwargs to parent bc it cannot accept it\n\n self.api = False # we default to local\n if \"url\" in kwargs:\n self.server_url = str(kwargs[\"url\"])\n self.api = True # is url was set, switch to api\n else:\n self.server_url = \"http://localhost:8000\"\n\n if \"api\" in kwargs:\n self.api = kwargs[\"api\"]\n\n self.api_key = \"\"\n if \"api_key\" in kwargs:\n self.api_key = kwargs[\"api_key\"]\n\n \"\"\" Loads data using Unstructured.io\n\n Depending on the construction if url is set or api = True\n it'll parse file using API call, else parse it locally\n additional_metadata is extended by the returned metadata if\n split_documents is True\n\n Returns list of documents\n \"\"\"\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n ) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
"},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader.load_data","title":"load_data","text":"load_data(\n file, extra_info=None, split_documents=False, **kwargs\n)\n
If api is set, parse through api
Source code inlibs/kotaemon/kotaemon/loaders/unstructured_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
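A minimal usage sketch (illustrative path; parsing runs locally unless url or api is given at construction):
from pathlib import Path\n\nfrom kotaemon.loaders import UnstructuredReader\n\nreader = UnstructuredReader() # local partitioning; pass url=... or api=True for the hosted API\ndocs = reader.load_data(Path(\"path/to/file.docx\"), split_documents=True)\n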
"},{"location":"reference/loaders/utils/","title":"Utils","text":""},{"location":"reference/loaders/utils/adobe/","title":"Adobe","text":""},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.request_adobe_service","title":"request_adobe_service","text":"request_adobe_service(file_path, output_path='')\n
Main function to call the Adobe PDF Services extract operation and unzip the results. Args: file_path (str): path to the PDF file; output_path (str): path to store the results
Returns:
Name Type Descriptionoutput_path
str
path to the results
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def request_adobe_service(file_path: str, output_path: str = \"\") -> str:\n \"\"\"Main function to call the adobe service, and unzip the results.\n Args:\n file_path (str): path to the pdf file\n output_path (str): path to store the results\n\n Returns:\n output_path (str): path to the results\n\n \"\"\"\n try:\n from adobe.pdfservices.operation.auth.credentials import Credentials\n from adobe.pdfservices.operation.exception.exceptions import (\n SdkException,\n ServiceApiException,\n ServiceUsageException,\n )\n from adobe.pdfservices.operation.execution_context import ExecutionContext\n from adobe.pdfservices.operation.io.file_ref import FileRef\n from adobe.pdfservices.operation.pdfops.extract_pdf_operation import (\n ExtractPDFOperation,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ( # noqa: E501\n ExtractElementType,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ( # noqa: E501\n ExtractPDFOptions,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ( # noqa: E501\n ExtractRenditionsElementType,\n )\n except ImportError:\n raise ImportError(\n \"pdfservices-sdk is not installed. \"\n \"Please install it by running `pip install pdfservices-sdk\"\n \"@git+https://github.com/niallcm/pdfservices-python-sdk.git\"\n \"@bump-and-unfreeze-requirements`\"\n )\n\n if not output_path:\n output_path = tempfile.mkdtemp()\n\n try:\n # Initial setup, create credentials instance.\n credentials = (\n Credentials.service_principal_credentials_builder()\n .with_client_id(config(\"PDF_SERVICES_CLIENT_ID\", default=\"\"))\n .with_client_secret(config(\"PDF_SERVICES_CLIENT_SECRET\", default=\"\"))\n .build()\n )\n\n # Create an ExecutionContext using credentials\n # and create a new operation instance.\n execution_context = ExecutionContext.create(credentials)\n extract_pdf_operation = ExtractPDFOperation.create_new()\n\n # Set operation input from a source file.\n source = FileRef.create_from_local_file(file_path)\n extract_pdf_operation.set_input(source)\n\n # Build ExtractPDF options and set them into the operation\n extract_pdf_options: ExtractPDFOptions = (\n ExtractPDFOptions.builder()\n .with_elements_to_extract(\n [ExtractElementType.TEXT, ExtractElementType.TABLES]\n )\n .with_elements_to_extract_renditions(\n [\n ExtractRenditionsElementType.TABLES,\n ExtractRenditionsElementType.FIGURES,\n ]\n )\n .build()\n )\n extract_pdf_operation.set_options(extract_pdf_options)\n\n # Execute the operation.\n result: FileRef = extract_pdf_operation.execute(execution_context)\n\n # Save the result to the specified location.\n zip_file_path = os.path.join(\n output_path, \"ExtractTextTableWithFigureTableRendition.zip\"\n )\n result.save_as(zip_file_path)\n # Open the ZIP file\n with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n # Extract all contents to the destination folder\n zip_ref.extractall(output_path)\n except (ServiceApiException, ServiceUsageException, SdkException):\n logging.exception(\"Exception encountered while executing operation\")\n\n return output_path\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.make_markdown_table","title":"make_markdown_table","text":"make_markdown_table(table_as_list)\n
Convert a table from Python list representation to markdown format. The input list consists of the rows of the table; the first row is the header.
Parameters:
Name Type Description Defaulttable_as_list
List[List[str]]
list of table rows Example: [[\"Name\", \"Age\", \"Height\"], [\"Jake\", 20, 5'10], [\"Mary\", 21, 5'7]]
requiredReturns: markdown representation of the table
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def make_markdown_table(table_as_list: List[List[str]]) -> str:\n \"\"\"\n Convert a table from Python list representation to markdown format.\n The input list consists of the rows of the table; the first row is the header.\n\n Args:\n table_as_list: list of table rows\n Example: [[\"Name\", \"Age\", \"Height\"],\n [\"Jake\", 20, 5'10],\n [\"Mary\", 21, 5'7]]\n Returns:\n markdown representation of the table\n \"\"\"\n markdown = \"\\n\" + str(\"| \")\n\n for e in table_as_list[0]:\n to_add = \" \" + str(e) + str(\" |\")\n markdown += to_add\n markdown += \"\\n\"\n\n markdown += \"| \"\n for i in range(len(table_as_list[0])):\n markdown += str(\"--- | \")\n markdown += \"\\n\"\n\n for entry in table_as_list[1:]:\n markdown += str(\"| \")\n for e in entry:\n to_add = str(e) + str(\" | \")\n markdown += to_add\n markdown += \"\\n\"\n\n return markdown + \"\\n\"\n
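For example (output whitespace approximate):
from kotaemon.loaders.utils.adobe import make_markdown_table\n\ntable = [[\"Name\", \"Age\"], [\"Jake\", 20], [\"Mary\", 21]]\nprint(make_markdown_table(table))\n# | Name | Age |\n# | --- | --- |\n# | Jake | 20 |\n# | Mary | 21 |\n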
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_json","title":"load_json","text":"load_json(input_path)\n
Load json file
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def load_json(input_path: Union[str | Path]) -> dict:\n \"\"\"Load json file\"\"\"\n with open(input_path, \"r\") as fi:\n data = json.load(fi)\n\n return data\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_excel","title":"load_excel","text":"load_excel(input_path)\n
Load excel file and convert to markdown
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def load_excel(input_path: Union[str | Path]) -> str:\n \"\"\"Load excel file and convert to markdown\"\"\"\n\n df = pd.read_excel(input_path).fillna(\"\")\n # Convert dataframe to a list of rows\n row_list = [df.columns.values.tolist()] + df.values.tolist()\n\n for item_id, item in enumerate(row_list[0]):\n if \"Unnamed\" in item:\n row_list[0][item_id] = \"\"\n\n for row in row_list:\n for item_id, item in enumerate(row):\n row[item_id] = str(item).replace(\"_x000D_\", \" \").replace(\"\\n\", \" \").strip()\n\n markdown_str = make_markdown_table(row_list)\n return markdown_str\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.encode_image_base64","title":"encode_image_base64","text":"encode_image_base64(image_path)\n
Convert image to base64
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def encode_image_base64(image_path: Union[str | Path]) -> Union[bytes, str]:\n \"\"\"Convert image to base64\"\"\"\n\n with open(image_path, \"rb\") as image_file:\n return base64.b64encode(image_file.read()).decode(\"utf-8\")\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_table_paths","title":"parse_table_paths","text":"parse_table_paths(file_paths)\n
Read the table stored in an excel file given the file path
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def parse_table_paths(file_paths: List[Path]) -> str:\n \"\"\"Read the table stored in an excel file given the file path\"\"\"\n\n content = \"\"\n for path in file_paths:\n if path.suffix == \".xlsx\":\n content = load_excel(path)\n break\n return content\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_figure_paths","title":"parse_figure_paths","text":"parse_figure_paths(file_paths)\n
Read and convert an image to base64 given the image path
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:\n \"\"\"Read and convert an image to base64 given the image path\"\"\"\n\n content = \"\"\n for path in file_paths:\n if path.suffix == \".png\":\n base64_image = encode_image_base64(path)\n content = f\"data:image/png;base64,{base64_image}\" # type: ignore\n break\n return content\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_single_figure_caption","title":"generate_single_figure_caption","text":"generate_single_figure_caption(vlm_endpoint, figure)\n
Summarize a single figure using GPT-4V
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:\n \"\"\"Summarize a single figure using GPT-4V\"\"\"\n if figure:\n output = generate_gpt4v(\n endpoint=vlm_endpoint,\n prompt=\"Provide a short 2 sentence summary of this image?\",\n images=figure,\n )\n if \"sorry\" in output.lower():\n output = \"\"\n else:\n output = \"\"\n return output\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_figure_captions","title":"generate_figure_captions","text":"generate_figure_captions(\n vlm_endpoint, figures, max_figures_to_process\n)\n
Summarize several figures using GPT-4V. Args: vlm_endpoint (str): endpoint to the vision language model service figures (List): list of base64 images max_figures_to_process (int): the maximum number of figures that will be summarized; the rest are ignored.
Returns:
Name Type Descriptionresults
List[str]
list of all figure captions and empty strings for ignored figures.
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def generate_figure_captions(\n vlm_endpoint: str, figures: List, max_figures_to_process: int\n) -> List:\n \"\"\"Summarize several figures using GPT-4V.\n Args:\n vlm_endpoint (str): endpoint to the vision language model service\n figures (List): list of base64 images\n max_figures_to_process (int): the maximum number of figures that will be\n summarized; the rest are ignored.\n\n Returns:\n results (List[str]): list of all figure captions and empty strings for\n ignored figures.\n \"\"\"\n to_gen_figures = figures[:max_figures_to_process]\n other_figures = figures[max_figures_to_process:]\n\n with ThreadPoolExecutor() as executor:\n # pass the arguments to submit() directly: a no-arg lambda would capture\n # `figure` late and could caption the wrong image\n futures = [\n executor.submit(generate_single_figure_caption, vlm_endpoint, figure)\n for figure in to_gen_figures\n ]\n\n results = [future.result() for future in futures]\n return results + [\"\"] * len(other_figures)\n
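A usage sketch (the endpoint is a placeholder for a running vision language model service; the figure path is illustrative):
from pathlib import Path\n\nfrom kotaemon.loaders.utils.adobe import generate_figure_captions, parse_figure_paths\n\nfigure = parse_figure_paths([Path(\"figures/fig_1.png\")]) # base64 data URI\ncaptions = generate_figure_captions(\n vlm_endpoint=\"http://localhost:8000/gpt4v\", # hypothetical VLM endpoint\n figures=[figure],\n max_figures_to_process=5,\n)\n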
"},{"location":"reference/loaders/utils/box/","title":"Box","text":""},{"location":"reference/loaders/utils/box/#loaders.utils.box.bbox_to_points","title":"bbox_to_points","text":"bbox_to_points(box)\n
Convert bounding box to list of points
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def bbox_to_points(box: List[int]):\n \"\"\"Convert bounding box to list of points\"\"\"\n x1, y1, x2, y2 = box\n return [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.points_to_bbox","title":"points_to_bbox","text":"points_to_bbox(points)\n
Convert list of points to bounding box
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def points_to_bbox(points: List[Tuple[int, int]]):\n \"\"\"Convert list of points to bounding box\"\"\"\n all_x = [p[0] for p in points]\n all_y = [p[1] for p in points]\n return [min(all_x), min(all_y), max(all_x), max(all_y)]\n
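The two helpers round-trip with bbox_to_points, e.g.:
from kotaemon.loaders.utils.box import bbox_to_points, points_to_bbox\n\nbox = [10, 20, 110, 220] # [x1, y1, x2, y2]\npoints = bbox_to_points(box) # [(10, 20), (110, 20), (110, 220), (10, 220)]\nassert points_to_bbox(points) == [10, 20, 110, 220]\n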
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_points","title":"scale_points","text":"scale_points(points, scale_factor=1.0)\n
Scale points by a scale factor
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def scale_points(points: List[Tuple[int, int]], scale_factor: float = 1.0):\n \"\"\"Scale points by a scale factor\"\"\"\n return [(int(pos[0] * scale_factor), int(pos[1] * scale_factor)) for pos in points]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.union_points","title":"union_points","text":"union_points(points)\n
Return union bounding box of list of points
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def union_points(points: List[Tuple[int, int]]):\n \"\"\"Return union bounding box of list of points\"\"\"\n all_x = [p[0] for p in points]\n all_y = [p[1] for p in points]\n bbox = (min(all_x), min(all_y), max(all_x), max(all_y))\n return bbox\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_box","title":"scale_box","text":"scale_box(box, scale_factor=1.0)\n
Scale box by a scale factor
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def scale_box(box: List[int], scale_factor: float = 1.0):\n \"\"\"Scale box by a scale factor\"\"\"\n return [int(pos * scale_factor) for pos in box]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_h","title":"box_h","text":"box_h(box)\n
Return box height
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_h(box: List[int]):\n \"Return box height\"\n return box[3] - box[1]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_w","title":"box_w","text":"box_w(box)\n
Return box width
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_w(box: List[int]):\n \"Return box width\"\n return box[2] - box[0]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_area","title":"box_area","text":"box_area(box)\n
Return box area
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_area(box: List[int]):\n \"Return box area\"\n x1, y1, x2, y2 = box\n return (x2 - x1) * (y2 - y1)\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.get_rect_iou","title":"get_rect_iou","text":"get_rect_iou(gt_box, pd_box, iou_type=0)\n
Intersection over union on layout rectangle
Parameters:
Name Type Description Defaultgt_box
List[tuple]
List[tuple] A list containing bounding box coordinates of the ground truth
requiredpd_box
List[tuple]
List[tuple] A list containing bounding box coordinates of the prediction
requirediou_type
int 0: intersection / union, normal IOU 1: intersection / min(areas), useful when boxes are under/over-segmented
0
Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)], i.e. the four corners of each box, clockwise from the top-left.
Returns:
Type Descriptionfloat
Intersection over union value
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> float:\n \"\"\"Intersection over union on layout rectangle\n\n Args:\n gt_box: List[tuple]\n A list containing bounding box coordinates of the ground truth\n pd_box: List[tuple]\n A list containing bounding box coordinates of the prediction\n iou_type: int\n 0: intersection / union, normal IOU\n 1: intersection / min(areas), useful when boxes are under/over-segmented\n\n Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n Annotation for each element in bbox:\n (x1, y1) (x2, y1)\n +-------+\n | |\n | |\n +-------+\n (x1, y2) (x2, y2)\n\n Returns:\n Intersection over union value\n \"\"\"\n\n assert iou_type in [0, 1], \"Only support 0: origin iou, 1: intersection / min(area)\"\n\n # determine the (x, y)-coordinates of the intersection rectangle\n # gt_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n # pd_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n x_left = max(gt_box[0][0], pd_box[0][0])\n y_top = max(gt_box[0][1], pd_box[0][1])\n x_right = min(gt_box[2][0], pd_box[2][0])\n y_bottom = min(gt_box[2][1], pd_box[2][1])\n\n # compute the area of intersection rectangle\n interArea = max(0, x_right - x_left) * max(0, y_bottom - y_top)\n\n # compute the area of both the prediction and ground-truth\n # rectangles\n gt_area = (gt_box[2][0] - gt_box[0][0]) * (gt_box[2][1] - gt_box[0][1])\n pd_area = (pd_box[2][0] - pd_box[0][0]) * (pd_box[2][1] - pd_box[0][1])\n\n # compute the intersection over union by taking the intersection\n # area and dividing it by the sum of prediction + ground-truth\n # areas - the intersection area\n if iou_type == 0:\n iou = interArea / float(gt_area + pd_area - interArea)\n elif iou_type == 1:\n iou = interArea / max(min(gt_area, pd_area), 1)\n\n # return the intersection over union value\n return iou\n
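For example, two 2x2 boxes overlapping in a 1x2 strip give 1/3 under the normal IoU and 1/2 under the min-area variant:
from kotaemon.loaders.utils.box import bbox_to_points, get_rect_iou\n\na = bbox_to_points([0, 0, 2, 2])\nb = bbox_to_points([1, 0, 3, 2])\nget_rect_iou(a, b, iou_type=0) # 2 / (4 + 4 - 2) = 0.333...\nget_rect_iou(a, b, iou_type=1) # 2 / min(4, 4) = 0.5\n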
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.sort_funsd_reading_order","title":"sort_funsd_reading_order","text":"sort_funsd_reading_order(lines, box_key_name='box')\n
Sort cell list to create the right reading order using their locations
Parameters:
Name Type Description Defaultlines
List[dict]
list of cells to sort
requiredReturns:
Type Descriptionthe cells sorted in reading order (top-to-bottom, left-to-right)
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def sort_funsd_reading_order(lines: List[dict], box_key_name: str = \"box\"):\n \"\"\"Sort cell list to create the right reading order using their locations\n\n Args:\n lines: list of cells to sort\n\n Returns:\n the cells sorted in reading order (top-to-bottom, left-to-right)\n \"\"\"\n sorted_list = []\n\n if len(lines) == 0:\n return lines\n\n while len(lines) > 1:\n topleft_line = lines[0]\n for line in lines[1:]:\n topleft_line_pos = topleft_line[box_key_name]\n topleft_line_center_y = (topleft_line_pos[1] + topleft_line_pos[3]) / 2\n x1, y1, x2, y2 = line[box_key_name]\n box_center_x = (x1 + x2) / 2\n box_center_y = (y1 + y2) / 2\n cell_h = y2 - y1\n if box_center_y <= topleft_line_center_y - cell_h / 2:\n topleft_line = line\n continue\n if (\n box_center_x < topleft_line_pos[2]\n and box_center_y < topleft_line_pos[3]\n ):\n topleft_line = line\n continue\n sorted_list.append(topleft_line)\n lines.remove(topleft_line)\n\n sorted_list.append(lines[0])\n\n return sorted_list\n
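For example, boxes on the same line come out left-to-right before lower lines (note that the input list is consumed in place):
from kotaemon.loaders.utils.box import sort_funsd_reading_order\n\nlines = [\n {\"text\": \"right\", \"box\": [100, 0, 140, 10]},\n {\"text\": \"left\", \"box\": [0, 0, 40, 10]},\n {\"text\": \"below\", \"box\": [0, 50, 40, 60]},\n]\nordered = sort_funsd_reading_order(lines)\n# [item[\"text\"] for item in ordered] == [\"left\", \"right\", \"below\"]\n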
"},{"location":"reference/loaders/utils/gpt4v/","title":"Gpt4V","text":""},{"location":"reference/loaders/utils/pdf_ocr/","title":"Pdf Ocr","text":""},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.read_pdf_unstructured","title":"read_pdf_unstructured","text":"read_pdf_unstructured(input_path)\n
Convert PDF from specified path to list of text items with location information
Parameters:
Name Type Description Defaultinput_path
Union[Path, str]
path to input file
requiredReturns:
Type DescriptionDict page_number: list of text boxes
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def read_pdf_unstructured(input_path: Union[Path, str]):\n \"\"\"Convert PDF from specified path to list of text items with\n location information\n\n Args:\n input_path: path to input file\n\n Returns:\n Dict page_number: list of text boxes\n \"\"\"\n try:\n from unstructured.partition.auto import partition\n except ImportError as e:\n raise ImportError(\n \"Please install unstructured PDF reader `pip install unstructured[pdf]`: \"\n f\"{e}\"\n )\n\n page_items = defaultdict(list)\n items = partition(input_path)\n for item in items:\n page_number = item.metadata.page_number\n bbox = points_to_bbox(item.metadata.coordinates.points)\n coord_system = item.metadata.coordinates.system\n max_w, max_h = coord_system.width, coord_system.height\n page_items[page_number - 1].append(\n {\n \"text\": item.text,\n \"box\": bbox,\n \"location\": bbox_to_points(bbox),\n \"page_shape\": (max_w, max_h),\n }\n )\n\n return page_items\n
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_ocr_and_pdf_texts","title":"merge_ocr_and_pdf_texts","text":"merge_ocr_and_pdf_texts(\n ocr_list, pdf_text_list, debug_info=None\n)\n
Merge PDF and OCR text using IOU overlapping location Args: ocr_list: List of OCR items {\"text\", \"box\", \"location\"} pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}
Returns:
Type DescriptionCombined list of PDF text and non-overlap OCR text
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def merge_ocr_and_pdf_texts(\n ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None\n):\n \"\"\"Merge PDF and OCR text using IOU overlapping location\n Args:\n ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n Returns:\n Combined list of PDF text and non-overlap OCR text\n \"\"\"\n not_matched_ocr = []\n\n # check for debug info\n if debug_info is not None:\n cv2, debug_im = debug_info\n\n for ocr_item in ocr_list:\n matched = False\n for pdf_item in pdf_text_list:\n if (\n get_rect_iou(ocr_item[\"location\"], pdf_item[\"location\"], iou_type=1)\n > IOU_THRES\n ):\n matched = True\n break\n\n color = (255, 0, 0)\n if not matched:\n ocr_item[\"matched\"] = False\n not_matched_ocr.append(ocr_item)\n color = (0, 255, 255)\n\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n ocr_item[\"location\"][0],\n ocr_item[\"location\"][2],\n color=color,\n thickness=1,\n )\n\n if debug_info is not None:\n for pdf_item in pdf_text_list:\n cv2.rectangle(\n debug_im,\n pdf_item[\"location\"][0],\n pdf_item[\"location\"][2],\n color=(0, 255, 0),\n thickness=2,\n )\n\n return pdf_text_list + not_matched_ocr\n
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_table_cell_and_ocr","title":"merge_table_cell_and_ocr","text":"merge_table_cell_and_ocr(\n table_list, ocr_list, pdf_list, debug_info=None\n)\n
Merge table items with OCR text using IOU overlapping location Args: table_list: List of table items \"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"} ocr_list: List of OCR items {\"text\", \"box\", \"location\"} pdf_list: List of PDF items {\"text\", \"box\", \"location\"}
Returns:
Name Type Descriptionall_table_cells
List of tables, each of table is represented by list of cells with combined text from OCR
not_matched_items
List of PDF text which is not overlapped by table region
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def merge_table_cell_and_ocr(\n table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None\n):\n \"\"\"Merge table items with OCR text using IOU overlapping location\n Args:\n table_list: List of table items\n \"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}\n ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n pdf_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n Returns:\n all_table_cells: List of tables, each of table is represented\n by list of cells with combined text from OCR\n not_matched_items: List of PDF text which is not overlapped by table region\n \"\"\"\n # check for debug info\n if debug_info is not None:\n cv2, debug_im = debug_info\n\n cell_list = [item for item in table_list if item[\"type\"] == \"cell\"]\n table_list = [item for item in table_list if item[\"type\"] == \"table\"]\n\n # sort table by area\n table_list = sorted(table_list, key=lambda item: box_area(item[\"bbox\"]))\n\n all_tables = []\n matched_pdf_ids = []\n matched_cell_ids = []\n\n for table in table_list:\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n table[\"location\"][0],\n table[\"location\"][2],\n color=[0, 0, 255],\n thickness=5,\n )\n\n cur_table_cells = []\n for cell_id, cell in enumerate(cell_list):\n if cell_id in matched_cell_ids:\n continue\n\n if get_rect_iou(\n table[\"location\"], cell[\"location\"], iou_type=1\n ) > IOU_THRES and box_area(table[\"bbox\"]) > box_area(cell[\"bbox\"]):\n color = [128, 0, 128]\n # cell matched to table\n for item_list, item_type in [(pdf_list, \"pdf\"), (ocr_list, \"ocr\")]:\n cell[\"ocr\"] = []\n for item_id, item in enumerate(item_list):\n if item_type == \"pdf\" and item_id in matched_pdf_ids:\n continue\n if (\n get_rect_iou(item[\"location\"], cell[\"location\"], iou_type=1)\n > IOU_THRES\n ):\n cell[\"ocr\"].append(item)\n if item_type == \"pdf\":\n matched_pdf_ids.append(item_id)\n\n if len(cell[\"ocr\"]) > 0:\n # check if union of matched ocr does\n # not extend over cell boundary,\n # if True, continue to use OCR_list to match\n all_box_points_in_cell = []\n for item in cell[\"ocr\"]:\n all_box_points_in_cell.extend(item[\"location\"])\n union_box = union_points(all_box_points_in_cell)\n cell_okay = (\n box_h(union_box) <= box_h(cell[\"bbox\"]) * PADDING_THRES\n and box_w(union_box) <= box_w(cell[\"bbox\"]) * PADDING_THRES\n )\n else:\n cell_okay = False\n\n if cell_okay:\n if item_type == \"pdf\":\n color = [255, 0, 255]\n break\n\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n cell[\"location\"][0],\n cell[\"location\"][2],\n color=color,\n thickness=3,\n )\n\n matched_cell_ids.append(cell_id)\n cur_table_cells.append(cell)\n\n all_tables.append(cur_table_cells)\n\n not_matched_items = [\n item for _id, item in enumerate(pdf_list) if _id not in matched_pdf_ids\n ]\n if debug_info is not None:\n for item in not_matched_items:\n cv2.rectangle(\n debug_im,\n item[\"location\"][0],\n item[\"location\"][2],\n color=[128, 128, 128],\n thickness=3,\n )\n\n return all_tables, not_matched_items\n
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.parse_ocr_output","title":"parse_ocr_output","text":"parse_ocr_output(\n ocr_page_items,\n pdf_page_items,\n artifact_path=None,\n debug_path=None,\n)\n
Main function to combine OCR output and PDF text to form list of table / non-table regions Args: ocr_page_items: List of OCR items by page pdf_page_items: Dict of PDF texts (page number as key) debug_path: If specified, use OpenCV to plot debug image and save to debug_path
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def parse_ocr_output(\n ocr_page_items: List[dict],\n pdf_page_items: Dict[int, List[dict]],\n artifact_path: Optional[str] = None,\n debug_path: Optional[str] = None,\n):\n \"\"\"Main function to combine OCR output and PDF text to\n form list of table / non-table regions\n Args:\n ocr_page_items: List of OCR items by page\n pdf_page_items: Dict of PDF texts (page number as key)\n debug_path: If specified, use OpenCV to plot debug image and save to debug_path\n \"\"\"\n all_tables = []\n all_texts = []\n\n for page_id, page in enumerate(ocr_page_items):\n ocr_list = page[\"json\"][\"ocr\"]\n table_list = page[\"json\"][\"table\"]\n page_shape = page[\"image_shape\"]\n pdf_item_list = pdf_page_items[page_id]\n\n # create bbox additional information\n for item in ocr_list:\n item[\"box\"] = points_to_bbox(item[\"location\"])\n\n # re-scale pdf items according to new image size\n for item in pdf_item_list:\n scale_factor = page_shape[0] / item[\"page_shape\"][0]\n item[\"box\"] = scale_box(item[\"box\"], scale_factor=scale_factor)\n item[\"location\"] = scale_points(item[\"location\"], scale_factor=scale_factor)\n\n # if using debug mode, openCV must be installed\n if debug_path and artifact_path is not None:\n try:\n import cv2\n except ImportError:\n raise ImportError(\n \"Please install openCV first to use OCRReader debug mode\"\n )\n image_path = Path(artifact_path) / page[\"image\"]\n image = cv2.imread(str(image_path))\n debug_info = (cv2, image)\n else:\n debug_info = None\n\n new_pdf_list = merge_ocr_and_pdf_texts(\n ocr_list, pdf_item_list, debug_info=debug_info\n )\n\n # sort by reading order\n ocr_list = sort_funsd_reading_order(ocr_list)\n new_pdf_list = sort_funsd_reading_order(new_pdf_list)\n\n all_table_cells, non_table_text_list = merge_table_cell_and_ocr(\n table_list, ocr_list, new_pdf_list, debug_info=debug_info\n )\n\n table_texts = [table_cells_to_markdown(cells) for cells in all_table_cells]\n all_tables.extend([(page_id, text) for text in table_texts])\n all_texts.append(\n (page_id, \" \".join(item[\"text\"] for item in non_table_text_list))\n )\n\n # export debug image to debug_path\n if debug_path:\n cv2.imwrite(str(Path(debug_path) / \"page_{}.png\".format(page_id)), image)\n\n return all_tables, all_texts\n
"},{"location":"reference/loaders/utils/table/","title":"Table","text":""},{"location":"reference/loaders/utils/table/#loaders.utils.table.check_col_conflicts","title":"check_col_conflicts","text":"check_col_conflicts(col_a, col_b, thres=0.15)\n
Check if two columns A and B have non-empty content in the same row (to be used with merge_cols)
Parameters:
Name Type Description Defaultcol_a
List[str]
column A (list of str)
requiredcol_b
List[str]
column B (list of str)
requiredthres
float
fraction of overlapping rows allowed
0.15
Returns: True if the number of overlapping rows exceeds the threshold
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def check_col_conflicts(\n col_a: List[str], col_b: List[str], thres: float = 0.15\n) -> bool:\n \"\"\"Check if two columns A and B have non-empty content in the same row\n (to be used with merge_cols)\n\n Args:\n col_a: column A (list of str)\n col_b: column B (list of str)\n thres: fraction of overlapping rows allowed\n Returns:\n True if the number of overlapping rows exceeds the threshold\n \"\"\"\n num_rows = len([cell for cell in col_a if cell])\n assert len(col_a) == len(col_b)\n conflict_count = 0\n for cell_a, cell_b in zip(col_a, col_b):\n if cell_a and cell_b:\n conflict_count += 1\n return conflict_count > num_rows * thres\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.merge_cols","title":"merge_cols","text":"merge_cols(col_a, col_b)\n
Merge columns A and B if they do not have conflicting rows
Parameters:
Name Type Description Defaultcol_a
List[str]
column A (list of str)
requiredcol_b
List[str]
column B (list of str)
requiredReturns: merged column
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def merge_cols(col_a: List[str], col_b: List[str]) -> List[str]:\n \"\"\"Merge columns A and B if they do not have conflicting rows\n\n Args:\n col_a: column A (list of str)\n col_b: column B (list of str)\n Returns:\n merged column\n \"\"\"\n for r_id in range(len(col_a)):\n if col_b[r_id]:\n col_a[r_id] = col_a[r_id] + \" \" + col_b[r_id]\n return col_a\n
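For example, two sparse columns with no row filled in both can be merged:
from kotaemon.loaders.utils.table import check_col_conflicts, merge_cols\n\ncol_a = [\"Name\", \"Jake\", \"\"]\ncol_b = [\"\", \"\", \"Mary\"]\ncheck_col_conflicts(col_a, col_b) # False: no row is non-empty in both columns\nmerge_cols(col_a, col_b) # [\"Name\", \"Jake\", \" Mary\"]\n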
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.add_index_col","title":"add_index_col","text":"add_index_col(csv_rows)\n
Add index column as the first column of the table csv_rows
Parameters:
Name Type Description Defaultcsv_rows
List[List[str]]
input table
requiredReturns: output table with index column
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def add_index_col(csv_rows: List[List[str]]) -> List[List[str]]:\n \"\"\"Add index column as the first column of the table csv_rows\n\n Args:\n csv_rows: input table\n Returns:\n output table with index column\n \"\"\"\n new_csv_rows = [[\"row id\"] + [\"\"] * len(csv_rows[0])]\n for r_id, row in enumerate(csv_rows):\n new_csv_rows.append([str(r_id + 1)] + row)\n return new_csv_rows\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.compress_csv","title":"compress_csv","text":"compress_csv(csv_rows)\n
Compress table csv_rows by merging sparse columns (merge_cols)
Parameters:
Name Type Description Defaultcsv_rows
List[List[str]]
input table
requiredReturns: output: compressed table
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def compress_csv(csv_rows: List[List[str]]) -> List[List[str]]:\n \"\"\"Compress table csv_rows by merging sparse columns (merge_cols)\n\n Args:\n csv_rows: input table\n Returns:\n output: compressed table\n \"\"\"\n csv_cols = [[r[c_id] for r in csv_rows] for c_id in range(len(csv_rows[0]))]\n to_remove_col_ids = []\n last_c_id = 0\n for c_id in range(1, len(csv_cols)):\n if not check_col_conflicts(csv_cols[last_c_id], csv_cols[c_id]):\n to_remove_col_ids.append(c_id)\n csv_cols[last_c_id] = merge_cols(csv_cols[last_c_id], csv_cols[c_id])\n else:\n last_c_id = c_id\n\n csv_cols = [r for c_id, r in enumerate(csv_cols) if c_id not in to_remove_col_ids]\n csv_rows = [[c[r_id] for c in csv_cols] for r_id in range(len(csv_cols[0]))]\n return csv_rows\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.get_table_from_ocr","title":"get_table_from_ocr","text":"get_table_from_ocr(ocr_list, table_list)\n
Get the list of text lines belonging to the table regions specified by table_list
Parameters:
Name Type Description Defaultocr_list
List[dict]
list of OCR output in Casia format (Flax)
requiredtable_list
List[dict]
list of table output in Casia format (Flax)
requiredReturns:
Name Type DescriptionList[List[str]]
list of text lines for each table region
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def get_table_from_ocr(ocr_list: List[dict], table_list: List[dict]):\n \"\"\"Get the list of text lines belonging to the table regions specified by table_list\n\n Args:\n ocr_list: list of OCR output in Casia format (Flax)\n table_list: list of table output in Casia format (Flax)\n\n Returns:\n list of text lines for each table region\n \"\"\"\n table_texts = []\n for table in table_list:\n if table[\"type\"] != \"table\":\n continue\n cur_table_texts = []\n for ocr in ocr_list:\n _iou = get_rect_iou(table[\"location\"], ocr[\"location\"], iou_type=1)\n if _iou > 0.8:\n cur_table_texts.append(ocr[\"text\"])\n table_texts.append(cur_table_texts)\n\n return table_texts\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.make_markdown_table","title":"make_markdown_table","text":"make_markdown_table(array)\n
Convert table rows in list format to markdown string
Parameters:
Name Type Description Defaultarray
List[List[str]]
table rows as lists, with the first row as the header, e.g. [[\"Name\", \"Age\", \"Height\"], [\"Jake\", 20, 5'10], [\"Mary\", 21, 5'7]]
requiredReturns: String to put into a .md file
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def make_markdown_table(array: List[List[str]]) -> str:\n \"\"\"Convert table rows in list format to markdown string\n\n Args:\n array: rows of the table as lists, with the first row as the header.\n Example:\n [[\"Name\", \"Age\", \"Height\"],\n [\"Jake\", 20, 5'10],\n [\"Mary\", 21, 5'7]]\n Returns:\n String to put into a .md file\n \"\"\"\n array = compress_csv(array)\n array = add_index_col(array)\n markdown = \"\\n\" + str(\"| \")\n\n for e in array[0]:\n to_add = \" \" + str(e) + str(\" |\")\n markdown += to_add\n markdown += \"\\n\"\n\n markdown += \"| \"\n for i in range(len(array[0])):\n markdown += str(\"--- | \")\n markdown += \"\\n\"\n\n for entry in array[1:]:\n markdown += str(\"| \")\n for e in entry:\n to_add = str(e) + str(\" | \")\n markdown += to_add\n markdown += \"\\n\"\n\n return markdown + \"\\n\"\n
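A usage sketch; note that compress_csv and add_index_col run first, so the rendered header is the row-id row and the original header becomes data row 1:
from kotaemon.loaders.utils.table import make_markdown_table\n\nrows = [[\"Name\", \"Age\"], [\"Jake\", \"20\"], [\"Mary\", \"21\"]]\nmd = make_markdown_table(rows)\n# header row: | row id | | |\n# data row 1: | 1 | Name | Age |\n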
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_csv_string_to_list","title":"parse_csv_string_to_list","text":"parse_csv_string_to_list(csv_str)\n
Convert CSV string to list of rows
Parameters:
Name Type Description Defaultcsv_str
str
input CSV string
requiredReturns:
Type DescriptionList[List[str]]
Output table in list format
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def parse_csv_string_to_list(csv_str: str) -> List[List[str]]:\n \"\"\"Convert CSV string to list of rows\n\n Args:\n csv_str: input CSV string\n\n Returns:\n Output table in list format\n \"\"\"\n io = StringIO(csv_str)\n csv_reader = csv.reader(io, delimiter=\",\")\n rows = [row for row in csv_reader]\n return rows\n
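Usage is straightforward, since the function is a thin wrapper over csv.reader:

from kotaemon.loaders.utils.table import parse_csv_string_to_list

rows = parse_csv_string_to_list("Name,Age\nJake,20\nMary,21")
# -> [["Name", "Age"], ["Jake", "20"], ["Mary", "21"]]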
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.format_cell","title":"format_cell","text":"format_cell(cell, length_limit=None)\n
Format cell content by removing redundant characters and enforcing a length limit
Parameters:
Name Type Description Defaultcell
str
input cell text
requiredlength_limit
Optional[int]
limit of text length.
None
Returns:
Type Descriptionstr
new cell text
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def format_cell(cell: str, length_limit: Optional[int] = None) -> str:\n \"\"\"Format cell content by remove redundant character and enforce length limit\n\n Args:\n cell: input cell text\n length_limit: limit of text length.\n\n Returns:\n new cell text\n \"\"\"\n cell = cell.replace(\"\\n\", \" \")\n if length_limit:\n cell = cell[:length_limit]\n return cell\n
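For example:

from kotaemon.loaders.utils.table import format_cell

print(format_cell("a very long\ncell value", length_limit=10))
# newlines become spaces, then the text is truncated: "a very lon"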
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.extract_tables_from_csv_string","title":"extract_tables_from_csv_string","text":"extract_tables_from_csv_string(csv_content, table_texts)\n
Extract a list of tables from the FullOCR output (csv_content) using the specified table_texts
Parameters:
Name Type Description Defaultcsv_content
str
CSV output from FullOCR pipeline
requiredtable_texts
List[List[str]]
list of table texts extracted from get_table_from_ocr()
requiredReturns:
Type DescriptionTuple[List[str], str]
List of Markdown tables and the remaining non-table content
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def extract_tables_from_csv_string(\n csv_content: str, table_texts: List[List[str]]\n) -> Tuple[List[str], str]:\n \"\"\"Extract list of table from FullOCR output\n (csv_content) with the specified table_texts\n\n Args:\n csv_content: CSV output from FullOCR pipeline\n table_texts: list of table texts extracted\n from get_table_from_ocr()\n\n Returns:\n List of tables and non-text content\n \"\"\"\n rows = parse_csv_string_to_list(csv_content)\n used_row_ids = []\n table_csv_list = []\n for table in table_texts:\n cur_rows = []\n for row_id, row in enumerate(rows):\n scores = [\n any(cell in cell_reference for cell in table)\n for cell_reference in row\n if cell_reference\n ]\n score = sum(scores) / len(scores)\n if score > 0.5 and row_id not in used_row_ids:\n used_row_ids.append(row_id)\n cur_rows.append([format_cell(cell) for cell in row])\n if cur_rows:\n table_csv_list.append(make_markdown_table(cur_rows))\n else:\n print(\"table not matched\", table)\n\n non_table_rows = [\n row for row_id, row in enumerate(rows) if row_id not in used_row_ids\n ]\n non_table_text = \"\\n\".join(\n \" \".join(format_cell(cell) for cell in row) for row in non_table_rows\n )\n return table_csv_list, non_table_text\n
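A self-contained sketch; the csv_content and table_texts values below are hypothetical, and in the real pipeline table_texts comes from get_table_from_ocr():

from kotaemon.loaders.utils.table import extract_tables_from_csv_string

csv_content = "Name,Age\nJake,20\nIntro paragraph,\n"
table_texts = [["Name", "Age", "Jake", "20"]]

tables_md, non_table_text = extract_tables_from_csv_string(csv_content, table_texts)
# tables_md      -> one Markdown table built from the matched rows
# non_table_text -> the leftover rows joined as plain text ("Intro paragraph")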
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.strip_special_chars_markdown","title":"strip_special_chars_markdown","text":"strip_special_chars_markdown(text)\n
Strip special characters from input text in markdown table format
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def strip_special_chars_markdown(text: str) -> str:\n \"\"\"Strip special characters from input text in markdown table format\"\"\"\n return text.replace(\"|\", \"\").replace(\":---:\", \"\").replace(\"---\", \"\")\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_markdown_text_to_tables","title":"parse_markdown_text_to_tables","text":"parse_markdown_text_to_tables(text)\n
Convert markdown text to list of non-table spans and table spans
Parameters:
Name Type Description Defaulttext
str
input markdown text
requiredReturns:
Type DescriptionTuple[List[str], List[str]]
list of table spans and non-table spans
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def parse_markdown_text_to_tables(text: str) -> Tuple[List[str], List[str]]:\n \"\"\"Convert markdown text to list of non-table spans and table spans\n\n Args:\n text: input markdown text\n\n Returns:\n list of table spans and non-table spans\n \"\"\"\n # init empty tables and texts list\n tables = []\n texts = []\n\n # split input by line break\n lines = text.split(\"\\n\")\n cur_table = []\n cur_text: List[str] = []\n for line in lines:\n line = line.strip()\n if line.startswith(\"|\"):\n if len(cur_text) > 0:\n texts.append(cur_text)\n cur_text = []\n cur_table.append(line)\n else:\n # add new table to the list\n if len(cur_table) > 0:\n tables.append(cur_table)\n cur_table = []\n cur_text.append(line)\n\n table_texts = [\"\\n\".join(table) for table in tables]\n non_table_texts = [\"\\n\".join(text) for text in texts]\n return table_texts, non_table_texts\n
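For example:

from kotaemon.loaders.utils.table import parse_markdown_text_to_tables

md = "Intro paragraph.\n| Name | Age |\n| --- | --- |\n| Jake | 20 |\nClosing note.\n"
tables, texts = parse_markdown_text_to_tables(md)
# tables -> ["| Name | Age |\n| --- | --- |\n| Jake | 20 |"]
# texts  -> ["Intro paragraph."]; note that, per the source above, a trailing
# text span after the last table is not appended before returning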
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.table_cells_to_markdown","title":"table_cells_to_markdown","text":"table_cells_to_markdown(cells)\n
Convert list of cells with attached text to Markdown table
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def table_cells_to_markdown(cells: List[dict]):\n \"\"\"Convert list of cells with attached text to Markdown table\"\"\"\n\n if len(cells) == 0:\n return \"\"\n\n all_row_ids = []\n all_col_ids = []\n for cell in cells:\n all_row_ids.extend(cell[\"rows\"])\n all_col_ids.extend(cell[\"columns\"])\n\n num_rows, num_cols = max(all_row_ids) + 1, max(all_col_ids) + 1\n table_rows = [[\"\" for c in range(num_cols)] for r in range(num_rows)]\n\n # start filling in the grid\n for cell in cells:\n cell_text = \" \".join(item[\"text\"] for item in cell[\"ocr\"])\n start_row_id, end_row_id = cell[\"rows\"]\n start_col_id, end_col_id = cell[\"columns\"]\n span_cell = end_row_id != start_row_id or end_col_id != start_col_id\n\n # do not repeat long text in span cell to prevent context length issue\n if span_cell and len(cell_text.replace(\" \", \"\")) < 20 and start_row_id > 0:\n for row in range(start_row_id, end_row_id + 1):\n for col in range(start_col_id, end_col_id + 1):\n table_rows[row][col] += cell_text + \" \"\n else:\n table_rows[start_row_id][start_col_id] += cell_text + \" \"\n\n return make_markdown_table(table_rows)\n
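A usage sketch with a hypothetical 2x2 grid; each cell records its row/column span and the OCR lines found inside it, matching the fields read by the source above:

from kotaemon.loaders.utils.table import table_cells_to_markdown

cells = [
    {"rows": [0, 0], "columns": [0, 0], "ocr": [{"text": "Name"}]},
    {"rows": [0, 0], "columns": [1, 1], "ocr": [{"text": "Age"}]},
    {"rows": [1, 1], "columns": [0, 0], "ocr": [{"text": "Jake"}]},
    {"rows": [1, 1], "columns": [1, 1], "ocr": [{"text": "20"}]},
]
print(table_cells_to_markdown(cells))  # 2x2 Markdown table via make_markdown_table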
"},{"location":"reference/parsers/","title":"Parsers","text":""},{"location":"reference/parsers/#parsers.RegexExtractor","title":"RegexExtractor","text":" Bases: BaseComponent
Simple class for extracting text from a document using a regex pattern.
Parameters:
Name Type Description Defaultpattern
List[str]
The regex pattern(s) to use.
requiredoutput_map
dict
A mapping from extracted text to the desired output. Defaults to None.
required Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n \"\"\"\n Simple class for extracting text from a document using a regex pattern.\n\n Args:\n pattern (List[str]): The regex pattern(s) to use.\n output_map (dict, optional): A mapping from extracted text to the\n desired output. Defaults to None.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n pattern: list[str]\n output_map: dict[str, str] | Callable[[str], str] = Param(\n default_callback=lambda *_: {}\n )\n\n def __init__(self, pattern: str | list[str], **kwargs):\n if isinstance(pattern, str):\n pattern = [pattern]\n super().__init__(pattern=pattern, **kwargs)\n\n @staticmethod\n def run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n\n @staticmethod\n def map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n\n def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
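A minimal usage sketch, assuming RegexExtractor is importable from kotaemon.parsers as the page location suggests:

from kotaemon.parsers import RegexExtractor

# output_map rewrites each raw match; unmapped matches pass through unchanged.
extractor = RegexExtractor(
    pattern=r"\d{4}",
    output_map={"2024": "this year"},
)
results = extractor("Reports from 2023 and 2024")  # calls run() on one input
print(results[0].matches)  # -> ["2023", "this year"]
print(results[0].text)     # first match only: "2023"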
"},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod
","text":"run_raw_static(pattern, text)\n
Finds all non-overlapping occurrences of a pattern in a string.
Parameters:
Name Type Description Defaultpattern
str
The regular expression pattern to search for.
requiredtext
str
The input string to search in.
requiredReturns:
Type Descriptionlist[str]
List[str]: A list of all non-overlapping occurrences of the pattern in the string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.map_output","title":"map_output staticmethod
","text":"map_output(text, output_map)\n
Maps the given text
to its corresponding value in the output_map
dictionary.
Parameters:
Name Type Description Defaulttext
str
The input text to be mapped.
requiredoutput_map
dict
A dictionary containing mapping of input text to output values.
requiredReturns:
Name Type Descriptionstr
str
The corresponding value from the output_map
if text
is found in the dictionary, otherwise returns the original text
.
libs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw","title":"run_raw","text":"run_raw(text)\n
Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.
Parameters:
Name Type Description Defaulttext
str
The raw text to be processed.
requiredReturns:
Name Type DescriptionExtractorOutput
ExtractorOutput
The processed output as a single ExtractorOutput; its matches field holds every extracted string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.run","title":"run","text":"run(text)\n
Match the input against a pattern and return the output for each input
Parameters:
Name Type Description Defaulttext
str | list[str] | Document | list[Document]
contains the input string to be processed
requiredReturns:
Type Descriptionlist[ExtractorOutput]
A list containing the output ExtractorOutput for each input
Example>>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/parsers/regex_extractor/","title":"Regex Extractor","text":""},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor","title":"RegexExtractor","text":" Bases: BaseComponent
Simple class for extracting text from a document using a regex pattern.
Parameters:
Name Type Description Defaultpattern
List[str]
The regex pattern(s) to use.
requiredoutput_map
dict
A mapping from extracted text to the desired output. Defaults to None.
required Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n \"\"\"\n Simple class for extracting text from a document using a regex pattern.\n\n Args:\n pattern (List[str]): The regex pattern(s) to use.\n output_map (dict, optional): A mapping from extracted text to the\n desired output. Defaults to None.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n pattern: list[str]\n output_map: dict[str, str] | Callable[[str], str] = Param(\n default_callback=lambda *_: {}\n )\n\n def __init__(self, pattern: str | list[str], **kwargs):\n if isinstance(pattern, str):\n pattern = [pattern]\n super().__init__(pattern=pattern, **kwargs)\n\n @staticmethod\n def run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n\n @staticmethod\n def map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n\n def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod
","text":"run_raw_static(pattern, text)\n
Finds all non-overlapping occurrences of a pattern in a string.
Parameters:
Name Type Description Defaultpattern
str
The regular expression pattern to search for.
requiredtext
str
The input string to search in.
requiredReturns:
Type Descriptionlist[str]
List[str]: A list of all non-overlapping occurrences of the pattern in the string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.map_output","title":"map_output staticmethod
","text":"map_output(text, output_map)\n
Maps the given text
to its corresponding value in the output_map
dictionary.
Parameters:
Name Type Description Defaulttext
str
The input text to be mapped.
requiredoutput_map
dict
A dictionary containing mapping of input text to output values.
requiredReturns:
Name Type Descriptionstr
str
The corresponding value from the output_map
if text
is found in the dictionary, otherwise returns the original text
.
libs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw","title":"run_raw","text":"run_raw(text)\n
Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.
Parameters:
Name Type Description Defaulttext
str
The raw text to be processed.
requiredReturns:
Name Type DescriptionExtractorOutput
ExtractorOutput
The processed output as a single ExtractorOutput; its matches field holds every extracted string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run","title":"run","text":"run(text)\n
Match the input against a pattern and return the output for each input
Parameters:
Name Type Description Defaulttext
str | list[str] | Document | list[Document]
contains the input string to be processed
requiredReturns:
Type Descriptionlist[ExtractorOutput]
A list containing the output ExtractorOutput for each input
Example>>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/storages/","title":"Storages","text":""},{"location":"reference/storages/#storages.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
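Because every backend implements this same abstract interface, ingestion code can be written once against BaseDocumentStore; a small sketch (the kotaemon.base import path for Document is an assumption):

from typing import List

from kotaemon.base import Document  # assumed import path for Document
from kotaemon.storages import BaseDocumentStore, InMemoryDocumentStore

def ingest(store: BaseDocumentStore, texts: List[str]) -> List[Document]:
    """Works with any backend: in-memory, file-backed, Elasticsearch, LanceDB."""
    docs = [Document(text=t) for t in texts]
    store.add(docs)
    return docs

docs = ingest(InMemoryDocumentStore(), ["hello", "world"])
assert len(docs) == 2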
"},{"location":"reference/storages/#storages.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
Document or list of documents
requiredids
Optional[Union[List[str], str]]
List of ids of the documents. Optional, if not set will use doc.doc_id
None
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Simple memory document store that store document in a dictionary
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n \"\"\"Document store backed by Elasticsearch, supporting BM25 full-text search\"\"\"\n\n def __init__(\n self,\n collection_name: str = \"docstore\",\n elasticsearch_url: str = \"http://localhost:9200\",\n k1: float = 2.0,\n b: float = 0.75,\n **kwargs,\n ):\n try:\n from elasticsearch import Elasticsearch\n from elasticsearch.helpers import bulk\n except ImportError:\n raise ImportError(\n \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n )\n\n self.elasticsearch_url = elasticsearch_url\n self.index_name = collection_name\n self.k1 = k1\n self.b = b\n\n # Create an Elasticsearch client instance\n self.client = Elasticsearch(elasticsearch_url, **kwargs)\n self.es_bulk = bulk\n # Define the index settings and mappings\n settings = {\n \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n \"similarity\": {\n \"custom_bm25\": {\n \"type\": \"BM25\",\n \"k1\": k1,\n \"b\": b,\n }\n },\n }\n mappings = {\n \"properties\": {\n \"content\": {\n \"type\": \"text\",\n \"similarity\": \"custom_bm25\", # Use the custom BM25 similarity\n }\n }\n }\n\n # Create the index with the specified settings and mappings\n if not self.client.indices.exists(index=self.index_name):\n self.client.indices.create(\n index=self.index_name, mappings=mappings, settings=settings\n )\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n\n def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n\n def __persist_flow__(self):\n return {\n \"index_name\": self.index_name,\n \"elasticsearch_url\": self.elasticsearch_url,\n \"k1\": self.k1,\n \"b\": self.b,\n }\n
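A usage sketch against a local Elasticsearch instance; it assumes the elasticsearch client is installed, a server is reachable at the default URL, and that Document is importable from kotaemon.base:

from kotaemon.base import Document  # assumed import path for Document
from kotaemon.storages import ElasticsearchDocumentStore

store = ElasticsearchDocumentStore(
    collection_name="docstore",
    elasticsearch_url="http://localhost:9200",
)
store.add([Document(text="kotaemon chats with your documents")])
hits = store.query("chat documents", top_k=5)  # BM25 full-text search
for doc in hits:
    print(doc.text)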
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
refresh_indices
bool
request Elasticsearch to update its index (default to True)
True
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:
Name Type Description Defaultquery
dict
Elasticsearch query format
requiredReturns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:
Name Type Description Defaultquery
str
query text
requiredtop_k
int
number of top documents to return. Defaults to 10.
10
Returns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple memory document store that store document in a dictionary
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
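For example (the Document import path is an assumption):

from kotaemon.base import Document  # assumed import path for Document
from kotaemon.storages import InMemoryDocumentStore

store = InMemoryDocumentStore()
doc = Document(text="hello world")
store.add(doc)
assert store.count() == 1

store.save("docstore.json")  # persistence is manual for this backend
store.delete(doc.doc_id)
assert store.count() == 0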
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
raise error when duplicate doc-id found in the docstore (default to False)
required Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LancdDB document store which support full-text search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tantivy'\"\n )\n\n self.db_uri = path\n self.collection_name = collection_name\n self.db_connection = lancedb.connect(self.db_uri) # type: ignore\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n if doc_ids:\n id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n query_filter = f\"id in ({id_filter})\"\n else:\n query_filter = None\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n if query_filter:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .where(query_filter, prefilter=True)\n .limit(top_k)\n .to_list()\n )\n else:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .limit(top_k)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n def get_all(self) -> List[Document]:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"db_uri\": self.db_uri,\n \"collection_name\": self.collection_name,\n }\n
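A usage sketch; it assumes lancedb and its tantivy full-text dependency are installed, and that Document is importable from kotaemon.base. The database is written under a local directory:

from kotaemon.base import Document  # assumed import path for Document
from kotaemon.storages import LanceDBDocumentStore

store = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")
store.add([Document(text="full-text search backed by tantivy")])
hits = store.query("tantivy", top_k=3)  # FTS over the indexed "text" column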
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Improve InMemoryDocumentStore by auto saving whenever the corpus is changed
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):\n \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n def __init__(self, path: str | Path, collection_name: str = \"default\"):\n super().__init__()\n self._path = path\n self._collection_name = collection_name\n\n Path(path).mkdir(parents=True, exist_ok=True)\n self._save_path = Path(path) / f\"{collection_name}.json\"\n if self._save_path.is_file():\n self.load(self._save_path)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n from theflow.utils.modules import serialize\n\n return {\n \"path\": serialize(self._path),\n \"collection_name\": self._collection_name,\n }\n
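For example, every add and delete below immediately rewrites <path>/<collection_name>.json (the Document import path is an assumption):

from kotaemon.base import Document  # assumed import path for Document
from kotaemon.storages import SimpleFileDocumentStore

store = SimpleFileDocumentStore(path="./docstore", collection_name="default")
store.add(Document(text="persisted on every change"))
# reopening with the same path/collection reloads ./docstore/default.json
store2 = SimpleFileDocumentStore(path="./docstore", collection_name="default")
assert store2.count() == store.count()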
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
raise error when duplicate doc-id found in the docstore (default to False)
required Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
"},{"location":"reference/storages/#storages.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/base.py`
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
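To make the contract concrete, here is a toy in-memory implementation of the interface (an illustrative sketch, not part of kotaemon; the import path is assumed from the source file location above) that ranks by brute-force cosine similarity:

```python
import math
import uuid

# assumed import path, based on the source file location above
from kotaemon.storages.vectorstores import BaseVectorStore


class ToyVectorStore(BaseVectorStore):
    """Toy example: brute-force cosine similarity over a dict."""

    def __init__(self):
        self._vectors: dict[str, list[float]] = {}

    def add(self, embeddings, metadatas=None, ids=None) -> list[str]:
        # accept raw vectors or DocumentWithEmbedding-like objects
        vectors = [e if isinstance(e, list) else e.embedding for e in embeddings]
        ids = ids or [str(uuid.uuid4()) for _ in vectors]
        self._vectors.update(zip(ids, vectors))
        return ids

    def delete(self, ids, **kwargs):
        for _id in ids:
            self._vectors.pop(_id, None)

    def query(self, embedding, top_k=1, ids=None, **kwargs):
        def cosine(a, b):
            norm = math.hypot(*a) * math.hypot(*b)
            return sum(x * y for x, y in zip(a, b)) / norm if norm else 0.0

        pool = ids if ids is not None else list(self._vectors)
        ranked = sorted(
            pool, key=lambda i: cosine(embedding, self._vectors[i]), reverse=True
        )[:top_k]
        return (
            [self._vectors[i] for i in ranked],                     # embeddings
            [cosine(embedding, self._vectors[i]) for i in ranked],  # scores
            ranked,                                                 # ids
        )

    def drop(self):
        self._vectors = {}
```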
"},{"location":"reference/storages/#storages.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embeddings` | `list[list[float]] \| list[DocumentWithEmbedding]` | List of embeddings | *required* |
| `metadatas` | `Optional[list[dict]]` | List of metadata of the embeddings | `None` |
| `ids` | `Optional[list[str]]` | List of ids of the embeddings | `None` |
| `kwargs` | | meant for vectorstore-specific parameters | *required* |

Returns:

| Type | Description |
| --- | --- |
| `list[str]` | List of ids of the embeddings |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/base.py`
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ids` | `list[str]` | List of ids of the embeddings to be deleted | *required* |
| `kwargs` | | meant for vectorstore-specific parameters | `{}` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/base.py`
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embedding` | `list[float]` | List of embeddings | *required* |
| `top_k` | `int` | Number of most similar embeddings to return | `1` |
| `ids` | `Optional[list[str]]` | List of ids of the embeddings to be queried | `None` |

Returns:

| Type | Description |
| --- | --- |
| `tuple[list[list[float]], list[float], list[str]]` | the matched embeddings, the similarity scores, and the ids |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/base.py`
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/base.py`
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/chroma.py`
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
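A short usage sketch (the 3-dimensional vectors, ids, and metadata are made up; `chromadb` must be installed, and the import path is an assumed re-export):

```python
from kotaemon.storages import ChromaVectorStore  # assumed re-export

vs = ChromaVectorStore(path="./chroma", collection_name="demo")
vs.add(
    embeddings=[[0.1, 0.2, 0.3], [0.9, 0.8, 0.7]],
    metadatas=[{"file_id": "f1"}, {"file_id": "f2"}],
    ids=["v1", "v2"],
)
embeddings, scores, ids = vs.query(embedding=[0.1, 0.2, 0.25], top_k=1)
vs.delete(["v2"])
```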
"},{"location":"reference/storages/#storages.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ids` | `List[str]` | List of ids of the embeddings to be deleted | *required* |
| `kwargs` | | meant for vectorstore-specific parameters | `{}` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/chroma.py`
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/#storages.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/chroma.py`
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py`
class InMemoryVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n def save(\n self,\n save_path: str,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs,\n ):\n\n \"\"\"save a simpleVectorStore to a dictionary.\n\n Args:\n save_path: Path of saving vector to disk.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client.persist(persist_path=save_path, fs=fs)\n\n def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n \"\"\"Create a SimpleKVStore from a load directory.\n\n Args:\n load_path: Path of loading vector.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n # \"fs\": self._fs,\n }\n
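`save` and `load` delegate to the underlying LlamaIndex `SimpleVectorStore` persistence. A sketch of the round trip (the file path and import path are illustrative assumptions):

```python
from kotaemon.storages import InMemoryVectorStore  # assumed re-export

vs = InMemoryVectorStore()
vs.add(embeddings=[[1.0, 0.0], [0.0, 1.0]], ids=["a", "b"])
vs.save("/tmp/vectors.json")        # persist the in-memory data to disk

restored = InMemoryVectorStore()
restored.load("/tmp/vectors.json")  # swap in the persisted client
```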
"},{"location":"reference/storages/#storages.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save the underlying SimpleVectorStore to disk.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `save_path` | `str` | Path to save the vector store to on disk. | *required* |
| `fs` | `Optional[AbstractFileSystem]` | An abstract super-class for pythonic file-systems | `None` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py`
def save(\n self,\n save_path: str,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs,\n):\n\n \"\"\"save a simpleVectorStore to a dictionary.\n\n Args:\n save_path: Path of saving vector to disk.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load the underlying SimpleVectorStore from a persist path.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `load_path` | `str` | Path to load the vector store from. | *required* |
| `fs` | `Optional[AbstractFileSystem]` | An abstract super-class for pythonic file-systems | `None` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py`
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n \"\"\"Create a SimpleKVStore from a load directory.\n\n Args:\n load_path: Path of loading vector.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py`
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
"},{"location":"reference/storages/#storages.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py`
class LanceDBVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n def __init__(\n self,\n path: str = \"./lancedb\",\n collection_name: str = \"default\",\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tantivy'\"\n )\n\n db_connection = lancedb.connect(path) # type: ignore\n try:\n table = db_connection.open_table(collection_name)\n except FileNotFoundError:\n table = None\n\n self._kwargs = kwargs\n\n # pass through for nice IDE support\n super().__init__(\n uri=path,\n table_name=collection_name,\n table=table,\n **kwargs,\n )\n self._client = cast(LILanceDBVectorStore, self._client)\n self._client._metadata_keys = [\"file_id\"]\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n }\n
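Usage mirrors the other vector stores; note that only the `file_id` metadata key is retained (`_metadata_keys`). An illustrative sketch (paths, ids, and the 2-d vectors are made up; the import path is an assumed re-export):

```python
from kotaemon.storages import LanceDBVectorStore  # assumed re-export

vs = LanceDBVectorStore(path="./lancedb", collection_name="demo")
vs.add(
    embeddings=[[0.3, 0.1], [0.8, 0.9]],
    metadatas=[{"file_id": "f1"}, {"file_id": "f2"}],
    ids=["v1", "v2"],
)
_, scores, ids = vs.query(embedding=[0.3, 0.1], top_k=2)
```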
"},{"location":"reference/storages/#storages.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ids` | `List[str]` | List of ids of the embeddings to be deleted | *required* |
| `kwargs` | | meant for vectorstore-specific parameters | `{}` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py`
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/#storages.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py`
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
"},{"location":"reference/storages/#storages.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/milvus.py`
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
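Because the LlamaIndex Milvus wrapper needs the vector dimension at construction time, this class defers client creation until the first `add()` or `query()` and infers `dim` from the first embedding. A sketch (note that `_lazy_init` reads a `path` kwarg, so the sketch passes one explicitly; all values and the import path are illustrative assumptions):

```python
from kotaemon.storages import MilvusVectorStore  # assumed re-export

vs = MilvusVectorStore(uri="./milvus.db", collection_name="demo", path=".")
# no Milvus collection exists yet; _lazy_init runs on the first call below,
# creating the collection with dim=128 inferred from the embedding length
vs.add(embeddings=[[0.1] * 128], ids=["v1"])
print(vs.count())
```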
"},{"location":"reference/storages/#storages.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py`
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
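An illustrative sketch against a local Qdrant instance (the URL and import path are assumptions; Qdrant requires point ids to be UUIDs or unsigned integers, hence the generated id):

```python
import uuid

from kotaemon.storages import QdrantVectorStore  # assumed re-export

vs = QdrantVectorStore(collection_name="demo", url="http://localhost:6333")
point_id = str(uuid.uuid4())
vs.add(embeddings=[[0.2, 0.4]], ids=[point_id])
print(vs.count())   # exact point count via the qdrant client
vs.delete([point_id])
```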
"},{"location":"reference/storages/#storages.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ids` | `List[str]` | List of ids of the embeddings to be deleted | *required* |
| `kwargs` | | meant for vectorstore-specific parameters | `{}` |

Source code in `libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py`
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/#storages.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py`
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
"},{"location":"reference/storages/#storages.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore, but backed by a file on disk by default
Source code in `libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py`
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
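Unlike `InMemoryVectorStore`, persistence is automatic here: `add` and `delete` re-persist the store to `<path>/<collection_name>` after every call. A sketch (the path and import path are illustrative assumptions):

```python
from kotaemon.storages import SimpleFileVectorStore  # assumed re-export

vs = SimpleFileVectorStore(path="./vectorstore", collection_name="demo")
vs.add(embeddings=[[0.5, 0.5]], ids=["v1"])  # written to ./vectorstore/demo
```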
"},{"location":"reference/storages/docstores/","title":"Docstores","text":""},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `Union[Document, List[Document]]` | Document or list of documents | *required* |
| `ids` | `Optional[Union[List[str], str]]` | List of ids of the documents. Optional, if not set will use doc.doc_id | `None` |

Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/base.py`
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Elasticsearch-backed document store with BM25 full-text search
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
class ElasticsearchDocumentStore(BaseDocumentStore):\n \"\"\"Elasticsearch-backed document store with BM25 full-text search\"\"\"\n\n def __init__(\n self,\n collection_name: str = \"docstore\",\n elasticsearch_url: str = \"http://localhost:9200\",\n k1: float = 2.0,\n b: float = 0.75,\n **kwargs,\n ):\n try:\n from elasticsearch import Elasticsearch\n from elasticsearch.helpers import bulk\n except ImportError:\n raise ImportError(\n \"To use ElasticsearchDocumentStore please install `pip install elasticsearch`\"\n )\n\n self.elasticsearch_url = elasticsearch_url\n self.index_name = collection_name\n self.k1 = k1\n self.b = b\n\n # Create an Elasticsearch client instance\n self.client = Elasticsearch(elasticsearch_url, **kwargs)\n self.es_bulk = bulk\n # Define the index settings and mappings\n settings = {\n \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n \"similarity\": {\n \"custom_bm25\": {\n \"type\": \"BM25\",\n \"k1\": k1,\n \"b\": b,\n }\n },\n }\n mappings = {\n \"properties\": {\n \"content\": {\n \"type\": \"text\",\n \"similarity\": \"custom_bm25\", # Use the custom BM25 similarity\n }\n }\n }\n\n # Create the index with the specified settings and mappings\n if not self.client.indices.exists(index=self.index_name):\n self.client.indices.create(\n index=self.index_name, mappings=mappings, settings=settings\n )\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n\n def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n\n def __persist_flow__(self):\n return {\n \"index_name\": self.index_name,\n \"elasticsearch_url\": self.elasticsearch_url,\n \"k1\": self.k1,\n \"b\": self.b,\n }\n
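A usage sketch against a local Elasticsearch node (the URL is the class default; the documents are made up, and the import path is an assumed re-export):

```python
from kotaemon.base import Document
from kotaemon.storages import ElasticsearchDocumentStore  # assumed re-export

store = ElasticsearchDocumentStore(collection_name="docs")  # http://localhost:9200
store.add(
    [
        Document(text="kotaemon chats with your documents"),
        Document(text="BM25 ranks results by term statistics"),
    ]
)
hits = store.query("rank documents", top_k=5)  # BM25 full-text search
```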
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `Union[Document, List[Document]]` | list of documents to add | *required* |
| `ids` | `Optional[Union[List[str], str]]` | specify the ids of documents to add or use existing doc.doc_id | `None` |
| `refresh_indices` | `bool` | request Elasticsearch to update its index (default to True) | `True` |

Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `query` | `dict` | Elasticsearch query format | *required* |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | List of result documents |

Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
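For anything the `query()` helper cannot express, a raw Elasticsearch body can be passed straight through. Reusing the `store` from the sketch above (the body here is a plain ES match query, shown as an example):

```python
docs = store.query_raw(
    {"query": {"match": {"content": "term statistics"}}, "size": 3}
)
```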
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `query` | `str` | query text | *required* |
| `top_k` | `int` | number of top documents to return. Defaults to 10. | `10` |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | List of result documents |

Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py`
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple in-memory document store that stores documents in a dictionary
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
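`save`/`load` serialize the dict to JSON so a store can be rebuilt across runs; per the TODO in `load`, `Document` subclasses do not round-trip losslessly. A small round-trip sketch (the path and import path are illustrative assumptions):

```python
from kotaemon.base import Document
from kotaemon.storages import InMemoryDocumentStore  # assumed re-export

store = InMemoryDocumentStore()
store.add(Document(text="hello"), ids="doc-1")
store.save("/tmp/docstore.json")

restored = InMemoryDocumentStore()
restored.load("/tmp/docstore.json")
assert restored.count() == 1
```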
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `Union[Document, List[Document]]` | list of documents to add | *required* |
| `ids` | `Optional[Union[List[str], str]]` | specify the ids of documents to add or use existing doc.doc_id | `None` |
| `exist_ok` | | raise error when duplicate doc-id found in the docstore (default to False) | *required* |

Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/in_memory.py`
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LanceDB document store which supports full-text search queries
Source code in `libs/kotaemon/kotaemon/storages/docstores/lancedb.py`
class LanceDBDocumentStore(BaseDocumentStore):\n \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tantivy'\"\n )\n\n self.db_uri = path\n self.collection_name = collection_name\n self.db_connection = lancedb.connect(self.db_uri) # type: ignore\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n if doc_ids:\n id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n query_filter = f\"id in ({id_filter})\"\n else:\n query_filter = None\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n if query_filter:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .where(query_filter, prefilter=True)\n .limit(top_k)\n .to_list()\n )\n else:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .limit(top_k)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n def get_all(self) -> List[Document]:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"db_uri\": self.db_uri,\n \"collection_name\": self.collection_name,\n }\n
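A sketch of full-text indexing and search (paths are illustrative and the import path is an assumed re-export; note that every `add`/`delete` rebuilds the tantivy FTS index unless `refresh_indices=False` is passed):

```python
from kotaemon.base import Document
from kotaemon.storages import LanceDBDocumentStore  # assumed re-export

store = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")
store.add([Document(text="tantivy powers LanceDB full-text search")])
hits = store.query("full-text search", top_k=5)
```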
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code in `libs/kotaemon/kotaemon/storages/docstores/lancedb.py`
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/lancedb.py`
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/lancedb.py`
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/lancedb.py`
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Improves InMemoryDocumentStore by auto-saving whenever the corpus changes
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
class SimpleFileDocumentStore(InMemoryDocumentStore):\n \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n def __init__(self, path: str | Path, collection_name: str = \"default\"):\n super().__init__()\n self._path = path\n self._collection_name = collection_name\n\n Path(path).mkdir(parents=True, exist_ok=True)\n self._save_path = Path(path) / f\"{collection_name}.json\"\n if self._save_path.is_file():\n self.load(self._save_path)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n from theflow.utils.modules import serialize\n\n return {\n \"path\": serialize(self._path),\n \"collection_name\": self._collection_name,\n }\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `Union[Document, List[Document]]` | list of documents to add | *required* |
| `ids` | `Optional[Union[List[str], str]]` | specify the ids of documents to add or use existing doc.doc_id | `None` |
| `exist_ok` | | raise error when duplicate doc-id found in the docstore (default to False) | *required* |

Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in `libs/kotaemon/kotaemon/storages/docstores/simple_file.py`
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
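A minimal usage sketch (assuming SimpleFileDocumentStore and Document are exported from kotaemon.storages and kotaemon.base as elsewhere in this reference; the path and ids are illustrative). Every add or delete rewrites <path>/<collection_name>.json, so the corpus survives restarts:
from kotaemon.base import Document\nfrom kotaemon.storages import SimpleFileDocumentStore\n\n# every mutation is persisted to ./docstore/default.json\nstore = SimpleFileDocumentStore(path=\"./docstore\", collection_name=\"default\")\nstore.add(Document(text=\"hello\", id_=\"doc-1\"))\nprint(store.get(\"doc-1\")[0].text)  # \"hello\"\n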
"},{"location":"reference/storages/docstores/base/","title":"Base","text":""},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
Document or list of documents
requiredids
Optional[Union[List[str], str]]
List of ids of the documents. Optional, if not set will use doc.doc_id
None
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
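Because every concrete docstore implements this same contract, calling code can stay backend-agnostic. A hedged sketch (the helper name upsert_docs is illustrative, not part of the library; exist_ok is honored by the in-memory family and ignored by stores that overwrite or append by default):
from kotaemon.base import Document\n\ndef upsert_docs(store, docs: list[Document]):\n    \"\"\"Insert-or-overwrite by doc id against any BaseDocumentStore.\n\n    The in-memory family honors the `exist_ok` flag; Elasticsearch\n    overwrites existing ids by default.\n    \"\"\"\n    store.add(docs, ids=[doc.doc_id for doc in docs], exist_ok=True)\n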
"},{"location":"reference/storages/docstores/elasticsearch/","title":"Elasticsearch","text":""},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Elasticsearch-backed document store with BM25 full-text search
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch-backed document store with BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
refresh_indices
bool
request Elasticsearch to update its index (default to True)
True
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:
Name Type Description Defaultquery
dict
Elasticsearch query format
requiredReturns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:
Name Type Description Defaultquery
str
query text
requiredtop_k
int
number of top documents to return. Defaults to 10.
10
Returns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
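A hedged usage sketch (assumes a reachable Elasticsearch instance and `pip install elasticsearch`; the url and ids are illustrative):
from kotaemon.base import Document\nfrom kotaemon.storages import ElasticsearchDocumentStore\n\nstore = ElasticsearchDocumentStore(\n    collection_name=\"docstore\", elasticsearch_url=\"http://localhost:9200\"\n)\nstore.add(Document(text=\"BM25 ranks by term frequency\", id_=\"d1\"))\nhits = store.query(\"term frequency\", top_k=5)  # BM25 full-text search\n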
"},{"location":"reference/storages/docstores/in_memory/","title":"In Memory","text":""},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple in-memory document store that stores documents in a dictionary
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
when False (the default), raise an error if a duplicate doc-id is found in the docstore; when True, overwrite the existing document
required Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
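The store lives only in process memory, so persistence is explicit via save/load; note that query() is a stub returning an empty list. A sketch (the file name is illustrative):
from kotaemon.base import Document\nfrom kotaemon.storages import InMemoryDocumentStore\n\nstore = InMemoryDocumentStore()\nstore.add(Document(text=\"volatile until saved\", id_=\"d1\"))\nstore.save(\"docstore.json\")  # JSON snapshot on disk\n\nrestored = InMemoryDocumentStore()\nrestored.load(\"docstore.json\")\nassert restored.count() == 1\n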
"},{"location":"reference/storages/docstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LancdDB document store which support full-text search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                .where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
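A hedged sketch (assumes `pip install lancedb tantivy`, since tantivy backs the full-text index; docs must be passed as a list because add() iterates them directly):
from kotaemon.base import Document\nfrom kotaemon.storages import LanceDBDocumentStore\n\nstore = LanceDBDocumentStore(path=\"./lancedb\", collection_name=\"docstore\")\nstore.add([Document(text=\"full-text search backed by tantivy\", id_=\"d1\")])\nhits = store.query(\"tantivy\", top_k=5)  # FTS over the `text` column\n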
"},{"location":"reference/storages/docstores/simple_file/","title":"Simple File","text":""},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Improves InMemoryDocumentStore by automatically saving to disk whenever the corpus changes
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):\n \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n def __init__(self, path: str | Path, collection_name: str = \"default\"):\n super().__init__()\n self._path = path\n self._collection_name = collection_name\n\n Path(path).mkdir(parents=True, exist_ok=True)\n self._save_path = Path(path) / f\"{collection_name}.json\"\n if self._save_path.is_file():\n self.load(self._save_path)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n from theflow.utils.modules import serialize\n\n return {\n \"path\": serialize(self._path),\n \"collection_name\": self._collection_name,\n }\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
when False (the default), raise an error if a duplicate doc-id is found in the docstore; when True, overwrite the existing document
required Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
"},{"location":"reference/storages/vectorstores/","title":"Vectorstores","text":""},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:
Name Type Description Defaultembeddings
list[list[float]] | list[DocumentWithEmbedding]
List of embeddings
requiredmetadatas
Optional[list[dict]]
List of metadata of the embeddings
None
ids
Optional[list[str]]
List of ids of the embeddings
None
kwargs
meant for vectorstore-specific parameters
requiredReturns:
Type Descriptionlist[str]
List of ids of the embeddings
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
list[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
The query embedding
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
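The contract in one hedged sketch (shown with InMemoryVectorStore, documented below; per the docstring, query() returns the matched embeddings, the similarity scores, and the ids):
from kotaemon.storages import InMemoryVectorStore\n\nstore = InMemoryVectorStore()\nstore.add(embeddings=[[0.1, 0.2], [0.9, 0.8]], ids=[\"a\", \"b\"])\nmatched, scores, ids = store.query(embedding=[0.1, 0.2], top_k=1)\nprint(ids)  # [\"a\"]\n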
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
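A hedged usage sketch (assumes `pip install chromadb`; path, metadata, and ids are illustrative):
from kotaemon.storages import ChromaVectorStore\n\nstore = ChromaVectorStore(path=\"./chroma\", collection_name=\"default\")\nstore.add(embeddings=[[0.1, 0.2, 0.3]], metadatas=[{\"file_id\": \"f1\"}], ids=[\"v1\"])\nstore.delete(ids=[\"v1\"])  # remove individual vectors\nstore.drop()              # or drop the whole collection\n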
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n        \"\"\"Save the vector store to disk.\n\n        Args:\n            save_path: Path to save the vector store to.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n        \"\"\"Load the vector store from a persist path.\n\n        Args:\n            load_path: Path to load the vector store from.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save the vector store to disk.
Parameters:
Name Type Description Defaultsave_path
str
Path to save the vector store to.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save the vector store to disk.\n\n    Args:\n        save_path: Path to save the vector store to.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load the vector store from a persist path.
Parameters:
Name Type Description Defaultload_path
str
Path to load the vector store from.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load the vector store from a persist path.\n\n    Args:\n        load_path: Path to load the vector store from.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
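Persistence mirrors the in-memory docstore: explicit save/load around an otherwise volatile store. A sketch (the file name is illustrative):
from kotaemon.storages import InMemoryVectorStore\n\nstore = InMemoryVectorStore()\nstore.add(embeddings=[[0.0, 1.0]], ids=[\"v1\"])\nstore.save(\"vectors.json\")  # delegates to SimpleVectorStore.persist\n\nrestored = InMemoryVectorStore()\nrestored.load(\"vectors.json\")\n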
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
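A hedged sketch (assumes `pip install lancedb tantivy` and that the underlying llama-index LanceDB store creates the table on first add; note `file_id` is the only metadata key the wrapper registers):
from kotaemon.storages import LanceDBVectorStore\n\nstore = LanceDBVectorStore(path=\"./lancedb\", collection_name=\"default\")\nstore.add(embeddings=[[0.1, 0.2]], metadatas=[{\"file_id\": \"f1\"}], ids=[\"v1\"])\n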
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
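A hedged sketch (assumes `pip install llama-index-vector-stores-qdrant qdrant-client` and a reachable Qdrant server; Qdrant requires point ids to be UUIDs or unsigned integers, hence the generated id):
import uuid\n\nfrom kotaemon.storages import QdrantVectorStore\n\nstore = QdrantVectorStore(collection_name=\"default\", url=\"http://localhost:6333\")\npoint_id = str(uuid.uuid4())\nstore.add(embeddings=[[0.1, 0.2]], ids=[point_id])\nstore.delete(ids=[point_id])  # wraps qdrant_client models.PointIdsList\n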
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore but is backed by file by default
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
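Like SimpleFileDocumentStore, every mutation re-persists the store, here to <path>/<collection_name>. A minimal sketch (path and ids illustrative):
from kotaemon.storages import SimpleFileVectorStore\n\n# persisted to ./vectors/default after every add/delete\nstore = SimpleFileVectorStore(path=\"./vectors\", collection_name=\"default\")\nstore.add(embeddings=[[0.0, 1.0]], ids=[\"v1\"])\n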
"},{"location":"reference/storages/vectorstores/base/","title":"Base","text":""},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:
Name Type Description Defaultembeddings
list[list[float]] | list[DocumentWithEmbedding]
List of embeddings
requiredmetadatas
Optional[list[dict]]
List of metadata of the embeddings
None
ids
Optional[list[str]]
List of ids of the embeddings
None
kwargs
meant for vectorstore-specific parameters
requiredReturns:
Type Descriptionlist[str]
List of ids of the embeddings
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
list[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore","title":"LlamaIndexVectorStore","text":" Bases: BaseVectorStore
Mixin for LlamaIndex based vectorstores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
class LlamaIndexVectorStore(BaseVectorStore):\n    \"\"\"Mixin for LlamaIndex based vectorstores\"\"\"\n\n    _li_class: type[LIVectorStore | BasePydanticVectorStore] | None\n\n    def _get_li_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant LlamaIndex class in _get_li_class\"\n        )\n\n    def __init__(self, *args, **kwargs):\n        # get li_class from the method if not set\n        if not self._li_class:\n            LIClass = self._get_li_class()\n        else:\n            LIClass = self._li_class\n\n        from dataclasses import fields\n\n        self._client = LIClass(*args, **kwargs)\n\n        self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}\n        for key in [\"query_embedding\", \"similarity_top_k\", \"node_ids\"]:\n            if key in self._vsq_kwargs:\n                self._vsq_kwargs.remove(key)\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\"):\n            return super().__setattr__(name, value)\n\n        return setattr(self._client, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        if name == \"_li_class\":\n            return super().__getattribute__(name)\n\n        return getattr(self._client, name)\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if isinstance(embeddings[0], list):\n            nodes: list[DocumentWithEmbedding] = [\n                DocumentWithEmbedding(embedding=embedding) for embedding in embeddings\n            ]\n        else:\n            nodes = embeddings  # type: ignore\n        if metadatas is not None:\n            for node, metadata in zip(nodes, metadatas):\n                node.metadata = metadata\n        if ids is not None:\n            for node, id in zip(nodes, ids):\n                node.id_ = id\n                node.relationships = {\n                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)\n                }\n\n        return self._client.add(nodes=nodes)\n\n    def delete(self, ids: list[str], **kwargs):\n        for id_ in ids:\n            self._client.delete(ref_doc_id=id_, **kwargs)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n            kwargs: extra query parameters. Depending on the name, these parameters\n                will be used when constructing the VectorStoreQuery object or when\n                performing querying of the underlying vector store.\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        vsq_kwargs = {}\n        vs_kwargs = {}\n        for kwkey, kwvalue in kwargs.items():\n            if kwkey in self._vsq_kwargs:\n                vsq_kwargs[kwkey] = kwvalue\n            else:\n                vs_kwargs[kwkey] = kwvalue\n\n        output = self._client.query(\n            query=VectorStoreQuery(\n                query_embedding=embedding,\n                similarity_top_k=top_k,\n                node_ids=ids,\n                **vsq_kwargs,\n            ),\n            **vs_kwargs,\n        )\n\n        embeddings = []\n        if output.nodes:\n            for node in output.nodes:\n                embeddings.append(node.embedding)\n        similarities = output.similarities if output.similarities else []\n        out_ids = output.ids if output.ids else []\n\n        return embeddings, similarities, out_ids\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore.query","title":"query","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
kwargs
extra query parameters. Depending on the name, these parameters will be used when constructing the VectorStoreQuery object or when performing querying of the underlying vector store.
{}
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n kwargs: extra query parameters. Depending on the name, these parameters\n will be used when constructing the VectorStoreQuery object or when\n performing querying of the underlying vector store.\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n vsq_kwargs = {}\n vs_kwargs = {}\n for kwkey, kwvalue in kwargs.items():\n if kwkey in self._vsq_kwargs:\n vsq_kwargs[kwkey] = kwvalue\n else:\n vs_kwargs[kwkey] = kwvalue\n\n output = self._client.query(\n query=VectorStoreQuery(\n query_embedding=embedding,\n similarity_top_k=top_k,\n node_ids=ids,\n **vsq_kwargs,\n ),\n **vs_kwargs,\n )\n\n embeddings = []\n if output.nodes:\n for node in output.nodes:\n embeddings.append(node.embedding)\n similarities = output.similarities if output.similarities else []\n out_ids = output.ids if output.ids else []\n\n return embeddings, similarities, out_ids\n
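For orientation, here is a small usage sketch of this interface (hypothetical values; it assumes InMemoryVectorStore, documented below, is exported from kotaemon.storages):
from kotaemon.storages import InMemoryVectorStore\n\nvs = InMemoryVectorStore()\nvs.add(embeddings=[[0.1, 0.2], [0.9, 0.8]], ids=[\"doc-a\", \"doc-b\"])\n# returns (matched embeddings, similarity scores, ids)\n_, scores, ids = vs.query(embedding=[0.1, 0.2], top_k=1)\nprint(ids)  # expected: [\"doc-a\"]\n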
"},{"location":"reference/storages/vectorstores/chroma/","title":"Chroma","text":""},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
"},{"location":"reference/storages/vectorstores/in_memory/","title":"In Memory","text":"Simple vector store index.
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n        \"\"\"Save a SimpleVectorStore to a persist path.\n\n        Args:\n            save_path: Path for saving the vector store to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n        \"\"\"Load a SimpleVectorStore from a persist path.\n\n        Args:\n            load_path: Path of the persisted vector store.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save a SimpleVectorStore to a persist path.
Parameters:
Name Type Description Defaultsave_path
str
Path for saving the vector store to disk.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save a SimpleVectorStore to a persist path.\n\n    Args:\n        save_path: Path for saving the vector store to disk.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load a SimpleVectorStore from a persist path.
Parameters:
Name Type Description Defaultload_path
str
Path of the persisted vector store.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load a SimpleVectorStore from a persist path.\n\n    Args:\n        load_path: Path of the persisted vector store.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
"},{"location":"reference/storages/vectorstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy-py'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
"},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
"},{"location":"reference/storages/vectorstores/milvus/","title":"Milvus","text":""},{"location":"reference/storages/vectorstores/milvus/#storages.vectorstores.milvus.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/qdrant/","title":"Qdrant","text":""},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
"},{"location":"reference/storages/vectorstores/simple_file/","title":"Simple File","text":"Simple file vector store index.
"},{"location":"reference/storages/vectorstores/simple_file/#storages.vectorstores.simple_file.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore but is backed by file by default
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Quick Start","text":""},{"location":"#getting-started-with-kotaemon","title":"Getting Started with Kotaemon","text":"This page is intended for end users who want to use the kotaemon
tool for Question Answering on local documents. If you are a developer who wants to contribute to the project, please visit the development page.
Visit this guide.
"},{"location":"#installation-offline","title":"Installation (Offline)","text":""},{"location":"#download","title":"Download","text":"Download the kotaemon-app.zip
file from the latest release.
scripts
folder and start an installer that matches your OS:run_windows.bat
. Just double-click the file.run_macos.sh
run_linux.sh
. Please run the script using bash run_linux.sh
in your terminal.To launch the app after initial setup or any change, simply run the run_*
script again.
A browser window will open and greet you with this screen:
"},{"location":"#usage","title":"Usage","text":"For how to use the application, see Usage. This page will also be available to you within the application.
"},{"location":"#feedback","title":"Feedback","text":"Feel free to create a bug report or a feature request on our repo.
"},{"location":"about/","title":"About Kotaemon","text":""},{"location":"about/#about-kotaemon","title":"About Kotaemon","text":"An open-source tool for chatting with your documents. Built with both end users and developers in mind.
Source Code | Live Demo
User Guide | Developer Guide | Feedback
Dark Mode | Light Mode
"},{"location":"local_model/","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#setup-local-llms-embedding-models","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#prepare-local-models","title":"Prepare local models","text":""},{"location":"local_model/#note","title":"NOTE","text":"In the case of using Docker image, please replace http://localhost
with http://host.docker.internal
to communicate correctly with services on the host machine. See more detail.
Install ollama and start the application.
Pull your model (e.g.):
ollama pull llama3.1:8b\nollama pull nomic-embed-text\n
Set up the LLM and Embedding model on the Resources tab with type OpenAI. Set these model parameters to connect to Ollama:
api_key: ollama\nbase_url: http://localhost:11434/v1/\nmodel: gemma2:2b (for llm) | nomic-embed-text (for embedding)\n
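Because Ollama exposes an OpenAI-compatible API, you can sanity-check these parameters before configuring the app. A minimal sketch, assuming the openai Python package and a running Ollama server with the model pulled above:
from openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:11434/v1/\", api_key=\"ollama\")\nresp = client.chat.completions.create(\n    model=\"llama3.1:8b\",\n    messages=[{\"role\": \"user\", \"content\": \"Say hello in one word.\"}],\n)\nprint(resp.choices[0].message.content)\n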
"},{"location":"local_model/#oobaboogatext-generation-webui-openai-compatible-server","title":"oobabooga/text-generation-webui OpenAI compatible server","text":"Install oobabooga/text-generation-webui.
Follow the setup guide to download your models (GGUF, HF). Also take a look at OpenAI compatible server for detailed instructions.
Here is a short version:
# install sentence-transformer for embeddings creation\npip install sentence_transformers\n# change to text-generation-webui src dir\npython server.py --api\n
Use the Models
tab to download a new model and press Load.
Set up the LLM and Embedding model on the Resources tab with type OpenAI. Set these model parameters to connect to text-generation-webui
:
api_key: dummy\nbase_url: http://localhost:5000/v1/\nmodel: any\n
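To verify that both the chat and the embeddings endpoints respond before wiring them into the app, a small sketch (assuming the openai Python package; the embeddings route relies on the sentence_transformers install above):
from openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:5000/v1/\", api_key=\"dummy\")\nchat = client.chat.completions.create(\n    model=\"any\", messages=[{\"role\": \"user\", \"content\": \"ping\"}]\n)\nemb = client.embeddings.create(model=\"any\", input=[\"ping\"])\nprint(chat.choices[0].message.content, len(emb.data[0].embedding))\n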
"},{"location":"local_model/#llama-cpp-python-server-llm-only","title":"llama-cpp-python server (LLM only)","text":"See llama-cpp-python OpenAI server.
Download any GGUF model weights from HuggingFace or another source. Place the file somewhere on your local machine.
Run
LOCAL_MODEL=<path/to/GGUF> python scripts/serve_local.py\n
Set up the LLM on the Resources tab with type OpenAI. Set these model parameters to connect to llama-cpp-python
:
api_key: dummy\nbase_url: http://localhost:8000/v1/\nmodel: model_name\n
"},{"location":"local_model/#use-local-models-for-rag","title":"Use local models for RAG","text":"ollama
)ollama
). Alternatively, you can disable this feature if your machine cannot handle many parallel LLM requests at the same time.You are set! Start a new conversation to test your local RAG pipeline.
"},{"location":"online_install/","title":"Online install","text":""},{"location":"online_install/#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"To add a model:
Resources
tab.LLMs
sub-tab.Add
sub-tab.ChatOpenAI
).Add
to add the model.Embedding Models
sub-tab and repeat steps 3 to 5 to add an embedding model.Alternatively, you can configure the models via the .env
file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
Currently, the following providers are supported:
"},{"location":"usage/#openai","title":"OpenAI","text":"In the .env
file, set the OPENAI_API_KEY
variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.
OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
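To quickly confirm that these variables are readable from the application folder, a small sketch (assuming the python-dotenv package; the app itself may load the file differently):
import os\n\nfrom dotenv import load_dotenv\n\nload_dotenv()  # reads .env from the current working directory\nprint(os.getenv(\"OPENAI_CHAT_MODEL\"))  # expected: gpt-3.5-turbo\n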
"},{"location":"usage/#azure-openai","title":"Azure OpenAI","text":"For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. Your might also need to provide your developments' name for the chat model and the embedding model depending on how you set up Azure development.
AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview # could be different for you\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo # change to your deployment name\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 # change to your deployment name\n
"},{"location":"usage/#local-models","title":"Local models","text":"Pros:
Cons:
You can search for and download an LLM to be run locally from the Hugging Face Hub. Currently, these model formats are supported:
You should choose a model whose size is less than your device's memory and should leave about 2 GB free. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.
Here are some recommendations and their size in memory:
To add a local model to the model pool, set the LOCAL_MODEL
variable in the .env
file to the path of the model file.
LOCAL_MODEL=<full path to your model file>\n
Here is how to get the full path of your model file:
Copy as Path
.In order to do QA on your documents, you need to upload them to the application first. Navigate to the File Index
tab and you will see 2 sections:
Upload and Index
.Now navigate back to the Chat
tab. The chat tab is divided into 3 regions:
full-text search
if retrieved from full-text search DB).Generally, the score quality is LLM relevant score
> Reranking score
> Vectorscore
. By default, overall relevance score is taken directly from LLM relevant score. Evidences are sorted based on their overall relevance score and whether they have citation or not.
An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind.
Live Demo | Source Code
User Guide | Developer Guide | Feedback
"},{"location":"development/#introduction","title":"Introduction","text":"
This project serves as a functional RAG UI for both end users who want to do QA on their documents and developers who want to build their own RAG pipeline.
ollama
and llama-cpp-python
).+----------------------------------------------------------------------------+\n| End users: Those who use apps built with `kotaemon`. |\n| (You use an app like the one in the demo above) |\n| +----------------------------------------------------------------+ |\n| | Developers: Those who built with `kotaemon`. | |\n| | (You have `import kotaemon` somewhere in your project) | |\n| | +----------------------------------------------------+ | |\n| | | Contributors: Those who make `kotaemon` better. | | |\n| | | (You make PR to this repo) | | |\n| | +----------------------------------------------------+ | |\n| +----------------------------------------------------------------+ |\n+----------------------------------------------------------------------------+\n
This repository is under active development. Feedback, issues, and PRs are highly appreciated.
"},{"location":"development/#key-features","title":"Key Features","text":"GraphRAG
indexing pipeline is provided as an example.This document is intended for developers. If you just want to install and use the app as it is, please follow the non-technical User Guide. Use the most recent release .zip
to include latest features and bug-fixes.
We support lite
& full
version of Docker images. With full
, the extra packages of unstructured
will be installed as well, it can support additional file types (.doc, .docx, ...) but the cost is larger docker image size. For most users, the lite
image should work well in most cases.
lite
version.docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-lite\n
full
version.docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-full\n
Currently, two platforms: linux/amd64
and linux/arm64
(for newer Mac) are provided & tested. User can specify the platform by passing --platform
in the docker run command. For example:
# To run docker with platform linux/arm64\ndocker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\n--platform linux/arm64 \\\nghcr.io/cinnamon/kotaemon:main-lite\n
If everything is set up fine, navigate to http://localhost:7860/
to access the web UI.
We use GHCR to store docker images, all images can be found here.
"},{"location":"development/#without-docker","title":"Without Docker","text":"# optional (setup env)\nconda create -n kotaemon python=3.10\nconda activate kotaemon\n\n# clone this repo\ngit clone https://github.com/Cinnamon/kotaemon\ncd kotaemon\n\npip install -e \"libs/kotaemon[all]\"\npip install -e \"libs/ktem\"\n
.env
.libs/ktem/ktem/assets/prebuilt
python app.py\n
The app will be automatically launched in your browser.
Default username / password are: admin
/ admin
. You can setup additional users directly on the UI.
See Local model setup.
"},{"location":"development/#customize-your-application","title":"Customize your application","text":"By default, all application data are stored in ./ktem_app_data
folder. You can backup or copy this folder to move your installation to a new machine.
For advance users or specific use-cases, you can customize those files:
flowsettings.py
.env
flowsettings.py
","text":"This file contains the configuration of your application. You can use the example here as the starting point.
Notable settings# setup your preferred document store (with full-text search capabilities)\nKH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)\n\n# setup your preferred vectorstore (for vector-based search)\nKH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)\n\n# Enable / disable multimodal QA\nKH_REASONINGS_USE_MULTIMODAL=True\n\n# Setup your new reasoning pipeline or modify existing one.\nKH_REASONINGS = [\n \"ktem.reasoning.simple.FullQAPipeline\",\n \"ktem.reasoning.simple.FullDecomposeQAPipeline\",\n \"ktem.reasoning.react.ReactAgentPipeline\",\n \"ktem.reasoning.rewoo.RewooAgentPipeline\",\n]\n)\n
"},{"location":"development/#env","title":".env
","text":"This file provides another way to configure your models and credentials.
Configure model via the .env fileAlternatively, you can configure the models via the .env
file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
Currently, the following providers are supported:
"},{"location":"development/#openai","title":"OpenAI","text":"In the .env
file, set the OPENAI_API_KEY
variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified, please feel free to edit them to fit your case. Otherwise, the default parameter should work for most people.
OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
"},{"location":"development/#azure-openai","title":"Azure OpenAI","text":"For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. Your might also need to provide your developments' name for the chat model and the embedding model depending on how you set up Azure development.
AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002\n
"},{"location":"development/#local-models","title":"Local models","text":""},{"location":"development/#using-ollama-openai-compatible-server","title":"Using ollama OpenAI compatible server","text":"Install ollama and start the application.
Pull your model (e.g):
ollama pull llama3.1:8b\nollama pull nomic-embed-text\n
Set the model names on web UI and make it as default.
"},{"location":"development/#using-gguf-with-llama-cpp-python","title":"Using GGUF with llama-cpp-python","text":"You can search and download a LLM to be ran locally from the Hugging Face Hub. Currently, these model formats are supported:
You should choose a model whose size is less than your device's memory and should leave about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.
Here are some recommendations and their size in memory:
Add a new LlamaCpp model with the provided model name on the web uI.
"},{"location":"development/#adding-your-own-rag-pipeline","title":"Adding your own RAG pipeline","text":""},{"location":"development/#custom-reasoning-pipeline","title":"Custom reasoning pipeline","text":"First, check the default pipeline implementation in here. You can make quick adjustment to how the default QA pipeline work.
Next, if you feel comfortable adding new pipeline, add new .py
implementation in libs/ktem/ktem/reasoning/
and later include it in flowssettings
to enable it on the UI.
Check sample implementation in libs/ktem/ktem/index/file/graph
(more instruction WIP).
"},{"location":"development/#developer-guide","title":"Developer guide","text":"Please refer to the Developer Guide for more details.
"},{"location":"development/#star-history","title":"Star History","text":""},{"location":"development/contributing/","title":"Contributing","text":""},{"location":"development/contributing/#contributing","title":"Contributing","text":""},{"location":"development/contributing/#setting-up","title":"Setting up","text":"Clone the repo
git clone git@github.com:Cinnamon/kotaemon.git\ncd kotaemon\n
Install the environment
Create a conda environment (python >= 3.10 is recommended)
conda create -n kotaemon python=3.10\nconda activate kotaemon\n\n# install dependencies\ncd libs/kotaemon\npip install -e \".[all]\"\n
Or run the installer (one of the scripts/run_*
scripts depends on your OS), then you will have all the dependencies installed as a conda environment at install_dir/env
.
conda activate install_dir/env\n
Pre-commit
pre-commit install\n
Test
pytest tests\n
kotaemon
library focuses on the AI building blocks to implement a RAG-based QA application. It consists of base interfaces, core components and a list of utilities:
kotaemon
defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly define this interface, a pipeline of steps can be easily constructed and orchestrated.kotaemon
implements (or wraps 3rd-party libraries like Langchain, llama-index,... when possible) commonly used components in kotaemon use cases. Some of these components are: LLM, vector store, document store, retriever... For a detailed list and description of these components, please refer to the API Reference section.kotaemon
provides utilities and tools that are usually needed in client project. For example, it provides a prompt engineering UI for AI developers in a project to quickly create a prompt engineering tool for DMs and QALs. It also provides a command to quickly spin up a project code base. For a full list and description of these utilities, please refer to the Utilities section.mindmap\n root((kotaemon))\n Base Interfaces\n Document\n LLMInterface\n RetrievedDocument\n BaseEmbeddings\n BaseChat\n BaseCompletion\n ...\n Core Components\n LLMs\n AzureOpenAI\n OpenAI\n Embeddings\n AzureOpenAI\n OpenAI\n HuggingFaceEmbedding\n VectorStore\n InMemoryVectorstore\n ChromaVectorstore\n Agent\n Tool\n DocumentStore\n ...\n Utilities\n Scaffold project\n PromptUI\n Documentation Support
"},{"location":"development/contributing/#common-conventions","title":"Common conventions","text":"__init__.py
.setup.py
are not pinned, you need to pump the version in order to use a new environment. That environment will then be cached and used by your subsequence commits within the PR, until you pump the version again[ignore cache]
in your commit message. The CI will create a fresh environment to run your commit and then discard it.[ignore cache]
.__init__.py
and push a final commit not containing [ignore cache]
.A fundamental concept in kotaemon is \"component\".
Anything that isn't data or data structure is a \"component\". A component can be thought of as a step within a pipeline. It takes in some input, processes it, and returns an output, just the same as a Python function! The output will then become an input for the next component in a pipeline. In fact, a pipeline is just a component. More appropriately, a nested component: a component that makes use of one or more other components in the processing step. So in reality, there isn't a difference between a pipeline and a component! Because of that, in kotaemon, we will consider them the same as \"component\".
To define a component, you will:
kotaemon.base.BaseComponent
run
.The syntax of a component is as follow:
from kotaemon.base import BaseComponent\nfrom kotaemon.llms import LCAzureChatOpenAI\nfrom kotaemon.parsers import RegexExtractor\n\n\nclass FancyPipeline(BaseComponent):\n param1: str = \"This is param1\"\n param2: int = 10\n param3: float\n\n node1: BaseComponent # this is a node because of BaseComponent type annotation\n node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent\n node3: RegexExtractor # this is also a node bceause RegexExtractor subclasses BaseComponent\n\n def run(self, some_text: str):\n prompt = (self.param1 + some_text) * int(self.param2 + self.param3)\n llm_pred = self.node2(prompt).text\n matches = self.node3(llm_pred)\n return matches\n
Then this component can be used as follow:
llm = LCAzureChatOpenAI(endpoint=\"some-endpont\")\nextractor = RegexExtractor(pattern=[\"yes\", \"Yes\"])\n\ncomponent = FancyPipeline(\n param1=\"Hello\"\n param3=1.5\n node1=llm,\n node2=llm,\n node3=extractor\n)\ncomponent(\"goodbye\")\n
This way, we can define each operation as a reusable component, and use them to compose larger reusable components!
"},{"location":"development/create-a-component/#benefits-of-component","title":"Benefits of component","text":"By defining a component as above, we formally encapsulate all the necessary information inside a single class. This introduces several benefits:
The data & data structure components include:
Document
class.Layout-aware with table parsing PdfLoader
Output:
Document: text + metadata to identify whether it is table or not
- \"source\": source file name\n- \"type\": \"table\" or \"text\"\n- \"table_origin\": original table in markdown format (to be feed to LLM or visualize using external tools)\n- \"page_label\": page number in the original PDF document\n
Important: despite the name prompt engineering UI, this tool allows testers to test any kind of parameters that are exposed by developers. Prompt is one kind of param. There can be other type of params that testers can tweak (e.g. top_k, temperature...).
In the development process, developers typically build the pipeline. However, for use cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more effective. To facilitate this, kotaemon
offers a user-friendly prompt engineering UI that developers integrate into their pipelines. This enables non-technical members to adjust prompts and parameters, run experiments, and export results for optimization.
As of Sept 2023, there are 2 kinds of prompt engineering UI:
For simple pipeline, the supported client project workflow looks as follow:
$ kotaemon promptui export <module.path.piplineclass> --output <path/to/config/file.yml>
$ kotaemon promptui run <path/to/config/file.yml>
The prompt engineering UI prominently involves from step 2 to step 7 (step 1 is normally done by the developers, while step 7 happens exclusively in Excel file).
"},{"location":"development/utilities/#step-2-export-pipeline-to-config","title":"Step 2 - Export pipeline to config","text":"Command:
$ kotaemon promptui export <module.path.piplineclass> --output <path/to/config/file.yml>\n
where:
<module.path.pipelineclass>
is a dot-separated path to the pipeline. For example, if your pipeline can be accessed with from projectA.pipelines import AnsweringPipeline
, then this value is projectA.pipelines.AnswerPipeline
.<path/to/config/file.yml>
is the target file path that the config will be exported to. If the config file already exists, and contains information of other pipelines, the config of current pipeline will additionally be added. If it contains information of the current pipeline (in the past), the old information will be replaced.By default, all params in a pipeline (including nested params) will be export to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those param with ignore_ui=True
, and they will be ignored in the config generation process. Example:
class Pipeline(BaseComponent):\n param1: str = Param(default=\"hello\")\n param2: str = Param(default=\"goodbye\", ignore_ui=True)\n
Declared as above, and param1
will show up in the config YAML file, while param2
will not.
developers can further edit the config file in this step to get the most suitable UI (step 4) with their tasks. The exported config will have this overall schema:
<module.path.pipelineclass1>:\n params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.)\n inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.)\n outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.)\n logs: ... (Detail what information should show up in the log.)\n
"},{"location":"development/utilities/#input-and-params","title":"Input and params","text":"The inputs section have the overall schema as follow:
inputs:\n <input-variable-name-1>:\n component: <supported-UI-component>\n params: # this section is optional)\n value: <default-value>\n <input-variable-name-2>: ... # similar to above\nparams:\n <param-variable-name-1>: ... # similar to those in the inputs\n
The list of supported prompt UI and their corresponding gradio UI components:
COMPONENTS_CLASS = {\n \"text\": gr.components.Textbox,\n \"checkbox\": gr.components.CheckboxGroup,\n \"dropdown\": gr.components.Dropdown,\n \"file\": gr.components.File,\n \"image\": gr.components.Image,\n \"number\": gr.components.Number,\n \"radio\": gr.components.Radio,\n \"slider\": gr.components.Slider,\n}\n
"},{"location":"development/utilities/#outputs","title":"Outputs","text":"The outputs are a list of variables that we wish to show in the UI. Since in Python, the function output doesn't have variable name, so output declaration is a little bit different than input and param declaration:
outputs:\n - component: <supported-UI-component>\n step: <name-of-pipeline-step>\n item: <jsonpath way to retrieve the info>\n - ... # similar to above\n
where:
The logs show a list of sheetname and how to retrieve the desired information.
logs:\n <logname>:\n inputs:\n - name: <column name>\n step: <the pipeline step that we would wish to see the input>\n variable: <the variable in the step>\n - ...\n outputs:\n - name: <column name>\n step: <the pipeline step that we would wish to see the output>\n item: <how to retrieve the output of that step>\n
"},{"location":"development/utilities/#step-4-5-spin-up-prompt-engineering-ui-perform-prompt-engineering","title":"Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering","text":"Command:
$ kotaemon promptui run <path/to/config/file.yml>\n
This will generate an UI as follow:
where:
Upon clicking export, the users can download Excel file.
"},{"location":"development/utilities/#chat-pipeline","title":"Chat pipeline","text":"Chat pipeline workflow is different from simple pipeline workflow. In simple pipeline, each Run creates a set of output, input and params for users to compare. In chat pipeline, each Run is not a one-off run, but a long interactive session. Hence, the workflow is as follow:
@trducng
At high level, to add new indexing and reasoning pipeline:
BaseComponent
.flowsettings.py
.Then when python app.py
, the application will dynamically load those pipelines.
The below sections talk in more detail about how the pipelines should be constructed.
"},{"location":"pages/app/customize-flows/#define-a-pipeline-as-a-class","title":"Define a pipeline as a class","text":"In essence, a pipeline will subclass from kotaemon.base.BaseComponent
. Each pipeline has 2 main parts:
An example pipeline:
from kotaemon.base import BaseComponent\n\n\nclass SoSimple(BaseComponent):\n arg1: int\n arg2: str\n\n def run(self, arg3: str):\n return self.arg1 * self.arg2 + arg3\n
This pipeline is simple for demonstration purpose, but we can imagine pipelines with much more arguments, that can take other pipelines as arguments, and have more complicated logic in the run
method.
An indexing or reasoning pipeline is just a class subclass from BaseComponent
like above.
For more detail on this topic, please refer to Creating a Component
"},{"location":"pages/app/customize-flows/#run-signatures","title":"Run signatures","text":"Note: this section is tentative at the moment. We will finalize def run
function signature by latest early April.
The indexing pipeline:
def run(\n self,\n file_paths: str | Path | list[str | Path],\n reindex: bool = False,\n **kwargs,\n ):\n \"\"\"Index files to intermediate representation (e.g. vector, database...)\n\n Args:\n file_paths: the list of paths to files\n reindex: if True, files in `file_paths` that already exists in database\n should be reindex.\n \"\"\"\n
The reasoning pipeline:
def run(self, question: str, history: list, **kwargs) -> Document:\n \"\"\"Answer the question\n\n Args:\n question: the user input\n history: the chat history [(user_msg1, bot_msg1), (user_msg2, bot_msg2)...]\n\n Returns:\n kotaemon.base.Document: the final answer\n \"\"\"\n
"},{"location":"pages/app/customize-flows/#register-your-pipeline-to-ktem","title":"Register your pipeline to ktem","text":"To register your pipelines to ktem, you declare it in the flowsettings.py
file. This file locates at the current working directory where you start the ktem. In most use cases, it is this one.
KH_REASONING = [\"<python.module.path.to.the.reasoning.class>\"]\n\nKH_INDEX = \"<python.module.path.to.the.indexing.class>\"\n
You can register multiple reasoning pipelines to ktem by populating the KH_REASONING
list. The user can select which reasoning pipeline to use in their Settings page.
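For example, a flowsettings.py fragment registering two reasoning pipelines and one index might look like this (the module paths are hypothetical placeholders):
KH_REASONING = [
    "my_project.pipelines.SoSimple",    # hypothetical module path
    "my_project.pipelines.AdvancedQA",  # hypothetical module path
]
KH_INDEX = "my_project.pipelines.MyIndexingPipeline"  # hypothetical module path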
For now, there's only one supported index option for KH_INDEX
.
Make sure that your class is discoverable by Python.
"},{"location":"pages/app/customize-flows/#allow-users-to-customize-your-pipeline-in-the-app-settings","title":"Allow users to customize your pipeline in the app settings","text":"To allow the users to configure your pipeline, you need to declare what you allow the users to configure as a dictionary. ktem
will include them into the application settings.
In your pipeline class, add a classmethod get_user_settings that returns a settings dictionary, and a classmethod get_info that returns an info dictionary. Example:
class SoSimple(BaseComponent):\n\n ... # as above\n\n @classmethod\n def get_user_settings(cls) -> dict:\n \"\"\"The settings to the user\"\"\"\n return {\n \"setting_1\": {\n \"name\": \"Human-friendly name\",\n \"value\": \"Default value\",\n \"choices\": [(\"Human-friendly Choice 1\", \"choice1-id\"), (\"HFC 2\", \"choice2-id\")], # optional\n \"component\": \"Which Gradio UI component to render, can be: text, number, checkbox, dropdown, radio, checkboxgroup\"\n },\n \"setting_2\": {\n # follow the same rule as above\n }\n }\n\n @classmethod\n def get_info(cls) -> dict:\n \"\"\"Pipeline information for bookkeeping purpose\"\"\"\n return {\n \"id\": \"a unique id to differentiate this pipeline from other pipeline\",\n \"name\": \"Human-friendly name of the pipeline\",\n \"description\": \"Can be a short description of this pipeline\"\n }\n
Once you add these methods to your pipeline class, ktem will automatically extract them and add them to the settings.
Once ktem runs your pipeline, it will call your classmethod get_pipeline with the full user settings and expects to obtain the pipeline object. Within this get_pipeline method, you implement all the necessary logic to initialize the pipeline object. Example:
class SoSimple(BaseComponent):\n ... # as above\n\n @classmethod\n def get_pipeline(cls, setting):\n obj = cls(arg1=setting[\"reasoning.id.setting1\"])\n return obj\n
"},{"location":"pages/app/customize-flows/#reasoning-stream-output-to-ui","title":"Reasoning: Stream output to UI","text":"For fast user experience, you can stream the output directly to UI. This way, user can start observing the output as soon as the LLM model generates the 1st token, rather than having to wait the pipeline finishes to read the whole message.
To stream the output, you need to:
Turn the run function into an async function.
Stream each chunk of the output with self.report_output.
async def run(self, question: str, history: list, **kwargs) -> Document:\n for char in \"This is a long message\":\n self.report_output({\"output\": char})\n
The argument to self.report_output is a dictionary that contains either or both of these 2 keys: \"output\" and \"evidence\". The \"output\" string will be streamed to the chat message, and the \"evidence\" string will be streamed to the information panel.
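For instance, inside an async run method you might stream an answer chunk and its supporting evidence in a single call (a sketch; chunk and source_text are illustrative variables):
self.report_output({
    "output": chunk,          # appended to the chat message
    "evidence": source_text,  # appended to the information panel
})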
You can access users' collections of LLMs and embedding models with:
from ktem.embeddings.manager import embeddings\nfrom ktem.llms.manager import llms\n\n\nllm = llms.get_default()\nembedding_model = embeddings.get_default()\n
You can also allow the users to select specifically which LLMs or embedding models they want to use through the settings.
@classmethod\n def get_user_settings(cls) -> dict:\n from ktem.llms.manager import llms\n\n return {\n \"citation_llm\": {\n \"name\": \"LLM for citation\",\n \"value\": llms.get_default(),\n \"component\": \"dropdown\",\n \"choices\": list(llms.options().keys()),\n },\n ...\n }\n
"},{"location":"pages/app/customize-flows/#optional-access-application-data","title":"Optional: Access application data","text":"You can access the user's application database, vector store as follow:
# get the database that contains the source files\nfrom ktem.db.models import Source, Index, Conversation, User\n\n# get the vector store\n
"},{"location":"pages/app/features/","title":"Features","text":""},{"location":"pages/app/features/#chat","title":"Chat","text":"The kotaemon focuses on question and answering over a corpus of data. Below is the gentle introduction about the chat functionality.
(6 man-days)
Description: each client has a dedicated user group. Each user group has an admin user who can perform administrative tasks (e.g. creating user accounts in that user group...). The workflow for creating a new user group is as follows:
Expectation:
Condition:
(2 man-days)
Description: in the tenant management page, we can delete the selected user group. The user flow is as follows:
Expectation: when a user group is deleted, we expect to delete everything related to the user group: domain, files, databases, caches, deployments.
"},{"location":"pages/app/functional-description/#user-management","title":"User management","text":""},{"location":"pages/app/functional-description/#create-user-account-for-admin-user","title":"Create user account (for admin user)","text":"(1 man-day)
Description: the admin user in the client's account can create user accounts for that user group. To create a new user, the client admin does:
Expectation:
^ $ * . [ ] { } ( ) ? - \" ! @ # % & / \\ , > < ' : ; | _ ~ + =
Description: the admin user in the client's account can delete user accounts. Once a user account is deleted, he/she can no longer log in to the Aurora Platform.
Expectation:
Description: the admin user can change any information about the user account, including the password. To change user information:
Expectation:
(3 man-days)
Description: users can sign in to the Aurora Platform as follows:
(1 man-day)
Description: the user can sign out of the Aurora Platform as follows:
Expectation: the user is completely signed out. The next time he/she uses the Aurora Platform, he/she has to log in again.
"},{"location":"pages/app/functional-description/#change-password","title":"Change password","text":"Description: the user can change their password as follow:
Expectation:
Description: the Aurora Platform focuses on question answering over the uploaded data. Each chat has the following components:
The chat workflow looks as follows:
Expectation:
Description: users can jump between different conversations. They can see the list of all conversations, select an old conversation, and continue the chat in the context of the old conversation. The switching workflow is as follows:
Expectation:
Description: the user can explicitly start a new conversation with the chatbot:
Expectation:
Description: the user can rename the conversation by typing the name and clicking the Rename button next to it.
Condition:
Description: the user can delete an existing conversation as follows:
The file management allows users to upload, list, and delete files that they upload to the Aurora Platform.
"},{"location":"pages/app/functional-description/#upload-file","title":"Upload file","text":"Description: the user can upload files to the Aurora Platform. The uploaded files will be served as context for our chatbot to refer to when it converses with the user. To upload file, the user:
Options:
Condition:
Description: the user can see which files are on the system by:
Description: users can delete files from this UI to free up space or to remove outdated information. To remove files:
Expectation: once the file is deleted:
ktem provides user management as an extension. To enable user management, set the following variables in your flowsettings.py:
KH_FEATURE_USER_MANAGEMENT: True to enable.
KH_FEATURE_USER_MANAGEMENT_ADMIN: the admin username. This user will be created when the app first starts.
KH_FEATURE_USER_MANAGEMENT_PASSWORD: the admin password. This value accompanies the admin username.
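Put together, a minimal flowsettings.py fragment enabling user management might look like this (the credentials below are placeholders):
# flowsettings.py
KH_FEATURE_USER_MANAGEMENT = True
KH_FEATURE_USER_MANAGEMENT_ADMIN = "admin"         # placeholder username
KH_FEATURE_USER_MANAGEMENT_PASSWORD = "change-me"  # placeholder password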
The file index stores files in a local folder and index them for retrieval. This file index provides the following infrastructure to support the indexing:
The indexing and retrieval pipelines are encouraged to use the above software infrastructure.
"},{"location":"pages/app/index/file/#indexing-pipeline","title":"Indexing pipeline","text":"The ktem has default indexing pipeline: ktem.index.file.pipelines.IndexDocumentPipeline
.
This default pipeline works as follow:
You can customize this default pipeline if your indexing process is close to the default pipeline. You can create your own indexing pipeline if there are too much different logic.
"},{"location":"pages/app/index/file/#customize-the-default-pipeline","title":"Customize the default pipeline","text":"The default pipeline provides the contact points in flowsettings.py
.
FILE_INDEX_PIPELINE_FILE_EXTRACTORS
. Supply overriding file extractor, based on file extension. Example: {\".pdf\": \"path.to.PDFReader\", \".xlsx\": \"path.to.ExcelReader\"}
FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE
. The expected number of characters of each text segment. Example: 1024.FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP
. The expected number of characters that consecutive text segments should overlap with each other. Example: 256.Your indexing pipeline will subclass BaseFileIndexIndexing
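For example, overriding all three contact points in flowsettings.py could look like the following sketch (the reader paths mirror the placeholder examples above):
# flowsettings.py
FILE_INDEX_PIPELINE_FILE_EXTRACTORS = {
    ".pdf": "path.to.PDFReader",    # placeholder extractor paths
    ".xlsx": "path.to.ExcelReader",
}
FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE = 1024    # characters per text segment
FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP = 256  # overlap between consecutive segments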
.
You should define the following methods:
run(self, file_paths)
: run the indexing given the pipelineget_pipeline(cls, user_settings, index_settings)
: return the fully-initialized pipeline, ready to be used by ktem.user_settings
: is a dictionary contains user settings (e.g. {\"pdf_mode\": True, \"num_retrieval\": 5}
). You can declare these settings in the get_user_settings
classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your get_pipeline
method.index_settings
: is a dictionary. Currently it's empty for File Index.get_user_settings
: to declare user settings, return a dictionary.By subclassing BaseFileIndexIndexing
, You will have access to the following resources:
self._Source
: the source tableself._Index
: the index tableself._VS
: the vector storeself._DS
: the docstoreOnce you have prepared your pipeline, register it in flowsettings.py
: FILE_INDEX_PIPELINE = \"<python.path.to.your.pipeline>\"
.
The ktem has default retrieval pipeline: ktem.index.file.pipelines.DocumentRetrievalPipeline
. This pipeline works as follow:
Your retrieval pipeline will subclass BaseFileIndexRetriever
. The retriever has the same database, vectorstore and docstore accesses like the indexing pipeline.
You should define the following methods:
run(self, query, file_ids)
: retrieve relevant documents relating to the query. If file_ids
is given, you should restrict your search within these file_ids
.get_pipeline(cls, user_settings, index_settings, selected)
: return the fully-initialized pipeline, ready to be used by ktem.user_settings
: is a dictionary contains user settings (e.g. {\"pdf_mode\": True, \"num_retrieval\": 5}
). You can declare these settings in the get_user_settings
classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your get_pipeline
method.index_settings
: is a dictionary. Currently it's empty for File Index.selected
: a list of file ids selected by user. If user doesn't select anything, this variable will be None.get_user_settings
: to declare user settings, return a dictionary.Once you build the retrieval pipeline class, you can register it in flowsettings.py
: FILE_INDEXING_RETRIEVER_PIPELIENS = [\"path.to.retrieval.pipelie\"]
. Because there can be multiple parallel pipelines within an index, this variable takes a list of string rather than a string.
There are 3 kinds of settings in ktem
, geared towards different stakeholders for different use cases:
ktem
to your customers, or if you build extension for ktem
for developers. These settings are declared inside flowsettings.py
.ktem
allows developers to extend the index and the reasoning pipeline. In many cases, these components can have settings that should be modified by users at run-time, (e.g. topk
, chunksize
...). These are the user settings.
ktem
allows developers to declare such user settings in their code. Once declared, ktem
will render them in a Settings page.
There are 2 places that ktem
looks for declared user settings. You can refer to the respective pages.
A collection of settings is a dictionary of type dict[str, dict]
, where the key is a setting id, and the value is the description of the setting.
settings = {\n \"topk\": {\n \"name\": \"Top-k chunks\",\n \"value\": 10,\n \"component\": \"number\",\n },\n \"lang\": {\n \"name\": \"Languages\",\n \"value\": \"en\",\n \"component\": \"dropdown\",\n \"choices\": [(\"en\", \"English\"), (\"cn\", \"Chinese\")],\n }\n}\n
Each setting description must have:
component: the UI component to render such setting on the UI. Available:
export(export_path, output)\n
Export a pipeline to a config file
Source code inlibs/kotaemon/kotaemon/cli.py
@promptui.command()\n@click.argument(\"export_path\", nargs=1)\n@click.option(\"--output\", default=\"promptui.yml\", show_default=True, required=False)\ndef export(export_path, output):\n \"\"\"Export a pipeline to a config file\"\"\"\n import sys\n\n from theflow.utils.modules import import_dotted_string\n\n from kotaemon.contribs.promptui.config import export_pipeline_to_config\n\n sys.path.append(os.getcwd())\n cls = import_dotted_string(export_path, safe=False)\n export_pipeline_to_config(cls, output)\n check_config_format(output)\n
"},{"location":"reference/cli/#cli.run","title":"run","text":"run(run_path, share, username, password, appname, port)\n
Run the UI from a config file
Examples:
\n# Run with default config file\n$ kh promptui run\n\n\n# Run with username and password supplied\n$ kh promptui run --username admin --password password\n\n\n# Run with username and prompted password\n$ kh promptui run --username admin\n\n# Run and share to promptui\n# kh promptui run --username admin --password password --share --appname hey --port 7861\n
Source code in libs/kotaemon/kotaemon/cli.py
@promptui.command()\n@click.argument(\"run_path\", required=False, default=\"promptui.yml\")\n@click.option(\n \"--share\",\n is_flag=True,\n show_default=True,\n default=False,\n help=\"Share the app through Gradio. Requires --username to enable authentication.\",\n)\n@click.option(\n \"--username\",\n required=False,\n help=(\n \"Username for the user. If not provided, the promptui will not have \"\n \"authentication.\"\n ),\n)\n@click.option(\n \"--password\",\n required=False,\n help=\"Password for the user. If not provided, will be prompted.\",\n)\n@click.option(\n \"--appname\",\n required=False,\n help=\"The share app subdomain. Requires --share and --username\",\n)\n@click.option(\n \"--port\",\n required=False,\n help=\"Port to run the app. If not provided, will $GRADIO_SERVER_PORT (7860)\",\n)\ndef run(run_path, share, username, password, appname, port):\n \"\"\"Run the UI from a config file\n\n Examples:\n\n \\b\n # Run with default config file\n $ kh promptui run\n\n \\b\n # Run with username and password supplied\n $ kh promptui run --username admin --password password\n\n \\b\n # Run with username and prompted password\n $ kh promptui run --username admin\n\n # Run and share to promptui\n # kh promptui run --username admin --password password --share --appname hey \\\n --port 7861\n \"\"\"\n import sys\n\n from kotaemon.contribs.promptui.ui import build_from_dict\n\n sys.path.append(os.getcwd())\n\n check_config_format(run_path)\n demo = build_from_dict(run_path)\n\n params: dict = {}\n if username is not None:\n if password is not None:\n auth = (username, password)\n else:\n auth = (username, click.prompt(\"Password\", hide_input=True))\n params[\"auth\"] = auth\n\n port = int(port) if port else int(os.getenv(\"GRADIO_SERVER_PORT\", \"7860\"))\n params[\"server_port\"] = port\n\n if share:\n if username is None:\n raise ValueError(\n \"Username must be provided to enable authentication for sharing\"\n )\n if appname:\n from kotaemon.contribs.promptui.tunnel import Tunnel\n\n tunnel = Tunnel(\n appname=str(appname), username=str(username), local_port=port\n )\n url = tunnel.run()\n print(f\"App is shared at {url}\")\n else:\n params[\"share\"] = True\n print(\"App is shared at Gradio\")\n\n demo.launch(**params)\n
"},{"location":"reference/cli/#cli.makedoc","title":"makedoc","text":"makedoc(module, output, separation_level)\n
Make documentation for module module
Example:
\n# Make component documentation for kotaemon library\n$ kh makedoc kotaemon\n
Source code in libs/kotaemon/kotaemon/cli.py
@main.command()\n@click.argument(\"module\", required=True)\n@click.option(\n \"--output\", default=\"docs.md\", required=False, help=\"The output markdown file\"\n)\n@click.option(\n \"--separation-level\", required=False, default=1, help=\"Organize markdown layout\"\n)\ndef makedoc(module, output, separation_level):\n \"\"\"Make documentation for module `module`\n\n Example:\n\n \\b\n # Make component documentation for kotaemon library\n $ kh makedoc kotaemon\n \"\"\"\n from kotaemon.contribs.docs import make_doc\n\n make_doc(module, output, separation_level)\n print(f\"Documentation exported to {output}\")\n
"},{"location":"reference/cli/#cli.start_project","title":"start_project","text":"start_project(template)\n
Start a project from a template.
Important: the value for --template corresponds to the name of the template folder, which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates The default value is \"project-default\", which should work when you are starting a client project.
Source code inlibs/kotaemon/kotaemon/cli.py
@main.command()\n@click.option(\n \"--template\",\n default=\"project-default\",\n required=False,\n help=\"Template name\",\n show_default=True,\n)\ndef start_project(template):\n \"\"\"Start a project from a template.\n\n Important: the value for --template corresponds to the name of the template folder,\n which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates\n The default value is \"project-default\", which should work when you are starting a\n client project.\n \"\"\"\n\n print(\"Retrieving template...\")\n os.system(\n \"cookiecutter git@github.com:Cinnamon/kotaemon.git \"\n f\"--directory='templates/{template}'\"\n )\n
"},{"location":"reference/agents/","title":"Agents","text":""},{"location":"reference/agents/#agents.BaseAgent","title":"BaseAgent","text":" Bases: BaseComponent
Define base agent interface
Source code inlibs/kotaemon/kotaemon/agents/base.py
class BaseAgent(BaseComponent):\n \"\"\"Define base agent interface\"\"\"\n\n name: str = Param(help=\"Name of the agent.\")\n agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n description: str = Param(\n help=(\n \"Description used to tell the model how/when/why to use the agent. You can\"\n \" provide few-shot examples as a part of the description. This will be\"\n \" input to the prompt of LLM.\"\n )\n )\n llm: Optional[BaseLLM] = Node(\n help=(\n \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n \" interface.\"\n )\n )\n prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n help=\"A prompt template or a dict to supply different prompt to the agent\"\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [],\n help=\"List of plugins / tools to be used in the agent\",\n )\n\n @staticmethod\n def safeguard_run(run_func, *args, **kwargs):\n def wrapper(self, *args, **kwargs):\n try:\n return run_func(self, *args, **kwargs)\n except Exception as e:\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"failed\",\n error=str(e),\n )\n\n return wrapper\n\n def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n\n def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/#agents.BaseAgent.add_tools","title":"add_tools","text":"add_tools(tools)\n
Helper method to add tools and update agent state if needed
Source code inlibs/kotaemon/kotaemon/agents/base.py
def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n
"},{"location":"reference/agents/#agents.BaseAgent.run","title":"run","text":"run(*args, **kwargs)\n
Run the component.
Source code inlibs/kotaemon/kotaemon/agents/base.py
def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/#agents.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:
Name Type Description Defaultreturn_values
The return values of the agent.
requiredlog
The log message.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
"},{"location":"reference/agents/#agents.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:
Name Type Description Defaulttext
The text output from the agent.
requiredagent_type
The type of agent.
requiredstatus
The status after executing the agent.
requirederror
The error message if any.
required Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
"},{"location":"reference/agents/#agents.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/#agents.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/#agents.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/#agents.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/#agents.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/#agents.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:
Name Type Description Defaultitem
Any The item to log.
requiredtitle
str, optional The title of the panel, defaults to \"Output\".
'Output'
stream
bool, optional
False
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/#agents.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/#agents.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/#agents.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code inlibs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/#agents.LangchainAgent","title":"LangchainAgent","text":" Bases: BaseAgent
Wrapper for Langchain Agent
Source code inlibs/kotaemon/kotaemon/agents/langchain_based.py
class LangchainAgent(BaseAgent):\n \"\"\"Wrapper for Langchain Agent\"\"\"\n\n name: str = \"LangchainAgent\"\n agent_type: AgentType\n description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n AGENT_TYPE_MAP = {\n AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n }\n agent: Optional[LCAgentExecutor] = None\n\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n\n if self.agent_type not in self.AGENT_TYPE_MAP:\n raise NotImplementedError(\n f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n )\n self.update_agent_tools()\n\n def update_agent_tools(self):\n assert isinstance(self.llm, (ChatLLM, LLM))\n langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n # a fix for search_doc tool name:\n # use \"Intermediate Answer\" for self-ask agent\n found_search_tool = False\n if self.agent_type == AgentType.self_ask:\n for plugin in langchain_plugins:\n if plugin.name == \"search_doc\":\n plugin.name = \"Intermediate Answer\"\n langchain_plugins = [plugin]\n found_search_tool = True\n break\n\n if self.agent_type != AgentType.self_ask or found_search_tool:\n # reinit Langchain AgentExecutor\n self.agent = initialize_agent(\n langchain_plugins,\n self.llm.to_langchain_format(),\n agent=self.AGENT_TYPE_MAP[self.agent_type],\n handle_parsing_errors=True,\n verbose=True,\n )\n\n def add_tools(self, tools: List[BaseTool]) -> None:\n super().add_tools(tools)\n self.update_agent_tools()\n return\n\n def run(self, instruction: str) -> AgentOutput:\n assert (\n self.agent is not None\n ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n # Langchain AgentExecutor call\n output = self.agent(instruction)[\"output\"]\n\n return AgentOutput(\n text=output,\n agent_type=self.agent_type,\n status=\"finished\",\n )\n
"},{"location":"reference/agents/#agents.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class inherited from BaseAgent. Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n 
tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. 
Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/#agents.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/#agents.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
Name Type Description Defaultinstruction
Instruction to run the agent with.
requiredmax_iterations
Maximum number of iterations of reasoning steps, defaults to 10.
None
Return AgentOutput object.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/#agents.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Stream the agent with the given instruction.
Parameters:
Name Type Description Defaultinstruction
Instruction to run the agent with.
requiredmax_iterations
Maximum number of iterations of reasoning steps, defaults to 10.
None
Return AgentOutput object.
Source code inlibs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
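Based on the fields documented above, wiring up a ReactAgent might look like the following sketch (llm and search_tool stand in for whatever BaseLLM and BaseTool implementations you have configured elsewhere):
from kotaemon.agents import ReactAgent

# llm: any kotaemon BaseLLM instance; search_tool: any BaseTool instance.
# Both are assumed to be configured elsewhere in your code.
agent = ReactAgent(
    llm=llm,
    plugins=[search_tool],
    max_iterations=5,
)
output = agent.run("What is the population of the largest EU country?")
print(output.text)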
"},{"location":"reference/agents/#agents.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class inherited from BaseAgent. Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf
Source code inlibs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
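The run/stream flow above hinges on _parse_planner_evidences: the planner's #E lines form a dependency graph, and evidences that do not reference each other are grouped into levels that _get_worker_evidence executes in parallel. Below is a minimal, standalone sketch of that leveling step; the planner text is invented for illustration, and only the standard library is used.

import re

# Hypothetical planner output; real output comes from planner_llm.
planner_text = """#Plan1: Find the founding year of company X
#E1: search[founding year of company X]
#Plan2: Find the founding year of company Y
#E2: search[founding year of company Y]
#Plan3: Compare the two years
#E3: llm[Compare #E1 and #E2]"""

evidences: dict[str, str] = {}
dependence: dict[str, list[str]] = {}
for line in planner_text.splitlines():
    if line.startswith("#E") and line[2].isdigit():
        e, tool_call = (part.strip() for part in line.split(":", 1))
        # An evidence depends on every earlier #E it mentions in its tool call.
        dependence[e] = [v for v in re.findall(r"#E\d+", tool_call) if v in evidences]
        evidences[e] = tool_call

# Repeatedly peel off the evidences whose dependencies are all resolved.
levels: list[list[str]] = []
while dependence:
    ready = [e for e, deps in dependence.items() if not deps]
    if not ready:
        raise ValueError("Circular dependency detected.")
    levels.append(ready)
    for e in ready:
        dependence.pop(e)
    for deps in dependence.values():
        for e in ready:
            if e in deps:
                deps.remove(e)

print(levels)  # [['#E1', '#E2'], ['#E3']] -> #E1 and #E2 can run in parallel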
"},{"location":"reference/agents/#agents.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/#agents.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Stream the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
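Putting the pieces together, a minimal usage sketch: planner_llm and solver_llm may be the same model, and the plugin list is any set of BaseTools. The llm and search_tool names are placeholders, and the import path is assumed from the reference locations on this page.

from kotaemon.agents import RewooAgent

agent = RewooAgent(
    planner_llm=llm,  # plans the #Plan/#E steps
    solver_llm=llm,   # composes the final answer from the evidence log
    plugins=[search_tool],
)
result = agent.run("Compare the founding years of company X and company Y.")
print(result.status, result.text)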
"},{"location":"reference/agents/#agents.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/#agents.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/#agents.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as a part of the description. This will be input to the prompt of LLM.
"},{"location":"reference/agents/#agents.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/#agents.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/#agents.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/#agents.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/#agents.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
"},{"location":"reference/agents/#agents.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code in libs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
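To add a new tool, subclass BaseTool and implement _run_tool; run() then handles input parsing and ToolException handling for you. A minimal sketch (EchoTool is hypothetical, and the import path is assumed from the reference locations on this page):

from kotaemon.agents import BaseTool


class EchoTool(BaseTool):
    name: str = "echo"
    description: str = "Returns its input unchanged. Input should be any text."

    def _run_tool(self, query: str) -> str:
        # A real tool would call an API or a search index here.
        return query


tool = EchoTool()
print(tool.run("hello"))  # -> hello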
"},{"location":"reference/agents/#agents.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around other BaseComponent to use it as a tool
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| component | | BaseComponent-based component to wrap | required |
| postprocessor | | Optional postprocessor for the component output | required |

Source code in libs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
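A sketch of wrapping an existing pipeline as a tool. Here retrieval_pipeline stands in for any BaseComponent you already have (the name is illustrative), and the postprocessor flattens its document output into plain text:

from kotaemon.agents import ComponentTool

search_tool = ComponentTool(
    name="search_doc",
    description="Search the indexed documents. Input should be a query string.",
    component=retrieval_pipeline,
    postprocessor=lambda docs: "\n".join(doc.text for doc in docs),
)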
"},{"location":"reference/agents/#agents.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
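Usage is a one-liner; the tool lazily creates its Wiki doc store on the first call (import path assumed from the reference locations on this page):

from kotaemon.agents import WikipediaTool

wiki_tool = WikipediaTool()
evidence = wiki_tool.run("Alan Turing")  # page content for the best match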
"},{"location":"reference/agents/base/","title":"Base","text":""},{"location":"reference/agents/base/#agents.base.BaseAgent","title":"BaseAgent","text":" Bases: BaseComponent
Define base agent interface
Source code in libs/kotaemon/kotaemon/agents/base.py
class BaseAgent(BaseComponent):\n \"\"\"Define base agent interface\"\"\"\n\n name: str = Param(help=\"Name of the agent.\")\n agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n description: str = Param(\n help=(\n \"Description used to tell the model how/when/why to use the agent. You can\"\n \" provide few-shot examples as a part of the description. This will be\"\n \" input to the prompt of LLM.\"\n )\n )\n llm: Optional[BaseLLM] = Node(\n help=(\n \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n \" interface.\"\n )\n )\n prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n help=\"A prompt template or a dict to supply different prompt to the agent\"\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [],\n help=\"List of plugins / tools to be used in the agent\",\n )\n\n @staticmethod\n def safeguard_run(run_func, *args, **kwargs):\n def wrapper(self, *args, **kwargs):\n try:\n return run_func(self, *args, **kwargs)\n except Exception as e:\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"failed\",\n error=str(e),\n )\n\n return wrapper\n\n def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n\n def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/base/#agents.base.BaseAgent.add_tools","title":"add_tools","text":"add_tools(tools)\n
Helper method to add tools and update agent state if needed
Source code in libs/kotaemon/kotaemon/agents/base.py
def add_tools(self, tools: list[BaseTool]) -> None:\n \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n self.plugins.extend(tools)\n
"},{"location":"reference/agents/base/#agents.base.BaseAgent.run","title":"run","text":"run(*args, **kwargs)\n
Run the component.
Source code in libs/kotaemon/kotaemon/agents/base.py
def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n \"\"\"Run the component.\"\"\"\n raise NotImplementedError()\n
"},{"location":"reference/agents/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/agents/langchain_based/#agents.langchain_based.LangchainAgent","title":"LangchainAgent","text":" Bases: BaseAgent
Wrapper for Langchain Agent
Source code in libs/kotaemon/kotaemon/agents/langchain_based.py
class LangchainAgent(BaseAgent):\n \"\"\"Wrapper for Langchain Agent\"\"\"\n\n name: str = \"LangchainAgent\"\n agent_type: AgentType\n description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n AGENT_TYPE_MAP = {\n AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n }\n agent: Optional[LCAgentExecutor] = None\n\n def __init__(self, *args, **kwargs):\n super().__init__(*args, **kwargs)\n\n if self.agent_type not in self.AGENT_TYPE_MAP:\n raise NotImplementedError(\n f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n )\n self.update_agent_tools()\n\n def update_agent_tools(self):\n assert isinstance(self.llm, (ChatLLM, LLM))\n langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n # a fix for search_doc tool name:\n # use \"Intermediate Answer\" for self-ask agent\n found_search_tool = False\n if self.agent_type == AgentType.self_ask:\n for plugin in langchain_plugins:\n if plugin.name == \"search_doc\":\n plugin.name = \"Intermediate Answer\"\n langchain_plugins = [plugin]\n found_search_tool = True\n break\n\n if self.agent_type != AgentType.self_ask or found_search_tool:\n # reinit Langchain AgentExecutor\n self.agent = initialize_agent(\n langchain_plugins,\n self.llm.to_langchain_format(),\n agent=self.AGENT_TYPE_MAP[self.agent_type],\n handle_parsing_errors=True,\n verbose=True,\n )\n\n def add_tools(self, tools: List[BaseTool]) -> None:\n super().add_tools(tools)\n self.update_agent_tools()\n return\n\n def run(self, instruction: str) -> AgentOutput:\n assert (\n self.agent is not None\n ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n # Langchain AgentExecutor call\n output = self.agent(instruction)[\"output\"]\n\n return AgentOutput(\n text=output,\n agent_type=self.agent_type,\n status=\"finished\",\n )\n
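A usage sketch, reusing wiki_tool from above and assuming llm is a kotaemon ChatLLM/LLM; agent_type must be one of the four mapped types, otherwise __init__ raises NotImplementedError:

from kotaemon.agents import AgentType, LangchainAgent

agent = LangchainAgent(
    llm=llm,
    agent_type=AgentType.react,  # also: openai, openai_multi, self_ask
    plugins=[wiki_tool],
)
result = agent.run("Where was Alan Turing born?")
print(result.text)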
"},{"location":"reference/agents/utils/","title":"Utils","text":""},{"location":"reference/agents/utils/#agents.utils.get_plugin_response_content","title":"get_plugin_response_content","text":"get_plugin_response_content(output)\n
Wrapper for AgentOutput content return
Source code in libs/kotaemon/kotaemon/agents/utils.py
def get_plugin_response_content(output) -> str:\n \"\"\"\n Wrapper for AgentOutput content return\n \"\"\"\n if isinstance(output, Document):\n return output.text\n else:\n return str(output)\n
"},{"location":"reference/agents/utils/#agents.utils.calculate_cost","title":"calculate_cost","text":"calculate_cost(model_name, prompt_token, completion_token)\n
Calculate the cost of a prompt and completion.
Returns:

| Name | Type | Description |
| --- | --- | --- |
| float | float | Cost of the provided model name with provided token information |

Source code in libs/kotaemon/kotaemon/agents/utils.py
def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float:\n \"\"\"\n Calculate the cost of a prompt and completion.\n\n Returns:\n float: Cost of the provided model name with provided token information\n \"\"\"\n # TODO: to be implemented\n return 0.0\n
"},{"location":"reference/agents/io/","title":"Io","text":""},{"location":"reference/agents/io/#agents.io.AgentAction","title":"AgentAction dataclass
","text":"Agent's action to take.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tool | str | The tool to invoke. | required |
| tool_input | Union[str, dict] | The input to the tool. | required |
| log | str | The log message. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
@dataclass\nclass AgentAction:\n \"\"\"Agent's action to take.\n\n Args:\n tool: The tool to invoke.\n tool_input: The input to the tool.\n log: The log message.\n \"\"\"\n\n tool: str\n tool_input: Union[str, dict]\n log: str\n
"},{"location":"reference/agents/io/#agents.io.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| return_values | | The return values of the agent. | required |
| log | | The log message. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
"},{"location":"reference/agents/io/#agents.io.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | | The text output from the agent. | required |
| agent_type | | The type of agent. | required |
| status | | The status after executing the agent. | required |
| error | | The error message if any. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
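When consuming results, these fields are the contract between agents and callers. A small sketch of constructing and checking one (assuming the remaining LLMInterface fields all carry defaults):

output = AgentOutput(
    text="The answer is 42.",
    agent_type=AgentType.rewoo,
    status="finished",
)
if output.status == "failed":
    print(output.error)
else:
    print(output.text)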
"},{"location":"reference/agents/io/#agents.io.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| item | Any | The item to log. | required |
| title | str, optional | The title of the panel, defaults to "Output". | 'Output' |
| stream | bool, optional | | False |
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/io/#agents.io.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/","title":"Base","text":""},{"location":"reference/agents/io/base/#agents.io.base.AgentType","title":"AgentType","text":" Bases: Enum
Enumerated type for agent types.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentType(Enum):\n \"\"\"\n Enumerated type for agent types.\n \"\"\"\n\n openai = \"openai\"\n openai_multi = \"openai_multi\"\n openai_tool = \"openai_tool\"\n self_ask = \"self_ask\"\n react = \"react\"\n rewoo = \"rewoo\"\n vanilla = \"vanilla\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad","title":"BaseScratchPad","text":"Base class for output handlers.
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--attributes","title":"Attributes:","text":"logger : logging.Logger The logger object to log messages.
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--methods","title":"Methods:","text":"stop(): Stop the output.
update_status(output: str, **kwargs): Update the status of the output.
thinking(name: str): Log that a process is thinking.
done(_all=False): Log that the process is done.
stream_print(item: str): Not implemented.
json_print(item: Dict[str, Any]): Log a JSON object.
panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.
clear(): Not implemented.
print(content: str, **kwargs): Log arbitrary content.
format_json(json_obj: str): Format a JSON object.
debug(content: str, **kwargs): Log a debug message.
info(content: str, **kwargs): Log an informational message.
warning(content: str, **kwargs): Log a warning message.
error(content: str, **kwargs): Log an error message.
critical(content: str, **kwargs): Log a critical message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
class BaseScratchPad:\n \"\"\"\n Base class for output handlers.\n\n Attributes:\n -----------\n logger : logging.Logger\n The logger object to log messages.\n\n Methods:\n --------\n stop():\n Stop the output.\n\n update_status(output: str, **kwargs):\n Update the status of the output.\n\n thinking(name: str):\n Log that a process is thinking.\n\n done(_all=False):\n Log that the process is done.\n\n stream_print(item: str):\n Not implemented.\n\n json_print(item: Dict[str, Any]):\n Log a JSON object.\n\n panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n Log a panel output.\n\n clear():\n Not implemented.\n\n print(content: str, **kwargs):\n Log arbitrary content.\n\n format_json(json_obj: str):\n Format a JSON object.\n\n debug(content: str, **kwargs):\n Log a debug message.\n\n info(content: str, **kwargs):\n Log an informational message.\n\n warning(content: str, **kwargs):\n Log a warning message.\n\n error(content: str, **kwargs):\n Log an error message.\n\n critical(content: str, **kwargs):\n Log a critical message.\n \"\"\"\n\n def __init__(self):\n \"\"\"\n Initialize the BaseOutput object.\n\n \"\"\"\n self.logger = logging\n self.log = []\n\n def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n\n def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n\n def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n\n def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n\n def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n\n def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n\n def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n\n def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n\n def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n\n def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n\n def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n\n def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n\n def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n\n def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n\n def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stop","title":"stop","text":"stop()\n
Stop the output.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def stop(self):\n \"\"\"\n Stop the output.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.update_status","title":"update_status","text":"update_status(output, **kwargs)\n
Update the status of the output.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def update_status(self, output: str, **kwargs):\n \"\"\"\n Update the status of the output.\n \"\"\"\n if check_log():\n self.logger.info(output)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.thinking","title":"thinking","text":"thinking(name)\n
Log that a process is thinking.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def thinking(self, name: str):\n \"\"\"\n Log that a process is thinking.\n \"\"\"\n if check_log():\n self.logger.info(f\"{name} is thinking...\")\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.done","title":"done","text":"done(_all=False)\n
Log that the process is done.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def done(self, _all=False):\n \"\"\"\n Log that the process is done.\n \"\"\"\n\n if check_log():\n self.logger.info(\"Done\")\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stream_print","title":"stream_print","text":"stream_print(item)\n
Stream print.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def stream_print(self, item: str):\n \"\"\"\n Stream print.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.json_print","title":"json_print","text":"json_print(item)\n
Log a JSON object.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def json_print(self, item: Dict[str, Any]):\n \"\"\"\n Log a JSON object.\n \"\"\"\n if check_log():\n self.logger.info(json.dumps(item, indent=2))\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.panel_print","title":"panel_print","text":"panel_print(item, title='Output', stream=False)\n
Log a panel output.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| item | Any | The item to log. | required |
| title | str, optional | The title of the panel, defaults to "Output". | 'Output' |
| stream | bool, optional | | False |
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n \"\"\"\n Log a panel output.\n\n Args:\n item : Any\n The item to log.\n title : str, optional\n The title of the panel, defaults to \"Output\".\n stream : bool, optional\n \"\"\"\n if not stream:\n self.log.append(item)\n if check_log():\n self.logger.info(\"-\" * 20)\n self.logger.info(item)\n self.logger.info(\"-\" * 20)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.clear","title":"clear","text":"clear()\n
Not implemented.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def clear(self):\n \"\"\"\n Not implemented.\n \"\"\"\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.print","title":"print","text":"print(content, **kwargs)\n
Log arbitrary content.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def print(self, content: str, **kwargs):\n \"\"\"\n Log arbitrary content.\n \"\"\"\n self.log.append(content)\n if check_log():\n self.logger.info(content)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.format_json","title":"format_json","text":"format_json(json_obj)\n
Format a JSON object.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def format_json(self, json_obj: str):\n \"\"\"\n Format a JSON object.\n \"\"\"\n formatted_json = json.dumps(json_obj, indent=2)\n return formatted_json\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.debug","title":"debug","text":"debug(content, **kwargs)\n
Log a debug message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def debug(self, content: str, **kwargs):\n \"\"\"\n Log a debug message.\n \"\"\"\n if check_log():\n self.logger.debug(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.info","title":"info","text":"info(content, **kwargs)\n
Log an informational message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def info(self, content: str, **kwargs):\n \"\"\"\n Log an informational message.\n \"\"\"\n if check_log():\n self.logger.info(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.warning","title":"warning","text":"warning(content, **kwargs)\n
Log a warning message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def warning(self, content: str, **kwargs):\n \"\"\"\n Log a warning message.\n \"\"\"\n if check_log():\n self.logger.warning(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.error","title":"error","text":"error(content, **kwargs)\n
Log an error message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def error(self, content: str, **kwargs):\n \"\"\"\n Log an error message.\n \"\"\"\n if check_log():\n self.logger.error(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.critical","title":"critical","text":"critical(content, **kwargs)\n
Log a critical message.
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def critical(self, content: str, **kwargs):\n \"\"\"\n Log a critical message.\n \"\"\"\n if check_log():\n self.logger.critical(content, **kwargs)\n
"},{"location":"reference/agents/io/base/#agents.io.base.AgentAction","title":"AgentAction dataclass
","text":"Agent's action to take.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tool | str | The tool to invoke. | required |
| tool_input | Union[str, dict] | The input to the tool. | required |
| log | str | The log message. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
@dataclass\nclass AgentAction:\n \"\"\"Agent's action to take.\n\n Args:\n tool: The tool to invoke.\n tool_input: The input to the tool.\n log: The log message.\n \"\"\"\n\n tool: str\n tool_input: Union[str, dict]\n log: str\n
"},{"location":"reference/agents/io/base/#agents.io.base.AgentFinish","title":"AgentFinish","text":" Bases: NamedTuple
Agent's return value when finishing execution.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| return_values | | The return values of the agent. | required |
| log | | The log message. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentFinish(NamedTuple):\n \"\"\"Agent's return value when finishing execution.\n\n Args:\n return_values: The return values of the agent.\n log: The log message.\n \"\"\"\n\n return_values: dict\n log: str\n
"},{"location":"reference/agents/io/base/#agents.io.base.AgentOutput","title":"AgentOutput","text":" Bases: LLMInterface
Output from an agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | | The text output from the agent. | required |
| agent_type | | The type of agent. | required |
| status | | The status after executing the agent. | required |
| error | | The error message if any. | required |

Source code in libs/kotaemon/kotaemon/agents/io/base.py
class AgentOutput(LLMInterface):\n \"\"\"Output from an agent.\n\n Args:\n text: The text output from the agent.\n agent_type: The type of agent.\n status: The status after executing the agent.\n error: The error message if any.\n \"\"\"\n\n model_config = ConfigDict(extra=\"allow\")\n\n text: str\n type: str = \"agent\"\n agent_type: AgentType\n status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n error: Optional[str] = None\n intermediate_steps: Optional[list] = None\n
"},{"location":"reference/agents/io/base/#agents.io.base.check_log","title":"check_log","text":"check_log()\n
Checks if logging has been enabled. Returns True if logging has been enabled, False otherwise (bool).
Source code in libs/kotaemon/kotaemon/agents/io/base.py
def check_log():\n \"\"\"\n Checks if logging has been enabled.\n :return: True if logging has been enabled, False otherwise.\n :rtype: bool\n \"\"\"\n return os.environ.get(\"LOG_PATH\", None) is not None\n
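All BaseScratchPad logging is gated on this flag, so enabling it only requires setting the LOG_PATH environment variable before the agent runs:

import os

os.environ["LOG_PATH"] = "logs/agent.log"  # the variable merely has to be set
assert check_log() is True

del os.environ["LOG_PATH"]
assert check_log() is False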
"},{"location":"reference/agents/react/","title":"React","text":""},{"location":"reference/agents/react/#agents.react.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class inherited from BaseAgent. Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n 
tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. 
Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
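The loop above is driven purely by text: on each iteration the LLM must emit either an Action / Action Input pair or a Final Answer line, which _parse_output turns into an AgentAction or AgentFinish. A sketch of the two expected shapes and of running the agent (llm and wiki_tool as in the earlier sketches; the llm must accept the stop keyword, since the loop passes stop=["Observation:"]):

# Shape 1, an intermediate step, parsed into an AgentAction:
#   Thought: I should look this up.
#   Action: wikipedia
#   Action Input: Alan Turing
#
# Shape 2, a terminating step, parsed into an AgentFinish:
#   Thought: I now know the answer.
#   Final Answer: Alan Turing was born in Maida Vale, London.

from kotaemon.agents import ReactAgent

agent = ReactAgent(llm=llm, plugins=[wiki_tool], max_iterations=5)
result = agent.run("Where was Alan Turing born?")
print(result.status, result.text)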
"},{"location":"reference/agents/react/#agents.react.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/react/#agents.react.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
- instruction (required): Instruction to run the agent with.
- max_iterations (default: None): Maximum number of reasoning iterations; when None, falls back to the agent's max_iterations.
Returns an AgentOutput object.
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
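A minimal usage sketch of run (assuming ReactAgent is importable from kotaemon.agents; llm and search_tool below are placeholders for configured BaseLLM and BaseTool instances, not library objects):
from kotaemon.agents import ReactAgent\n\n# llm / search_tool: stand-ins for real BaseLLM and BaseTool instances\nagent = ReactAgent(llm=llm, plugins=[search_tool], max_iterations=5)\noutput = agent.run(\"In which year was the Eiffel Tower completed?\")\nprint(output.status)  # \"finished\" if a Final Answer was parsed, else \"stopped\"\nprint(output.text)    # the last LLM response\nfor action, observation in output.intermediate_steps:\n    print(action.log, observation)  # one (reasoning step, tool output) pair per iteration\n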
"},{"location":"reference/agents/react/#agents.react.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Run the agent with the given instruction, streaming intermediate steps.
Parameters:
- instruction (required): Instruction to run the agent with.
- max_iterations (default: None): Maximum number of reasoning iterations; when None, falls back to the agent's max_iterations.
Yields AgentOutput objects: status "thinking" while reasoning, then "finished" or "stopped".
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
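Since stream is a generator, callers iterate over the yielded AgentOutput objects; a sketch, reusing the hypothetical agent from the run example above:
for step in agent.stream(\"In which year was the Eiffel Tower completed?\"):\n    if step.status == \"thinking\":\n        print(\"step:\", step.intermediate_steps)  # latest (action, observation) pair\n    else:\n        print(step.status, step.text)  # \"finished\" with the final answer, or \"stopped\"\n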
"},{"location":"reference/agents/react/agent/","title":"Agent","text":""},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent","title":"ReactAgent","text":" Bases: BaseAgent
Sequential ReactAgent class, inherited from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
class ReactAgent(BaseAgent):\n \"\"\"\n Sequential ReactAgent class inherited from BaseAgent.\n Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n \"\"\"\n\n name: str = \"ReactAgent\"\n agent_type: AgentType = AgentType.react\n description: str = \"ReactAgent for answering multi-step reasoning questions\"\n llm: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n output_lang: str = \"English\"\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n )\n intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n default_callback=lambda _: [],\n help=\"List of AgentAction and observation (tool) output\",\n )\n max_iterations: int = 5\n strict_decode: bool = False\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n def _compose_plugin_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for plugin in self.plugins:\n prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _construct_scratchpad(\n self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n ) -> str:\n \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n thoughts = \"\"\n for action, observation in intermediate_steps:\n thoughts += action.log\n thoughts += f\"\\nObservation: {observation}\\nThought:\"\n return thoughts\n\n def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n \"\"\"\n Parse text output from LLM for the next Action or Final Answer\n Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n Using FINAL_ANSWER_ACTION to parse Final Answer\n\n Args:\n text[str]: input text to parse\n \"\"\"\n includes_answer = FINAL_ANSWER_ACTION in text\n regex = (\n r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n )\n action_match = re.search(regex, text, re.DOTALL)\n action_output: Optional[AgentAction | AgentFinish] = None\n if action_match:\n if includes_answer:\n raise Exception(\n \"Parsing LLM output produced both a final answer \"\n f\"and a parse-able action: {text}\"\n )\n action = action_match.group(1).strip()\n action_input = action_match.group(2)\n tool_input = action_input.strip(\" \")\n # ensure if its a well formed SQL query we don't remove any trailing \" chars\n if tool_input.startswith(\"SELECT \") is False:\n tool_input = tool_input.strip('\"')\n\n action_output = AgentAction(action, tool_input, text)\n\n elif includes_answer:\n action_output = AgentFinish(\n {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n )\n else:\n if self.strict_decode:\n raise Exception(f\"Could not parse LLM output: `{text}`\")\n else:\n action_output = AgentFinish({\"output\": text}, text)\n\n return action_output\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n tool_description = self._compose_plugin_description()\n 
tool_names = \", \".join([plugin.name for plugin in self.plugins])\n if self.prompt_template is None:\n from .prompt import zero_shot_react_prompt\n\n self.prompt_template = zero_shot_react_prompt\n return self.prompt_template.populate(\n instruction=instruction,\n agent_scratchpad=agent_scratchpad,\n tool_description=tool_description,\n tool_names=tool_names,\n lang=self.output_lang,\n )\n\n def _format_function_map(self) -> dict[str, BaseTool]:\n \"\"\"Format the function map for the open AI function API.\n\n Return:\n Dict[str, Callable]: The function map.\n \"\"\"\n # Map the function name to the real function object.\n function_map = {}\n for plugin in self.plugins:\n function_map[plugin.name] = plugin\n return function_map\n\n def _trim(self, text: str | Document) -> str:\n \"\"\"\n Trim the text to the maximum token length.\n \"\"\"\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if isinstance(text, str):\n texts = evidence_trim_func([Document(text=text)])\n elif isinstance(text, Document):\n texts = evidence_trim_func([text])\n else:\n raise ValueError(\"Invalid text type to trim\")\n trim_text = texts[0].text\n logging.info(f\"len (trimmed): {len(trim_text)}\")\n return trim_text\n\n def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n\n def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. 
Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n\n def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.clear","title":"clear","text":"clear()\n
Clear and reset the agent.
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def clear(self):\n \"\"\"\n Clear and reset the agent.\n \"\"\"\n self.intermediate_steps = []\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.run","title":"run","text":"run(instruction, max_iterations=None)\n
Run the agent with the given instruction.
Parameters:
- instruction (required): Instruction to run the agent with.
- max_iterations (default: None): Maximum number of reasoning iterations; when None, falls back to the agent's max_iterations.
Returns an AgentOutput object.
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def run(self, instruction, max_iterations=None) -> AgentOutput:\n \"\"\"\n Run the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = \"\"\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n break\n else:\n status = \"stopped\"\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.stream","title":"stream","text":"stream(instruction, max_iterations=None)\n
Run the agent with the given instruction, streaming intermediate steps.
Parameters:
- instruction (required): Instruction to run the agent with.
- max_iterations (default: None): Maximum number of reasoning iterations; when None, falls back to the agent's max_iterations.
Yields AgentOutput objects: status "thinking" while reasoning, then "finished" or "stopped".
Source code in libs/kotaemon/kotaemon/agents/react/agent.py
def stream(self, instruction, max_iterations=None):\n \"\"\"\n Stream the agent with the given instruction.\n\n Args:\n instruction: Instruction to run the agent with.\n max_iterations: Maximum number of iterations\n of reasoning steps, defaults to 10.\n\n Return:\n AgentOutput object.\n \"\"\"\n if not max_iterations:\n max_iterations = self.max_iterations\n assert max_iterations > 0\n\n self.clear()\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n print(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n status = \"failed\"\n response_text = None\n\n for step_count in range(1, max_iterations + 1):\n prompt = self._compose_prompt(instruction)\n logging.info(f\"Prompt: {prompt}\")\n print(f\"Prompt: {prompt}\")\n response = self.llm(\n prompt, stop=[\"Observation:\"]\n ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n response_text = response.text\n logging.info(f\"Response: {response_text}\")\n print(f\"Response: {response_text}\")\n action_step = self._parse_output(response_text)\n if action_step is None:\n raise ValueError(\"Invalid action\")\n is_finished_chain = isinstance(action_step, AgentFinish)\n if is_finished_chain:\n result = response_text\n if \"Final Answer:\" in response_text:\n result = response_text.split(\"Final Answer:\")[-1].strip()\n else:\n assert isinstance(action_step, AgentAction)\n action_name = action_step.tool\n tool_input = action_step.tool_input\n logging.info(f\"Action: {action_name}\")\n print(f\"Action: {action_name}\")\n logging.info(f\"Tool Input: {tool_input}\")\n print(f\"Tool Input: {tool_input}\")\n result = self._format_function_map()[action_name](tool_input)\n\n # trim the worker output to 1000 tokens, as we are appending\n # all workers' logs and it can exceed the token limit if we\n # don't limit each. Fix this number regarding to the LLM capacity.\n result = self._trim(result)\n logging.info(f\"Result: {result}\")\n print(f\"Result: {result}\")\n\n self.intermediate_steps.append((action_step, result))\n if is_finished_chain:\n logging.info(f\"Finished after {step_count} steps.\")\n status = \"finished\"\n yield AgentOutput(\n text=result,\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n break\n else:\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n else:\n status = \"stopped\"\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=status,\n intermediate_steps=self.intermediate_steps[-1],\n )\n\n return AgentOutput(\n text=response_text,\n agent_type=self.agent_type,\n status=status,\n total_tokens=total_token,\n total_cost=total_cost,\n intermediate_steps=self.intermediate_steps,\n max_iterations=max_iterations,\n )\n
"},{"location":"reference/agents/react/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/","title":"Rewoo","text":""},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class, inherited from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
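A minimal usage sketch of run (assuming RewooAgent is importable from kotaemon.agents; planner_llm, solver_llm and search_tool are placeholders for configured instances):
from kotaemon.agents import RewooAgent\n\nagent = RewooAgent(\n    planner_llm=planner_llm,  # drafts the #Plan / #E steps\n    solver_llm=solver_llm,    # composes the final answer from the evidence\n    plugins=[search_tool],\n)\noutput = agent.run(\"Compare the GDP of Japan and Germany\", use_citation=True)\nprint(output.text)                    # solver answer\nprint(output.metadata[\"worker_log\"])  # per-plan evidence trace\n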
"},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Run the agent with a given instruction, streaming intermediate steps.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/","title":"Agent","text":""},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent","title":"RewooAgent","text":" Bases: BaseAgent
Distributive RewooAgent class, inherited from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
class RewooAgent(BaseAgent):\n \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n name: str = \"RewooAgent\"\n agent_type: AgentType = AgentType.rewoo\n description: str = \"RewooAgent for answering multi-step reasoning questions\"\n output_lang: str = \"English\"\n planner_llm: BaseLLM\n solver_llm: BaseLLM\n prompt_template: dict[str, PromptTemplate] = Param(\n default_callback=lambda _: {},\n help=\"A dict to supply different prompt to the agent.\",\n )\n plugins: list[BaseTool] = Param(\n default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n )\n examples: dict[str, str | list[str]] = Param(\n default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n )\n max_context_length: int = Param(\n default=3000,\n help=\"Max context length for each tool output.\",\n )\n trim_func: TokenSplitter | None = None\n\n @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n def planner(self):\n return Planner(\n model=self.planner_llm,\n plugins=self.plugins,\n prompt_template=self.prompt_template.get(\"Planner\", None),\n examples=self.examples.get(\"Planner\", None),\n )\n\n @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n def solver(self):\n return Solver(\n model=self.solver_llm,\n prompt_template=self.prompt_template.get(\"Solver\", None),\n examples=self.examples.get(\"Solver\", None),\n output_lang=self.output_lang,\n )\n\n def _parse_plan_map(\n self, planner_response: str\n ) -> tuple[dict[str, list[str]], dict[str, str]]:\n \"\"\"\n Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n This is because sometimes LLM cannot follow the strict output format.\n Example:\n #Plan1\n #E1\n #E2\n should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n Or:\n #Plan1\n #Plan2\n #E1\n should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n This function should also return a plan map.\n\n Returns:\n tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n \"\"\"\n valid_chunk = [\n line\n for line in planner_response.splitlines()\n if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n ]\n\n plan_to_es: dict[str, list[str]] = dict()\n plans: dict[str, str] = dict()\n prev_key = \"\"\n for line in valid_chunk:\n key, description = line.split(\":\", 1)\n key = key.strip()\n if key.startswith(\"#Plan\"):\n plans[key] = description.strip()\n plan_to_es[key] = []\n prev_key = key\n elif key.startswith(\"#E\"):\n plan_to_es[prev_key].append(key)\n\n return plan_to_es, plans\n\n def _parse_planner_evidences(\n self, planner_response: str\n ) -> tuple[dict[str, str], list[list[str]]]:\n \"\"\"\n Parse planner output. 
This should return a mapping from #E to tool call.\n It should also identify the level of each #E in dependency map.\n Example:\n {\n \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n }, [[#E1, #E2], [#E3, #E4]]\n\n Returns:\n tuple[dict[str, str], List[List[str]]]:\n A mapping from #E to tool call and a list of levels.\n \"\"\"\n evidences: dict[str, str] = dict()\n dependence: dict[str, list[str]] = dict()\n for line in planner_response.splitlines():\n if line.startswith(\"#E\") and line[2].isdigit():\n e, tool_call = line.split(\":\", 1)\n e, tool_call = e.strip(), tool_call.strip()\n if len(e) == 3:\n dependence[e] = []\n evidences[e] = tool_call\n for var in re.findall(r\"#E\\d+\", tool_call):\n if var in evidences:\n dependence[e].append(var)\n else:\n evidences[e] = \"No evidence found\"\n level = []\n while dependence:\n select = [i for i in dependence if not dependence[i]]\n if len(select) == 0:\n raise ValueError(\"Circular dependency detected.\")\n level.append(select)\n for item in select:\n dependence.pop(item)\n for item in dependence:\n for i in select:\n if i in dependence[item]:\n dependence[item].remove(i)\n\n return evidences, level\n\n def _run_plugin(\n self,\n e: str,\n planner_evidences: dict[str, str],\n worker_evidences: dict[str, str],\n output=BaseScratchPad(),\n ):\n \"\"\"\n Run a plugin for a given evidence.\n This function should also cumulate the cost and tokens.\n \"\"\"\n result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n tool_call = planner_evidences[e]\n if \"[\" not in tool_call:\n result[\"evidence\"] = tool_call\n else:\n tool, tool_input = tool_call.split(\"[\", 1)\n tool_input = tool_input[:-1]\n # find variables in input and replace with previous evidences\n for var in re.findall(r\"#E\\d+\", tool_input):\n print(\"Tool input: \", tool_input)\n print(\"Var: \", var)\n print(\"Worker evidences: \", worker_evidences)\n if var in worker_evidences:\n tool_input = tool_input.replace(\n var, worker_evidences.get(var, \"\") or \"\"\n )\n try:\n selected_plugin = self._find_plugin(tool)\n if selected_plugin is None:\n raise ValueError(\"Invalid plugin detected\")\n tool_response = selected_plugin(tool_input)\n result[\"evidence\"] = get_plugin_response_content(tool_response)\n except ValueError:\n result[\"evidence\"] = \"No evidence found.\"\n finally:\n output.panel_print(\n result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n )\n return result\n\n def _get_worker_evidence(\n self,\n planner_evidences: dict[str, str],\n evidences_level: list[list[str]],\n output=BaseScratchPad(),\n ) -> Any:\n \"\"\"\n Parallel execution of plugins in DAG for speedup.\n This is one of core benefits of ReWOO agents.\n\n Args:\n planner_evidences: A mapping from #E to tool call.\n evidences_level: A list of levels of evidences.\n Calculated from DAG of plugin calls.\n output: Output object, defaults to BaseOutput().\n Returns:\n A mapping from #E to tool call.\n \"\"\"\n worker_evidences: dict[str, str] = dict()\n plugin_cost, plugin_token = 0.0, 0.0\n with ThreadPoolExecutor() as pool:\n for level in evidences_level:\n results = []\n for e in level:\n results.append(\n pool.submit(\n self._run_plugin,\n e,\n planner_evidences,\n worker_evidences,\n output,\n )\n )\n if len(results) > 1:\n output.update_status(f\"Running tasks {level} in parallel.\")\n else:\n output.update_status(f\"Running task {level[0]}.\")\n for r in results:\n resp = r.result()\n plugin_cost += resp[\"plugin_cost\"]\n plugin_token 
+= resp[\"plugin_token\"]\n worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n output.done()\n\n return worker_evidences, plugin_cost, plugin_token\n\n def _find_plugin(self, name: str):\n for p in self.plugins:\n if p.name == name:\n return p\n\n def _trim_evidence(self, evidence: str):\n evidence_trim_func = (\n self.trim_func\n if self.trim_func\n else TokenSplitter(\n chunk_size=self.max_context_length,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n )\n if evidence:\n texts = evidence_trim_func([Document(text=evidence)])\n evidence = texts[0].text\n logging.info(f\"len (trimmed): {len(evidence)}\")\n return evidence\n\n @BaseAgent.safeguard_run\n def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n\n def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n 
intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.run","title":"run","text":"run(instruction, use_citation=False)\n
Run the agent with a given instruction.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
@BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n \"\"\"\n Run the agent with a given instruction.\n \"\"\"\n logging.info(f\"Running {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n # Solve\n solver_output = self.solver(instruction, worker_log)\n solver_output_text = solver_output.text\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline(context=worker_log, question=instruction)\n else:\n citation = None\n\n return AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.stream","title":"stream","text":"stream(instruction, use_citation=False)\n
Run the agent with a given instruction, streaming intermediate steps.
Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
def stream(self, instruction: str, use_citation: bool = False):\n \"\"\"\n Stream the agent with a given instruction.\n \"\"\"\n logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n total_cost = 0.0\n total_token = 0\n\n # Plan\n planner_output = self.planner(instruction)\n planner_text_output = planner_output.text\n plan_to_es, plans = self._parse_plan_map(planner_text_output)\n planner_evidences, evidence_level = self._parse_planner_evidences(\n planner_text_output\n )\n\n print(\"Planner output:\", planner_text_output)\n # output planner to info panel\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"planner_log\": planner_text_output}],\n )\n\n # Work\n worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n planner_evidences, evidence_level\n )\n worker_log = \"\"\n for plan in plan_to_es:\n worker_log += f\"{plan}: {plans[plan]}\\n\"\n current_progress = f\"{plan}: {plans[plan]}\\n\"\n for e in plan_to_es[plan]:\n worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n yield AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"thinking\",\n intermediate_steps=[{\"worker_log\": current_progress}],\n )\n\n # Solve\n solver_response = \"\"\n for solver_output in self.solver.stream(instruction, worker_log):\n solver_output_text = solver_output.text\n solver_response += solver_output_text\n yield AgentOutput(\n text=solver_output_text,\n agent_type=self.agent_type,\n status=\"thinking\",\n )\n if use_citation:\n citation_pipeline = CitationPipeline(llm=self.solver_llm)\n citation = citation_pipeline.invoke(\n context=worker_log, question=instruction\n )\n else:\n citation = None\n\n return AgentOutput(\n text=\"\",\n agent_type=self.agent_type,\n status=\"finished\",\n total_tokens=total_token,\n total_cost=total_cost,\n citation=citation,\n metadata={\"citation\": citation, \"worker_log\": worker_log},\n )\n
"},{"location":"reference/agents/rewoo/planner/","title":"Planner","text":""},{"location":"reference/agents/rewoo/planner/#agents.rewoo.planner.Planner","title":"Planner","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/rewoo/planner.py
class Planner(BaseComponent):\n model: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n examples: Optional[Union[str, List[str]]] = None\n plugins: List[BaseTool]\n\n def _compose_worker_description(self) -> str:\n \"\"\"\n Compose the worker prompt from the workers.\n\n Example:\n toolname1[input]: tool1 description\n toolname2[input]: tool2 description\n \"\"\"\n prompt = \"\"\n try:\n for worker in self.plugins:\n prompt += f\"{worker.name}[input]: {worker.description}\\n\"\n except Exception:\n raise ValueError(\"Worker must have a name and description.\")\n return prompt\n\n def _compose_fewshot_prompt(self) -> str:\n if self.examples is None:\n return \"\"\n if isinstance(self.examples, str):\n return self.examples\n else:\n return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n def _compose_prompt(self, instruction) -> str:\n \"\"\"\n Compose the prompt from template, worker description, examples and instruction.\n \"\"\"\n worker_desctription = self._compose_worker_description()\n fewshot = self._compose_fewshot_prompt()\n if self.prompt_template is not None:\n if \"fewshot\" in self.prompt_template.placeholders:\n return self.prompt_template.populate(\n tool_description=worker_desctription,\n fewshot=fewshot,\n task=instruction,\n )\n else:\n return self.prompt_template.populate(\n tool_description=worker_desctription, task=instruction\n )\n else:\n if self.examples is not None:\n return few_shot_planner_prompt.populate(\n tool_description=worker_desctription,\n fewshot=fewshot,\n task=instruction,\n )\n else:\n return zero_shot_planner_prompt.populate(\n tool_description=worker_desctription, task=instruction\n )\n\n def run(self, instruction: str, output: BaseScratchPad = BaseScratchPad()) -> Any:\n response = None\n output.info(\"Running Planner\")\n prompt = self._compose_prompt(instruction)\n output.debug(f\"Prompt: {prompt}\")\n try:\n response = self.model(prompt)\n self.log_progress(\".planner\", response=response)\n output.info(\"Planner run successful.\")\n except ValueError as e:\n output.error(\"Planner failed to retrieve response from LLM\")\n raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n return response\n\n def stream(self, instruction: str, output: BaseScratchPad = BaseScratchPad()):\n response = None\n output.info(\"Running Planner\")\n prompt = self._compose_prompt(instruction)\n output.debug(f\"Prompt: {prompt}\")\n\n response = \"\"\n try:\n for text in self.model.stream(prompt):\n response += text\n yield text\n self.log_progress(\".planner\", response=response)\n output.info(\"Planner run successful.\")\n except NotImplementedError:\n print(\"Streaming is not supported, falling back to normal run\")\n response = self.model(prompt)\n yield response\n except ValueError as e:\n output.error(\"Planner failed to retrieve response from LLM\")\n raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n return response\n
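A sketch of driving the planner on its own (llm and search_tool are placeholders; run returns whatever the underlying BaseLLM produces, which exposes .text):
planner = Planner(model=llm, plugins=[search_tool])\nresponse = planner.run(\"Compare the GDP of Japan and Germany\")\nplan_text = response.text  # the #Plan / #E formatted plan later consumed by RewooAgent\n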
"},{"location":"reference/agents/rewoo/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/solver/","title":"Solver","text":""},{"location":"reference/agents/rewoo/solver/#agents.rewoo.solver.Solver","title":"Solver","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/agents/rewoo/solver.py
class Solver(BaseComponent):\n model: BaseLLM\n prompt_template: Optional[PromptTemplate] = None\n examples: Optional[Union[str, List[str]]] = None\n output_lang: str = \"English\"\n\n def _compose_fewshot_prompt(self) -> str:\n if self.examples is None:\n return \"\"\n if isinstance(self.examples, str):\n return self.examples\n else:\n return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n def _compose_prompt(self, instruction, plan_evidence, output_lang) -> str:\n \"\"\"\n Compose the prompt from template, plan&evidence, examples and instruction.\n \"\"\"\n fewshot = self._compose_fewshot_prompt()\n if self.prompt_template is not None:\n if \"fewshot\" in self.prompt_template.placeholders:\n return self.prompt_template.populate(\n plan_evidence=plan_evidence,\n fewshot=fewshot,\n task=instruction,\n lang=output_lang,\n )\n else:\n return self.prompt_template.populate(\n plan_evidence=plan_evidence, task=instruction, lang=output_lang\n )\n else:\n if self.examples is not None:\n return few_shot_solver_prompt.populate(\n plan_evidence=plan_evidence,\n fewshot=fewshot,\n task=instruction,\n lang=output_lang,\n )\n else:\n return zero_shot_solver_prompt.populate(\n plan_evidence=plan_evidence,\n task=instruction,\n lang=output_lang,\n )\n\n def run(\n self,\n instruction: str,\n plan_evidence: str,\n output: BaseScratchPad = BaseScratchPad(),\n ) -> Any:\n response = None\n output.info(\"Running Solver\")\n output.debug(f\"Instruction: {instruction}\")\n output.debug(f\"Plan Evidence: {plan_evidence}\")\n prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n output.debug(f\"Prompt: {prompt}\")\n try:\n response = self.model(prompt)\n output.info(\"Solver run successful.\")\n except ValueError:\n output.error(\"Solver failed to retrieve response from LLM\")\n\n return response\n\n def stream(\n self,\n instruction: str,\n plan_evidence: str,\n output: BaseScratchPad = BaseScratchPad(),\n ) -> Any:\n response = \"\"\n output.info(\"Running Solver\")\n output.debug(f\"Instruction: {instruction}\")\n output.debug(f\"Plan Evidence: {plan_evidence}\")\n prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n output.debug(f\"Prompt: {prompt}\")\n try:\n for text in self.model.stream(prompt):\n response += text.text\n yield text\n output.info(\"Planner run successful.\")\n except NotImplementedError:\n response = self.model(prompt).text\n output.info(\"Solver run successful.\")\n except ValueError:\n output.error(\"Solver failed to retrieve response from LLM\")\n\n return response\n
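The solver pairs the original instruction with the worker log assembled upstream; a sketch with placeholder llm and worker_log values:
solver = Solver(model=llm, output_lang=\"English\")\nanswer = solver.run(\n    instruction=\"Compare the GDP of Japan and Germany\",\n    plan_evidence=worker_log,  # the #Plan / #E evidence trace built by RewooAgent\n)\nprint(answer.text)\n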
"},{"location":"reference/agents/tools/","title":"Tools","text":""},{"location":"reference/agents/tools/#agents.tools.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
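In practice, defining a tool usually means setting name and description and implementing _run_tool. A minimal sketch follows; the EchoTool class is hypothetical and the import path is assumed from this page's module.
# Hypothetical minimal tool: subclass BaseTool and implement _run_tool\nfrom kotaemon.agents.tools import BaseTool  # import path assumed\n\nclass EchoTool(BaseTool):\n    name: str = \"echo\"\n    description: str = \"Echoes the input back. Input should be any string.\"\n\n    def _run_tool(self, query: str) -> str:\n        return f\"echo: {query}\"\n\nprint(EchoTool().run(\"hello\"))  # -> echo: hello\n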
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description. This will be included in the prompt sent to the LLM.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
"},{"location":"reference/agents/tools/#agents.tools.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/#agents.tools.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around another BaseComponent, to use it as a tool
Parameters:
Name Type Description Defaultcomponent
BaseComponent-based component to wrap
requiredpostprocessor
Optional postprocessor for the component output
required Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
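For example, a retrieval component could be exposed to an agent like this; retriever is a placeholder for any configured BaseComponent, and the tool name and description are illustrative.
# Hedged sketch: wrap an existing component as an agent tool\ndoc_search = ComponentTool(\n    name=\"docsearch\",\n    description=\"Searches the indexed documents. Input should be a query string.\",\n    component=retriever,  # placeholder: any BaseComponent configured elsewhere\n    postprocessor=lambda docs: \"\\n\".join(d.text for d in docs),  # flatten results to plain text\n)\n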
"},{"location":"reference/agents/tools/#agents.tools.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code inlibs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
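Usage sketch (the underlying Wiki wrapper, documented below, requires pip install wikipedia):
tool = WikipediaTool()\nevidence = tool.run(\"Alan Turing\")  # a Document with the page content, or a similar-pages hint\nprint(str(evidence)[:200])\n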
"},{"location":"reference/agents/tools/base/","title":"Base","text":""},{"location":"reference/agents/tools/base/#agents.tools.base.ToolException","title":"ToolException","text":" Bases: Exception
An optional exception that a tool can raise when an execution error occurs.
When this exception is raised, the agent will not stop working; it handles the exception according to the tool's handle_tool_error setting, and the processing result is returned to the agent as an observation and printed in red on the console.
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
class ToolException(Exception):\n \"\"\"An optional exception that tool throws when execution error occurs.\n\n When this exception is thrown, the agent will not stop working,\n but will handle the exception according to the handle_tool_error\n variable of the tool, and the processing result will be returned\n to the agent as observation, and printed in red on the console.\n \"\"\"\n
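A short sketch of how handle_tool_error changes what run returns; FlakyTool is hypothetical and exists only to trigger the error path.
# Hypothetical always-failing tool, to illustrate the handle_tool_error options\nclass FlakyTool(BaseTool):\n    name: str = \"flaky\"\n    description: str = \"Always fails.\"\n\n    def _run_tool(self, query: str) -> str:\n        raise ToolException(\"upstream service unavailable\")\n\nFlakyTool(handle_tool_error=True).run(\"x\")           # observation: upstream service unavailable\nFlakyTool(handle_tool_error=\"tool failed\").run(\"x\")  # observation: tool failed\n# FlakyTool().run(\"x\") would re-raise, since handle_tool_error defaults to False\n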
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool","title":"BaseTool","text":" Bases: BaseComponent
libs/kotaemon/kotaemon/agents/tools/base.py
class BaseTool(BaseComponent):\n name: str\n \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n description: str\n \"\"\"Description used to tell the model how/when/why to use the tool.\n You can provide few-shot examples as a part of the description. This will be\n input to the prompt of LLM.\n \"\"\"\n args_schema: Optional[Type[BaseModel]] = None\n \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n verbose: bool = False\n \"\"\"Whether to log the tool's progress.\"\"\"\n handle_tool_error: Optional[\n Union[bool, str, Callable[[ToolException], str]]\n ] = False\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n def _parse_input(\n self,\n tool_input: Union[str, Dict],\n ) -> Union[str, Dict[str, Any]]:\n \"\"\"Convert tool input to pydantic model.\"\"\"\n args_schema = self.args_schema\n if isinstance(tool_input, str):\n if args_schema is not None:\n key_ = next(iter(args_schema.model_fields.keys()))\n args_schema.validate({key_: tool_input})\n return tool_input\n else:\n if args_schema is not None:\n result = args_schema.parse_obj(tool_input)\n return {k: v for k, v in result.dict().items() if k in tool_input}\n return tool_input\n\n def _run_tool(\n self,\n *args: Any,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Call tool.\"\"\"\n raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n # For backwards compatibility, if run_input is a string,\n # pass as a positional argument.\n if isinstance(tool_input, str):\n return (tool_input,), {}\n else:\n return (), tool_input\n\n def _handle_tool_error(self, e: ToolException) -> Any:\n \"\"\"Handle the content of the ToolException thrown.\"\"\"\n observation = None\n if not self.handle_tool_error:\n raise e\n elif isinstance(self.handle_tool_error, bool):\n if e.args:\n observation = e.args[0]\n else:\n observation = \"Tool execution error\"\n elif isinstance(self.handle_tool_error, str):\n observation = self.handle_tool_error\n elif callable(self.handle_tool_error):\n observation = self.handle_tool_error(e)\n else:\n raise ValueError(\n f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n f\"or callable. Received: {self.handle_tool_error}\"\n )\n return observation\n\n def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n\n def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n ) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n\n @classmethod\n def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.name","title":"name instance-attribute
","text":"name\n
The unique name of the tool that clearly communicates its purpose.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.description","title":"descriptioninstance-attribute
","text":"description\n
Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description. This will be included in the prompt sent to the LLM.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.args_schema","title":"args_schemaclass-attribute
instance-attribute
","text":"args_schema = None\n
Pydantic model class to validate and parse the tool's input arguments.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.verbose","title":"verboseclass-attribute
instance-attribute
","text":"verbose = False\n
Whether to log the tool's progress.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.handle_tool_error","title":"handle_tool_errorclass-attribute
instance-attribute
","text":"handle_tool_error = False\n
Handle the content of the ToolException thrown.
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.to_langchain_format","title":"to_langchain_format","text":"to_langchain_format()\n
Convert this tool to Langchain format to use with its agent
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def to_langchain_format(self) -> LCTool:\n \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n return LCTool(name=self.name, description=self.description, func=self.run)\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.run","title":"run","text":"run(tool_input, verbose=None, **kwargs)\n
Run the tool.
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
def run(\n self,\n tool_input: Union[str, Dict],\n verbose: Optional[bool] = None,\n **kwargs: Any,\n) -> Any:\n \"\"\"Run the tool.\"\"\"\n parsed_input = self._parse_input(tool_input)\n # TODO (verbose_): Add logging\n try:\n tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n call_kwargs = {**kwargs, **tool_kwargs}\n observation = self._run_tool(*tool_args, **call_kwargs)\n except ToolException as e:\n observation = self._handle_tool_error(e)\n return observation\n else:\n return observation\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.from_langchain_format","title":"from_langchain_format classmethod
","text":"from_langchain_format(langchain_tool)\n
Wrapper for Langchain Tool
Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
@classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n \"\"\"Wrapper for Langchain Tool\"\"\"\n new_tool = BaseTool(\n name=langchain_tool.name, description=langchain_tool.description\n )\n new_tool._run_tool = langchain_tool._run # type: ignore\n return new_tool\n
"},{"location":"reference/agents/tools/base/#agents.tools.base.ComponentTool","title":"ComponentTool","text":" Bases: BaseTool
Wrapper around another BaseComponent, to use it as a tool
Parameters:
Name Type Description Defaultcomponent
BaseComponent-based component to wrap
requiredpostprocessor
Optional postprocessor for the component output
required Source code inlibs/kotaemon/kotaemon/agents/tools/base.py
class ComponentTool(BaseTool):\n \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n Args:\n component: BaseComponent-based component to wrap\n postprocessor: Optional postprocessor for the component output\n \"\"\"\n\n component: BaseComponent\n postprocessor: Optional[Callable] = None\n\n def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n output = self.component(*args, **kwargs)\n if self.postprocessor:\n output = self.postprocessor(output)\n\n return output\n
"},{"location":"reference/agents/tools/google/","title":"Google","text":""},{"location":"reference/agents/tools/llm/","title":"Llm","text":""},{"location":"reference/agents/tools/wikipedia/","title":"Wikipedia","text":""},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki","title":"Wiki","text":"Wrapper around wikipedia API.
Source code inlibs/kotaemon/kotaemon/agents/tools/wikipedia.py
class Wiki:\n \"\"\"Wrapper around wikipedia API.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"Check that wikipedia package is installed.\"\"\"\n try:\n import wikipedia # noqa: F401\n except ImportError:\n raise ValueError(\n \"Could not import wikipedia python package. \"\n \"Please install it with `pip install wikipedia`.\"\n )\n\n def search(self, search: str) -> Union[str, Document]:\n \"\"\"Try to search for wiki page.\n\n If page exists, return the page summary, and a PageWithLookups object.\n If page does not exist, return similar entries.\n \"\"\"\n import wikipedia\n\n try:\n page_content = wikipedia.page(search).content\n url = wikipedia.page(search).url\n result: Union[str, Document] = Document(\n text=page_content, metadata={\"page\": url}\n )\n except wikipedia.PageError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n except wikipedia.DisambiguationError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n return result\n
"},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki.search","title":"search","text":"search(search)\n
Try to search for a wiki page.
If the page exists, return its content as a Document (with the page URL in the metadata). If it does not exist or the query is ambiguous, return a string listing similar entries.
Source code inlibs/kotaemon/kotaemon/agents/tools/wikipedia.py
def search(self, search: str) -> Union[str, Document]:\n \"\"\"Try to search for wiki page.\n\n If page exists, return the page summary, and a PageWithLookups object.\n If page does not exist, return similar entries.\n \"\"\"\n import wikipedia\n\n try:\n page_content = wikipedia.page(search).content\n url = wikipedia.page(search).url\n result: Union[str, Document] = Document(\n text=page_content, metadata={\"page\": url}\n )\n except wikipedia.PageError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n except wikipedia.DisambiguationError:\n result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n return result\n
"},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.WikipediaTool","title":"WikipediaTool","text":" Bases: BaseTool
Tool that adds the capability to query the Wikipedia API.
Source code inlibs/kotaemon/kotaemon/agents/tools/wikipedia.py
class WikipediaTool(BaseTool):\n \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n name: str = \"wikipedia\"\n description: str = (\n \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n \"Useful when you need to get holistic knowledge about people, \"\n \"places, companies, historical events, or other subjects. \"\n \"Input should be a search query.\"\n )\n args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n doc_store: Any = None\n\n def _run_tool(self, query: AnyStr) -> AnyStr:\n if not self.doc_store:\n self.doc_store = Wiki()\n tool = self.doc_store\n evidence = tool.search(query)\n return evidence\n
"},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#base.BaseComponent","title":"BaseComponent","text":" Bases: Function
A component is a class that can be used to compose a pipeline.
Benefits of component
For each component, the spirit is
as generic as possible.
Source code inlibs/kotaemon/kotaemon/base/component.py
class BaseComponent(Function):\n \"\"\"A component is a class that can be used to compose a pipeline.\n\n !!! tip \"Benefits of component\"\n - Auto caching, logging\n - Allow deployment\n\n !!! tip \"For each component, the spirit is\"\n - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n - Enforce single output type. Hence, the output type of a component should be\n as generic as possible.\n \"\"\"\n\n inflow = None\n\n def flow(self):\n if self.inflow is None:\n raise ValueError(\"No inflow provided.\")\n\n if not isinstance(self.inflow, BaseComponent):\n raise ValueError(\n f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n )\n\n return self.__call__(self.inflow.flow())\n\n def set_output_queue(self, queue):\n self._queue = queue\n for name in self._ff_nodes:\n node = getattr(self, name)\n if isinstance(node, BaseComponent):\n node.set_output_queue(queue)\n\n def report_output(self, output: Optional[Document]):\n if self._queue is not None:\n self._queue.put_nowait(output)\n\n def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n ...\n\n def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n ...\n\n @abstractmethod\n def run(\n self, *args, **kwargs\n ) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
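A minimal concrete component, as a sketch. It assumes the import path from this page's module, and that a component instance can be invoked directly (which dispatches to run).
# Hedged sketch: the smallest useful BaseComponent subclass\nfrom kotaemon.base import BaseComponent, Document  # import path assumed\n\nclass UpperCase(BaseComponent):\n    def run(self, text: str) -> Document:\n        return Document(text.upper())\n\nprint(UpperCase()(\"hello\"))  # prints HELLO\n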
"},{"location":"reference/base/#base.BaseComponent.run","title":"run abstractmethod
","text":"run(*args, **kwargs)\n
Run the component.
Source code inlibs/kotaemon/kotaemon/base/component.py
@abstractmethod\ndef run(\n self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
"},{"location":"reference/base/#base.Document","title":"Document","text":" Bases: Document
Base document class, mostly inherited from Document class from llama-index.
This class accepts one positional argument content
of an arbitrary type, which will store the raw content of the document. If specified, the class will use content
to initialize the base llama_index class.
Attributes:
Name Type Descriptioncontent
Any
raw content of the document, can be anything
source
Optional[str]
id of the source of the Document. Optional.
channel
Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]
the channel to show the document. Optional.: - chat: show in chat message - info: show in information panel - index: show in index panel - debug: show in debug panel
Source code inlibs/kotaemon/kotaemon/base/schema.py
class Document(BaseDocument):\n \"\"\"\n Base document class, mostly inherited from Document class from llama-index.\n\n This class accept one positional argument `content` of an arbitrary type, which will\n store the raw content of the document. If specified, the class will use\n `content` to initialize the base llama_index class.\n\n Attributes:\n content: raw content of the document, can be anything\n source: id of the source of the Document. Optional.\n channel: the channel to show the document. Optional.:\n - chat: show in chat message\n - info: show in information panel\n - index: show in index panel\n - debug: show in debug panel\n \"\"\"\n\n content: Any = None\n source: Optional[str] = None\n channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n if content is None:\n if kwargs.get(\"text\", None) is not None:\n kwargs[\"content\"] = kwargs[\"text\"]\n elif kwargs.get(\"embedding\", None) is not None:\n kwargs[\"content\"] = kwargs[\"embedding\"]\n # default text indicating this document only contains embedding\n kwargs[\"text\"] = \"<EMBEDDING>\"\n elif isinstance(content, Document):\n # TODO: simplify the Document class\n temp_ = content.dict()\n temp_.update(kwargs)\n kwargs = temp_\n else:\n kwargs[\"content\"] = content\n if content:\n kwargs[\"text\"] = str(content)\n else:\n kwargs[\"text\"] = \"\"\n super().__init__(*args, **kwargs)\n\n def __bool__(self):\n return bool(self.content)\n\n @classmethod\n def example(cls) -> \"Document\":\n document = Document(\n text=SAMPLE_TEXT,\n metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n )\n return document\n\n def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n\n def __str__(self):\n return str(self.content)\n
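Construction sketches that follow the docstring above:
doc = Document(\"raw text content\")            # positional content of arbitrary type\nchat_doc = Document(\"hello\", channel=\"chat\")  # routed to the chat panel\nprint(doc.text)                               # prints: raw text content\nprint(bool(Document()))                       # False: empty content makes the document falsy\n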
"},{"location":"reference/base/#base.Document.to_haystack_format","title":"to_haystack_format","text":"to_haystack_format()\n
Convert struct to Haystack document format.
Source code inlibs/kotaemon/kotaemon/base/schema.py
def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n
"},{"location":"reference/base/#base.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":" Bases: Document
Subclass of Document which must contain an embedding
Use this if you want to enforce a component's IOs to contain an embedding.
Source code inlibs/kotaemon/kotaemon/base/schema.py
class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contain an embedding\n\n    Use this if you want to enforce a component's IOs to contain an embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
"},{"location":"reference/base/#base.ExtractorOutput","title":"ExtractorOutput","text":" Bases: Document
Represents the output of an extractor.
Source code inlibs/kotaemon/kotaemon/base/schema.py
class ExtractorOutput(Document):\n \"\"\"\n Represents the output of an extractor.\n \"\"\"\n\n matches: list[str]\n
"},{"location":"reference/base/#base.RetrievedDocument","title":"RetrievedDocument","text":" Bases: Document
Subclass of Document with retrieval-related information
Attributes:
Name Type Descriptionscore
float
score of the document (from 0.0 to 1.0)
retrieval_metadata
dict
metadata from the retrieval process; it can be used by different components in a retrieval pipeline to communicate with each other
Source code inlibs/kotaemon/kotaemon/base/schema.py
class RetrievedDocument(Document):\n \"\"\"Subclass of Document with retrieval-related information\n\n Attributes:\n score (float): score of the document (from 0.0 to 1.0)\n retrieval_metadata (dict): metadata from the retrieval process, can be used\n by different components in a retrieved pipeline to communicate with each\n other\n \"\"\"\n\n score: float = Field(default=0.0)\n retrieval_metadata: dict = Field(default={})\n
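For instance, a retriever might attach its score and bookkeeping like this (the values are illustrative):
hit = RetrievedDocument(\n    \"a relevant passage\",\n    score=0.87,\n    retrieval_metadata={\"retriever\": \"bm25\"},  # free-form, for downstream components\n)\nprint(hit.score)  # 0.87\n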
"},{"location":"reference/base/component/","title":"Component","text":""},{"location":"reference/base/component/#base.component.BaseComponent","title":"BaseComponent","text":" Bases: Function
A component is a class that can be used to compose a pipeline.
Benefits of component
For each component, the spirit is
as generic as possible.
Source code inlibs/kotaemon/kotaemon/base/component.py
class BaseComponent(Function):\n \"\"\"A component is a class that can be used to compose a pipeline.\n\n !!! tip \"Benefits of component\"\n - Auto caching, logging\n - Allow deployment\n\n !!! tip \"For each component, the spirit is\"\n - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n - Enforce single output type. Hence, the output type of a component should be\n as generic as possible.\n \"\"\"\n\n inflow = None\n\n def flow(self):\n if self.inflow is None:\n raise ValueError(\"No inflow provided.\")\n\n if not isinstance(self.inflow, BaseComponent):\n raise ValueError(\n f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n )\n\n return self.__call__(self.inflow.flow())\n\n def set_output_queue(self, queue):\n self._queue = queue\n for name in self._ff_nodes:\n node = getattr(self, name)\n if isinstance(node, BaseComponent):\n node.set_output_queue(queue)\n\n def report_output(self, output: Optional[Document]):\n if self._queue is not None:\n self._queue.put_nowait(output)\n\n def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n ...\n\n def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n ...\n\n def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n ...\n\n @abstractmethod\n def run(\n self, *args, **kwargs\n ) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
"},{"location":"reference/base/component/#base.component.BaseComponent.run","title":"run abstractmethod
","text":"run(*args, **kwargs)\n
Run the component.
Source code inlibs/kotaemon/kotaemon/base/component.py
@abstractmethod\ndef run(\n self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n \"\"\"Run the component.\"\"\"\n ...\n
"},{"location":"reference/base/schema/","title":"Schema","text":""},{"location":"reference/base/schema/#base.schema.Document","title":"Document","text":" Bases: Document
Base document class, mostly inherited from Document class from llama-index.
This class accepts one positional argument content
of an arbitrary type, which will store the raw content of the document. If specified, the class will use content
to initialize the base llama_index class.
Attributes:
Name Type Descriptioncontent
Any
raw content of the document, can be anything
source
Optional[str]
id of the source of the Document. Optional.
channel
Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]
the channel to show the document. Optional.: - chat: show in chat message - info: show in information panel - index: show in index panel - debug: show in debug panel
Source code inlibs/kotaemon/kotaemon/base/schema.py
class Document(BaseDocument):\n \"\"\"\n Base document class, mostly inherited from Document class from llama-index.\n\n This class accept one positional argument `content` of an arbitrary type, which will\n store the raw content of the document. If specified, the class will use\n `content` to initialize the base llama_index class.\n\n Attributes:\n content: raw content of the document, can be anything\n source: id of the source of the Document. Optional.\n channel: the channel to show the document. Optional.:\n - chat: show in chat message\n - info: show in information panel\n - index: show in index panel\n - debug: show in debug panel\n \"\"\"\n\n content: Any = None\n source: Optional[str] = None\n channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n if content is None:\n if kwargs.get(\"text\", None) is not None:\n kwargs[\"content\"] = kwargs[\"text\"]\n elif kwargs.get(\"embedding\", None) is not None:\n kwargs[\"content\"] = kwargs[\"embedding\"]\n # default text indicating this document only contains embedding\n kwargs[\"text\"] = \"<EMBEDDING>\"\n elif isinstance(content, Document):\n # TODO: simplify the Document class\n temp_ = content.dict()\n temp_.update(kwargs)\n kwargs = temp_\n else:\n kwargs[\"content\"] = content\n if content:\n kwargs[\"text\"] = str(content)\n else:\n kwargs[\"text\"] = \"\"\n super().__init__(*args, **kwargs)\n\n def __bool__(self):\n return bool(self.content)\n\n @classmethod\n def example(cls) -> \"Document\":\n document = Document(\n text=SAMPLE_TEXT,\n metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n )\n return document\n\n def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n\n def __str__(self):\n return str(self.content)\n
"},{"location":"reference/base/schema/#base.schema.Document.to_haystack_format","title":"to_haystack_format","text":"to_haystack_format()\n
Convert struct to Haystack document format.
Source code inlibs/kotaemon/kotaemon/base/schema.py
def to_haystack_format(self) -> \"HaystackDocument\":\n \"\"\"Convert struct to Haystack document format.\"\"\"\n from haystack.schema import Document as HaystackDocument\n\n metadata = self.metadata or {}\n text = self.text\n return HaystackDocument(content=text, meta=metadata)\n
"},{"location":"reference/base/schema/#base.schema.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":" Bases: Document
Subclass of Document which must contain an embedding
Use this if you want to enforce a component's IOs to contain an embedding.
Source code inlibs/kotaemon/kotaemon/base/schema.py
class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contain an embedding\n\n    Use this if you want to enforce a component's IOs to contain an embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
"},{"location":"reference/base/schema/#base.schema.RetrievedDocument","title":"RetrievedDocument","text":" Bases: Document
Subclass of Document with retrieval-related information
Attributes:
Name Type Descriptionscore
float
score of the document (from 0.0 to 1.0)
retrieval_metadata
dict
metadata from the retrieval process; it can be used by different components in a retrieval pipeline to communicate with each other
Source code inlibs/kotaemon/kotaemon/base/schema.py
class RetrievedDocument(Document):\n \"\"\"Subclass of Document with retrieval-related information\n\n Attributes:\n score (float): score of the document (from 0.0 to 1.0)\n retrieval_metadata (dict): metadata from the retrieval process, can be used\n by different components in a retrieved pipeline to communicate with each\n other\n \"\"\"\n\n score: float = Field(default=0.0)\n retrieval_metadata: dict = Field(default={})\n
"},{"location":"reference/base/schema/#base.schema.ExtractorOutput","title":"ExtractorOutput","text":" Bases: Document
Represents the output of an extractor.
Source code inlibs/kotaemon/kotaemon/base/schema.py
class ExtractorOutput(Document):\n \"\"\"\n Represents the output of an extractor.\n \"\"\"\n\n matches: list[str]\n
"},{"location":"reference/chatbot/","title":"Chatbot","text":""},{"location":"reference/chatbot/#chatbot.ChatConversation","title":"ChatConversation","text":" Bases: SessionFunction
Base implementation of a chat bot component
A chatbot component should handle internal state (including history messages) and return an output for a given input.
Source code in libs/kotaemon/kotaemon/chatbot/base.py
class ChatConversation(SessionFunction):\n \"\"\"Base implementation of a chat bot component\n\n A chatbot component should:\n - handle internal state, including history messages\n - return output for a given input\n \"\"\"\n\n class Config:\n store_result = session_chat_storage\n\n system_message: str = \"\"\n bot: BaseChatBot\n\n def __init__(self, *args, **kwargs):\n self._history: List[BaseMessage] = []\n self._store_result = (\n f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n )\n super().__init__(*args, **kwargs)\n\n def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n\n def start_session(self):\n self._store_result = self.bot.config.store_result\n super().start_session()\n if not self.history and self.system_message:\n system_message = SystemMessage(content=self.system_message)\n self.history.append(system_message)\n\n def end_session(self):\n super().end_session()\n self._history = []\n\n def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n ) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n\n def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n\n @property\n def history(self):\n return self._history\n\n @history.setter\n def history(self, value):\n self._history = value\n self._variablex()\n
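A wiring sketch, assuming a configured kotaemon chat LLM behind the llm name; the import path is assumed from this page's module.
# Hedged sketch: run a simple bot in an interactive terminal loop\nfrom kotaemon.chatbot import ChatConversation, SimpleRespondentChatbot  # import path assumed\n\nconvo = ChatConversation(\n    bot=SimpleRespondentChatbot(llm=llm),  # llm: a configured ChatLLM, assumed to exist\n    system_message=\"You are a concise assistant.\",\n)\nconvo.terminal_session()  # chats until an empty user message ends the session\n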
"},{"location":"reference/chatbot/#chatbot.ChatConversation.run","title":"run","text":"run(message)\n
Chat, given a message, return a response
Parameters:
Name Type Description Defaultmessage
HumanMessage
The message to respond to
requiredReturns:
Type DescriptionOptional[BaseMessage]
The response to the message. If None, no response is sent.
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n
"},{"location":"reference/chatbot/#chatbot.ChatConversation.check_end","title":"check_end","text":"check_end(\n history=None, user_message=None, bot_message=None\n)\n
Check if a conversation should end
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n
"},{"location":"reference/chatbot/#chatbot.ChatConversation.terminal_session","title":"terminal_session","text":"terminal_session()\n
Create a terminal session
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n
"},{"location":"reference/chatbot/#chatbot.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":" Bases: BaseChatBot
Simple text respondent chatbot that essentially wraps around a chat LLM
Source code inlibs/kotaemon/kotaemon/chatbot/simple_respondent.py
class SimpleRespondentChatbot(BaseChatBot):\n \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n llm: ChatLLM\n\n def _get_message(self) -> str:\n return self.llm(self.history).text\n
"},{"location":"reference/chatbot/base/","title":"Base","text":""},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation","title":"ChatConversation","text":" Bases: SessionFunction
Base implementation of a chat bot component
A chatbot component should handle internal state (including history messages) and return an output for a given input.
Source code in libs/kotaemon/kotaemon/chatbot/base.py
class ChatConversation(SessionFunction):\n \"\"\"Base implementation of a chat bot component\n\n A chatbot component should:\n - handle internal state, including history messages\n - return output for a given input\n \"\"\"\n\n class Config:\n store_result = session_chat_storage\n\n system_message: str = \"\"\n bot: BaseChatBot\n\n def __init__(self, *args, **kwargs):\n self._history: List[BaseMessage] = []\n self._store_result = (\n f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n )\n super().__init__(*args, **kwargs)\n\n def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n\n def start_session(self):\n self._store_result = self.bot.config.store_result\n super().start_session()\n if not self.history and self.system_message:\n system_message = SystemMessage(content=self.system_message)\n self.history.append(system_message)\n\n def end_session(self):\n super().end_session()\n self._history = []\n\n def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n ) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n\n def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n\n @property\n def history(self):\n return self._history\n\n @history.setter\n def history(self, value):\n self._history = value\n self._variablex()\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.run","title":"run","text":"run(message)\n
Chat, given a message, return a response
Parameters:
Name Type Description Defaultmessage
HumanMessage
The message to respond to
requiredReturns:
Type DescriptionOptional[BaseMessage]
The response to the message. If None, no response is sent.
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n \"\"\"Chat, given a message, return a response\n\n Args:\n message: The message to respond to\n\n Returns:\n The response to the message. If None, no response is sent.\n \"\"\"\n user_message = (\n HumanMessage(content=message) if isinstance(message, str) else message\n )\n self.history.append(user_message)\n\n output = self.bot(self.history).text\n output_message = None\n if output is not None:\n output_message = AIMessage(content=output)\n self.history.append(output_message)\n\n return output_message\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.check_end","title":"check_end","text":"check_end(\n history=None, user_message=None, bot_message=None\n)\n
Check if a conversation should end
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def check_end(\n self,\n history: Optional[List[BaseMessage]] = None,\n user_message: Optional[HumanMessage] = None,\n bot_message: Optional[AIMessage] = None,\n) -> bool:\n \"\"\"Check if a conversation should end\"\"\"\n if user_message is not None and user_message.content == \"\":\n return True\n\n return False\n
"},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.terminal_session","title":"terminal_session","text":"terminal_session()\n
Create a terminal session
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def terminal_session(self):\n \"\"\"Create a terminal session\"\"\"\n self.start_session()\n print(\">> Start chat:\")\n\n while True:\n human = HumanMessage(content=input(\"Human: \"))\n if self.check_end(history=self.history, user_message=human):\n break\n\n output = self(human)\n if output is None:\n print(\"AI: <No response>\")\n else:\n print(\"AI:\", output.content)\n\n if self.check_end(history=self.history, bot_message=output):\n break\n\n self.end_session()\n
"},{"location":"reference/chatbot/base/#chatbot.base.session_chat_storage","title":"session_chat_storage","text":"session_chat_storage(obj)\n
Store using the bot location rather than the session location
Source code inlibs/kotaemon/kotaemon/chatbot/base.py
def session_chat_storage(obj):\n \"\"\"Store using the bot location rather than the session location\"\"\"\n return obj._store_result\n
"},{"location":"reference/chatbot/simple_respondent/","title":"Simple Respondent","text":""},{"location":"reference/chatbot/simple_respondent/#chatbot.simple_respondent.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":" Bases: BaseChatBot
Simple text respondent chatbot that essentially wraps around a chat LLM
Source code inlibs/kotaemon/kotaemon/chatbot/simple_respondent.py
class SimpleRespondentChatbot(BaseChatBot):\n \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n llm: ChatLLM\n\n def _get_message(self) -> str:\n return self.llm(self.history).text\n
"},{"location":"reference/embeddings/","title":"Embeddings","text":""},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings","title":"EndpointEmbeddings","text":" Bases: BaseEmbeddings
An Embeddings component that uses an OpenAI API compatible endpoint.
Attributes:
Name Type Descriptionendpoint_url
str
The url of an OpenAI API compatible endpoint.
Source code inlibs/kotaemon/kotaemon/embeddings/endpoint_based.py
class EndpointEmbeddings(BaseEmbeddings):\n \"\"\"\n An Embeddings component that uses an OpenAI API compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of an OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
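Usage sketch; the endpoint URL is a placeholder for any OpenAI-compatible /embeddings endpoint.
embedder = EndpointEmbeddings(endpoint_url=\"http://localhost:8000/v1/embeddings\")  # placeholder URL\nvectors = embedder.run(\"hello world\")\nprint(len(vectors[0].embedding))  # dimensionality as reported by the endpoint\n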
"},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings.run","title":"run","text":"run(text)\n
Generate embeddings from text. Args: text (str | list[str] | Document | list[Document]): the text to generate embeddings from.
Returns: list[DocumentWithEmbedding]: the embeddings.
Source code inlibs/kotaemon/kotaemon/embeddings/endpoint_based.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
"},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":" Bases: BaseEmbeddings
Utilize fastembed library for embeddings locally without GPU.
Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/ Code: https://github.com/qdrant/fastembed
Source code inlibs/kotaemon/kotaemon/embeddings/fastembed.py
class FastEmbedEmbeddings(BaseEmbeddings):\n \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n Code: https://github.com/qdrant/fastembed\n \"\"\"\n\n model_name: str = Param(\n \"BAAI/bge-small-en-v1.5\",\n help=(\n \"Model name for fastembed. Please refer \"\n \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n \"for the list of supported models.\"\n ),\n required=True,\n )\n batch_size: int = Param(\n 256,\n help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n )\n parallel: Optional[int] = Param(\n None,\n help=(\n \"Number of threads to use for embeddings. \"\n \"If > 1, data-parallel encoding will be used. \"\n \"If 0, use all available CPUs. \"\n \"If None, use default onnxruntime threading. \"\n \"Defaults to None.\"\n ),\n )\n\n @Param.auto()\n def client_(self) -> \"TextEmbedding\":\n try:\n from fastembed import TextEmbedding\n except ImportError:\n raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n return TextEmbedding(model_name=self.model_name)\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n embeddings = self.client_.embed(\n [_.content for _ in input_],\n batch_size=self.batch_size,\n parallel=self.parallel,\n )\n return [\n DocumentWithEmbedding(\n content=doc,\n embedding=list(embedding),\n )\n for doc, embedding in zip(input_, embeddings)\n ]\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
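Usage sketch; the model is downloaded on first use, and the name comes from the supported-models list above.
embedder = FastEmbedEmbeddings(model_name=\"BAAI/bge-small-en-v1.5\")\ndocs = embedder.invoke([\"first text\", \"second text\"])\nprint(len(docs), len(docs[0].embedding))  # 2 embeddings; e.g. 384 dimensions for bge-small\n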
"},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings.ainvoke","title":"ainvoke async
","text":"ainvoke(text, *args, **kwargs)\n
Fastembed does not support async API.
Source code inlibs/kotaemon/kotaemon/embeddings/fastembed.py
async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
"},{"location":"reference/embeddings/#embeddings.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters
Source code inlibs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n api_version: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment=deployment,\n api_version=api_version,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import AzureOpenAIEmbeddings\n\n return AzureOpenAIEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's Cohere embedding, focusing on key parameters
Source code inlibs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n cohere_api_key: str = Param(\n help=\"API key (https://dashboard.cohere.com/api-keys)\",\n default=None,\n required=True,\n )\n model: str = Param(\n help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n default=None,\n required=True,\n )\n user_agent: str = Param(\n help=\"User agent (leave default)\", default=\"default\", required=True\n )\n\n def __init__(\n self,\n model: str = \"embed-english-v2.0\",\n cohere_api_key: Optional[str] = None,\n truncate: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n cohere_api_key=cohere_api_key,\n truncate=truncate,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_cohere import CohereEmbeddings\n except ImportError:\n from langchain.embeddings import CohereEmbeddings\n\n return CohereEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's HuggingFace embedding, focusing on key parameters
Source code inlibs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n model_name: str = Param(\n help=(\n \"Model name to use (https://huggingface.co/models?\"\n \"pipeline_tag=sentence-similarity&sort=trending)\"\n ),\n default=None,\n required=True,\n )\n\n def __init__(\n self,\n model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n **params,\n ):\n super().__init__(\n model_name=model_name,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n except ImportError:\n from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n return HuggingFaceBgeEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's OpenAI embedding, focusing on key parameters
Source code inlibs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model: str = \"text-embedding-ada-002\",\n openai_api_version: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n openai_api_type: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n openai_api_version=openai_api_version,\n openai_api_base=openai_api_base,\n openai_api_type=openai_api_type,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import OpenAIEmbeddings\n\n return OpenAIEmbeddings\n
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
libs/kotaemon/kotaemon/embeddings/openai.py
class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n azure_endpoint: str = Param(\n None,\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n api_version: str = Param(None, help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
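A configuration sketch; every value below is a placeholder for your own Azure resource.
azure_embedder = AzureOpenAIEmbeddings(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",  # placeholder\n    azure_deployment=\"<your-embedding-deployment>\",              # placeholder deployment name\n    api_version=\"2024-02-01\",                                    # an API version available to you\n    api_key=\"<azure-api-key>\",                                   # placeholder\n)\n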
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool, default False): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
OpenAI embedding model
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(\n None,\n help=(\n \"ID of the model to use. You can go to [Model overview](https://platform.\"\n \"openai.com/docs/models/overview) to see the available models.\"\n ),\n required=True,\n )\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
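A sketch showing the dimensions parameter, which (as noted on the base class) is only supported by text-embedding-3 and later models:

from kotaemon.embeddings import OpenAIEmbeddings

embedding = OpenAIEmbeddings(
    api_key="sk-...",                # placeholder
    model="text-embedding-3-small",
    dimensions=256,                  # ask the API for shorter vectors
)
doc = embedding("shorten me")[0]
assert len(doc.embedding) == 256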
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool, default False): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/base/","title":"Base","text":""},{"location":"reference/embeddings/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings","title":"EndpointEmbeddings","text":" Bases: BaseEmbeddings
An Embeddings component that uses an OpenAI API compatible endpoint.
Attributes:
endpoint_url (str): The url of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
class EndpointEmbeddings(BaseEmbeddings):\n \"\"\"\n An Embeddings component that uses an OpenAI API compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of an OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
"},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings.run","title":"run","text":"run(text)\n
Generate embeddings from text.
Parameters:
text (str | list[str] | Document | list[Document]): text to generate embeddings from
Returns:
list[DocumentWithEmbedding]: embeddings
Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n \"\"\"\n Generate embeddings from text Args:\n text (str | list[str] | Document | list[Document]): text to generate\n embeddings from\n Returns:\n list[DocumentWithEmbedding]: embeddings\n \"\"\"\n if not isinstance(text, list):\n text = [text]\n\n outputs = []\n\n for item in text:\n response = requests.post(\n self.endpoint_url, json={\"input\": str(item)}\n ).json()\n outputs.append(\n DocumentWithEmbedding(\n text=str(item),\n embedding=response[\"data\"][0][\"embedding\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n )\n\n return outputs\n
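A usage sketch; the URL is an assumption and should point at whatever OpenAI-compatible /embeddings route your server actually exposes:

from kotaemon.embeddings import EndpointEmbeddings

embedding = EndpointEmbeddings(
    endpoint_url="http://localhost:8000/v1/embeddings",  # assumed local server
)
outputs = embedding.run(["some text"])
print(len(outputs[0].embedding))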
"},{"location":"reference/embeddings/fastembed/","title":"Fastembed","text":""},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":" Bases: BaseEmbeddings
Utilize fastembed library for embeddings locally without GPU.
Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/
Code: https://github.com/qdrant/fastembed
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
class FastEmbedEmbeddings(BaseEmbeddings):\n \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n Code: https://github.com/qdrant/fastembed\n \"\"\"\n\n model_name: str = Param(\n \"BAAI/bge-small-en-v1.5\",\n help=(\n \"Model name for fastembed. Please refer \"\n \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n \"for the list of supported models.\"\n ),\n required=True,\n )\n batch_size: int = Param(\n 256,\n help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n )\n parallel: Optional[int] = Param(\n None,\n help=(\n \"Number of threads to use for embeddings. \"\n \"If > 1, data-parallel encoding will be used. \"\n \"If 0, use all available CPUs. \"\n \"If None, use default onnxruntime threading. \"\n \"Defaults to None.\"\n ),\n )\n\n @Param.auto()\n def client_(self) -> \"TextEmbedding\":\n try:\n from fastembed import TextEmbedding\n except ImportError:\n raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n return TextEmbedding(model_name=self.model_name)\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n embeddings = self.client_.embed(\n [_.content for _ in input_],\n batch_size=self.batch_size,\n parallel=self.parallel,\n )\n return [\n DocumentWithEmbedding(\n content=doc,\n embedding=list(embedding),\n )\n for doc, embedding in zip(input_, embeddings)\n ]\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
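A usage sketch; this runs fully locally once pip install fastembed has downloaded the model:

from kotaemon.embeddings import FastEmbedEmbeddings

embedding = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")
docs = embedding(["local embeddings without a GPU"])
print(len(docs[0].embedding))  # 384 dimensions for this model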
"},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings.ainvoke","title":"ainvoke async
","text":"ainvoke(text, *args, **kwargs)\n
Fastembed does not support async API.
Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n \"\"\"Fastembed does not support async API.\"\"\"\n return self.invoke(text, *args, **kwargs)\n
"},{"location":"reference/embeddings/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's OpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model: str = \"text-embedding-ada-002\",\n openai_api_version: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n openai_api_type: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n openai_api_version=openai_api_version,\n openai_api_base=openai_api_base,\n openai_api_type=openai_api_type,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import OpenAIEmbeddings\n\n return OpenAIEmbeddings\n
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment: Optional[str] = None,\n openai_api_key: Optional[str] = None,\n api_version: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment=deployment,\n api_version=api_version,\n openai_api_key=openai_api_key,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAIEmbeddings\n except ImportError:\n from langchain.embeddings import AzureOpenAIEmbeddings\n\n return AzureOpenAIEmbeddings\n
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's Cohere embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n cohere_api_key: str = Param(\n help=\"API key (https://dashboard.cohere.com/api-keys)\",\n default=None,\n required=True,\n )\n model: str = Param(\n help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n default=None,\n required=True,\n )\n user_agent: str = Param(\n help=\"User agent (leave default)\", default=\"default\", required=True\n )\n\n def __init__(\n self,\n model: str = \"embed-english-v2.0\",\n cohere_api_key: Optional[str] = None,\n truncate: Optional[str] = None,\n request_timeout: Optional[float] = None,\n **params,\n ):\n super().__init__(\n model=model,\n cohere_api_key=cohere_api_key,\n truncate=truncate,\n request_timeout=request_timeout,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_cohere import CohereEmbeddings\n except ImportError:\n from langchain.embeddings import CohereEmbeddings\n\n return CohereEmbeddings\n
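A construction sketch; the key is a placeholder and available model names are listed in the Cohere docs linked above:

from kotaemon.embeddings import LCCohereEmbeddings

embedding = LCCohereEmbeddings(
    model="embed-english-v2.0",
    cohere_api_key="<cohere-api-key>",  # placeholder
)
vectors = embedding("Cohere-backed embeddings")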
"},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":" Bases: LCEmbeddingMixin
, BaseEmbeddings
Wrapper around Langchain's HuggingFace embedding, focusing on key parameters
Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n model_name: str = Param(\n help=(\n \"Model name to use (https://huggingface.co/models?\"\n \"pipeline_tag=sentence-similarity&sort=trending)\"\n ),\n default=None,\n required=True,\n )\n\n def __init__(\n self,\n model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n **params,\n ):\n super().__init__(\n model_name=model_name,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n except ImportError:\n from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n return HuggingFaceBgeEmbeddings\n
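A usage sketch; the model is downloaded on first use and requires pip install sentence_transformers:

from kotaemon.embeddings import LCHuggingFaceEmbeddings

embedding = LCHuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
docs = embedding("runs locally via HuggingFace")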
"},{"location":"reference/embeddings/openai/","title":"Openai","text":""},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings","title":"BaseOpenAIEmbeddings","text":" Bases: BaseEmbeddings
Base interface for OpenAI embedding model, using the openai library.
This class exposes the parameters in resources.Chat. To subclass this class:
- Implement the `prepare_client` method to return the OpenAI client
- Implement the `openai_response` method to return the OpenAI response
- Implement the params related to the OpenAI client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class BaseOpenAIEmbeddings(BaseEmbeddings):\n \"\"\"Base interface for OpenAI embedding model, using the openai library.\n\n This class exposes the parameters in resources.Chat. To subclass this class:\n\n - Implement the `prepare_client` method to return the OpenAI client\n - Implement the `openai_response` method to return the OpenAI response\n - Implement the params relate to the OpenAI client\n \"\"\"\n\n _dependencies = [\"openai\"]\n\n api_key: str = Param(None, help=\"API key\", required=True)\n timeout: Optional[float] = Param(None, help=\"Timeout for the API request.\")\n max_retries: Optional[int] = Param(\n None, help=\"Maximum number of retries for the API request.\"\n )\n\n dimensions: Optional[int] = Param(\n None,\n help=(\n \"The number of dimensions the resulting output embeddings should have. \"\n \"Only supported in `text-embedding-3` and later models.\"\n ),\n )\n context_length: Optional[int] = Param(\n None, help=\"The maximum context length of the embedding model\"\n )\n\n @Param.auto(depends_on=[\"max_retries\"])\n def max_retries_(self):\n if self.max_retries is None:\n from openai._constants import DEFAULT_MAX_RETRIES\n\n return DEFAULT_MAX_RETRIES\n return self.max_retries\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n\n def invoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_doc = self.prepare_input(text)\n client = self.prepare_client(async_version=False)\n\n input_: list[str | list[int]] = []\n splitted_indices = {}\n for idx, text in enumerate(input_doc):\n if self.context_length:\n chunks = split_text_by_chunk_size(text.text or \" \", self.context_length)\n splitted_indices[idx] = (len(input_), len(input_) + len(chunks))\n input_.extend(chunks)\n else:\n splitted_indices[idx] = (len(input_), len(input_) + 1)\n input_.append(text.text)\n\n resp = self.openai_response(client, input=input_, **kwargs).dict()\n output_ = list(sorted(resp[\"data\"], key=lambda x: x[\"index\"]))\n\n output = []\n for idx, doc in enumerate(input_doc):\n embs = output_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n if len(embs) == 1:\n output.append(\n DocumentWithEmbedding(embedding=embs[0][\"embedding\"], content=doc)\n )\n continue\n\n chunk_lens = [\n len(_)\n for _ in input_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n ]\n vs: list[list[float]] = [_[\"embedding\"] for _ in embs]\n emb = np.average(vs, axis=0, weights=chunk_lens)\n emb = emb / np.linalg.norm(emb)\n output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))\n\n return output\n\n async def ainvoke(\n self, text: str | list[str] | Document | list[Document], *args, **kwargs\n ) -> list[DocumentWithEmbedding]:\n input_ = self.prepare_input(text)\n client = self.prepare_client(async_version=True)\n resp = await self.openai_response(\n client, input=[_.text if _.text else \" \" for _ in input_], **kwargs\n ).dict()\n output_ = sorted(resp[\"data\"], key=lambda x: x[\"index\"])\n return [\n DocumentWithEmbedding(embedding=o[\"embedding\"], content=i)\n for i, o in zip(input_, output_)\n ]\n
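To make the three subclassing steps concrete, here is a hypothetical subclass (MyOpenAIEmbeddings is illustrative, not part of the library; Param is assumed importable from kotaemon.base):

from kotaemon.base import Param
from kotaemon.embeddings import BaseOpenAIEmbeddings


class MyOpenAIEmbeddings(BaseOpenAIEmbeddings):
    # step 3: declare the params the client needs
    model: str = Param(None, help="Embedding model ID", required=True)

    def prepare_client(self, async_version: bool = False):
        # step 1: return the (async) OpenAI client
        if async_version:
            from openai import AsyncOpenAI
            return AsyncOpenAI(api_key=self.api_key, timeout=self.timeout)
        from openai import OpenAI
        return OpenAI(api_key=self.api_key, timeout=self.timeout)

    def openai_response(self, client, **kwargs):
        # step 2: issue the embeddings request
        return client.embeddings.create(model=self.model, **kwargs)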
"},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool, default False): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
OpenAI embedding model
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(\n None,\n help=(\n \"ID of the model to use. You can go to [Model overview](https://platform.\"\n \"openai.com/docs/models/overview) to see the available models.\"\n ),\n required=True,\n )\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool, default False): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.model,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":" Bases: BaseOpenAIEmbeddings
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n azure_endpoint: str = Param(\n None,\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n api_version: str = Param(None, help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n @retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n )\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool, default False): Whether to get the async version of the client
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
@retry(\n retry=retry_if_not_exception_type(\n (openai.NotFoundError, openai.BadRequestError)\n ),\n wait=wait_random_exponential(min=1, max=40),\n stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n params: dict = {\n \"model\": self.azure_deployment,\n }\n if self.dimensions:\n params[\"dimensions\"] = self.dimensions\n params.update(kwargs)\n\n return client.embeddings.create(**params)\n
"},{"location":"reference/embeddings/openai/#embeddings.openai.split_text_by_chunk_size","title":"split_text_by_chunk_size","text":"split_text_by_chunk_size(text, chunk_size)\n
Split the text into chunks of a given size
Parameters:
text (str, required): text to split
chunk_size (int, required): size of each chunk
Returns:
list[list[int]]: list of chunks (as tokens)
Source code in libs/kotaemon/kotaemon/embeddings/openai.py
def split_text_by_chunk_size(text: str, chunk_size: int) -> list[list[int]]:\n \"\"\"Split the text into chunks of a given size\n\n Args:\n text: text to split\n chunk_size: size of each chunk\n\n Returns:\n list of chunks (as tokens)\n \"\"\"\n encoding = tiktoken.get_encoding(\"cl100k_base\")\n tokens = iter(encoding.encode(text))\n result = []\n while chunk := list(islice(tokens, chunk_size)):\n result.append(chunk)\n return result\n
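A worked example: the function returns token-id chunks rather than strings, so decode with the same cl100k_base encoding to inspect them:

import tiktoken
from kotaemon.embeddings.openai import split_text_by_chunk_size

chunks = split_text_by_chunk_size("one two three four five", chunk_size=3)
enc = tiktoken.get_encoding("cl100k_base")
for chunk in chunks:
    # every chunk holds at most 3 token ids
    print(len(chunk), repr(enc.decode(chunk)))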
"},{"location":"reference/indices/","title":"Indices","text":""},{"location":"reference/indices/#indices.VectorIndexing","title":"VectorIndexing","text":" Bases: BaseIndexing
Ingest the document, run through the embedding, and store the embedding in a vector store.
This pipeline supports the following set of inputs: a list of documents, or a list of texts. Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorIndexing(BaseIndexing):\n \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n vector store.\n\n This pipeline supports the following set of inputs:\n - List of documents\n - List of texts\n \"\"\"\n\n cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n count_: int = 0\n\n def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n\n def to_qa_pipeline(self, *args, **kwargs):\n from .qa import CitationQAPipeline\n\n return TextVectorQA(\n retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n qa_pipeline=CitationQAPipeline(**kwargs),\n )\n\n def write_chunk_to_file(self, docs: list[Document]):\n # save the chunks content into markdown format\n if self.cache_dir:\n file_name = Path(docs[0].metadata[\"file_name\"])\n for i in range(len(docs)):\n markdown_content = \"\"\n if \"page_label\" in docs[i].metadata:\n page_label = str(docs[i].metadata[\"page_label\"])\n markdown_content += f\"Page label: {page_label}\"\n if \"file_name\" in docs[i].metadata:\n filename = docs[i].metadata[\"file_name\"]\n markdown_content += f\"\\nFile name: {filename}\"\n if \"section\" in docs[i].metadata:\n section = docs[i].metadata[\"section\"]\n markdown_content += f\"\\nSection: {section}\"\n if \"type\" in docs[i].metadata:\n if docs[i].metadata[\"type\"] == \"image\":\n image_origin = docs[i].metadata[\"image_origin\"]\n image_origin = f'<p><img src=\"{image_origin}\"></p>'\n markdown_content += f\"\\nImage origin: {image_origin}\"\n if docs[i].text:\n markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n \"w\",\n encoding=\"utf-8\",\n ) as f:\n f.write(markdown_content)\n\n def add_to_docstore(self, docs: list[Document]):\n if self.doc_store:\n print(\"Adding documents to doc store\")\n self.doc_store.add(docs)\n\n def add_to_vectorstore(self, docs: list[Document]):\n # in case we want to skip embedding\n if self.vector_store:\n print(f\"Getting embeddings for {len(docs)} nodes\")\n embeddings = self.embedding(docs)\n print(\"Adding embeddings to vector store\")\n self.vector_store.add(\n embeddings=embeddings,\n ids=[t.doc_id for t in docs],\n )\n\n def run(self, text: str | list[str] | Document | list[Document]):\n input_: list[Document] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in cast(list, text):\n if isinstance(item, str):\n input_.append(Document(text=item, id_=str(uuid.uuid4())))\n elif isinstance(item, Document):\n input_.append(item)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n self.add_to_vectorstore(input_)\n self.add_to_docstore(input_)\n self.write_chunk_to_file(input_)\n self.count_ += len(input_)\n
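A wiring sketch; the in-memory store classes are assumed to live in kotaemon.storages (swap in your preferred backends), and cache_dir is disabled so no chunk files are written:

from kotaemon.embeddings import FastEmbedEmbeddings
from kotaemon.indices import VectorIndexing
from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore

index = VectorIndexing(
    vector_store=InMemoryVectorStore(),
    doc_store=InMemoryDocumentStore(),
    embedding=FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5"),
    cache_dir=None,  # skip writing chunk markdown files
)
index.run(["kotaemon indexes documents", "then retrieves them by similarity"])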
"},{"location":"reference/indices/#indices.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(*args, **kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n
"},{"location":"reference/indices/#indices.VectorRetrieval","title":"VectorRetrieval","text":" Bases: BaseRetrieval
Retrieve list of documents from vector store
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorRetrieval(BaseRetrieval):\n \"\"\"Retrieve list of documents from vector store\"\"\"\n\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n rerankers: Sequence[BaseReranking] = []\n top_k: int = 5\n first_round_top_k_mult: int = 10\n retrieval_mode: str = \"hybrid\" # vector, text, hybrid\n\n def _filter_docs(\n self, documents: list[RetrievedDocument], top_k: int | None = None\n ):\n if top_k:\n documents = documents[:top_k]\n return documents\n\n def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n ) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if 
isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = []\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
"},{"location":"reference/indices/#indices.VectorRetrieval.run","title":"run","text":"run(text, top_k=None, **kwargs)\n
Retrieve a list of documents from vector store
Parameters:
text (str | Document, required): the text to retrieve similar documents
top_k (Optional[int], default None): number of top similar documents to return
Returns:
list[RetrievedDocument]: list of retrieved documents
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def run(\n self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n \"\"\"Retrieve a list of documents from vector store\n\n Args:\n text: the text to retrieve similar documents\n top_k: number of top similar documents to return\n\n Returns:\n list[RetrievedDocument]: list of retrieved documents\n \"\"\"\n if top_k is None:\n top_k = self.top_k\n\n do_extend = kwargs.pop(\"do_extend\", False)\n thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n if do_extend:\n top_k_first_round = top_k * self.first_round_top_k_mult\n else:\n top_k_first_round = top_k\n\n if self.doc_store is None:\n raise ValueError(\n \"doc_store is not provided. Please provide a doc_store to \"\n \"retrieve the documents\"\n )\n\n result: list[RetrievedDocument] = []\n # TODO: should declare scope directly in the run params\n scope = kwargs.pop(\"scope\", None)\n emb: list[float]\n\n if self.retrieval_mode == \"vector\":\n emb = self.embedding(text)[0].embedding\n _, scores, ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n docs = self.doc_store.get(ids)\n result = [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(docs, scores)\n ]\n elif self.retrieval_mode == \"text\":\n query = text.text if isinstance(text, Document) else text\n docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n elif self.retrieval_mode == \"hybrid\":\n # similarity search section\n emb = self.embedding(text)[0].embedding\n vs_docs: list[RetrievedDocument] = []\n vs_ids: list[str] = []\n vs_scores: list[float] = []\n\n def query_vectorstore():\n nonlocal vs_docs\n nonlocal vs_scores\n nonlocal vs_ids\n\n assert self.doc_store is not None\n _, vs_scores, vs_ids = self.vector_store.query(\n embedding=emb, top_k=top_k_first_round, **kwargs\n )\n if vs_ids:\n vs_docs = self.doc_store.get(vs_ids)\n\n # full-text search section\n ds_docs: list[RetrievedDocument] = []\n\n def query_docstore():\n nonlocal ds_docs\n\n assert self.doc_store is not None\n query = text.text if isinstance(text, Document) else text\n ds_docs = self.doc_store.query(\n query, top_k=top_k_first_round, doc_ids=scope\n )\n\n vs_query_thread = threading.Thread(target=query_vectorstore)\n ds_query_thread = threading.Thread(target=query_docstore)\n\n vs_query_thread.start()\n ds_query_thread.start()\n\n vs_query_thread.join()\n ds_query_thread.join()\n\n result = [\n RetrievedDocument(**doc.to_dict(), score=-1.0)\n for doc in ds_docs\n if doc not in vs_ids\n ]\n result += [\n RetrievedDocument(**doc.to_dict(), score=score)\n for doc, score in zip(vs_docs, vs_scores)\n ]\n print(f\"Got {len(vs_docs)} from vectorstore\")\n print(f\"Got {len(ds_docs)} from docstore\")\n\n # use additional reranker to re-order the document list\n if self.rerankers and text:\n for reranker in self.rerankers:\n # if reranker is LLMReranking, limit the document with top_k items only\n if isinstance(reranker, LLMReranking):\n result = self._filter_docs(result, top_k=top_k)\n result = reranker(documents=result, query=text)\n\n result = self._filter_docs(result, top_k=top_k)\n print(f\"Got raw {len(result)} retrieved documents\")\n\n # add page thumbnails to the result if exists\n thumbnail_doc_ids: set[str] = set()\n # we should copy the text from retrieved text chunk\n # to the thumbnail to get relevant LLM score correctly\n text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n non_thumbnail_docs = 
[]\n raw_thumbnail_docs = []\n for doc in result:\n if doc.metadata.get(\"type\") == \"thumbnail\":\n # change type to image to display on UI\n doc.metadata[\"type\"] = \"image\"\n raw_thumbnail_docs.append(doc)\n continue\n if (\n \"thumbnail_doc_id\" in doc.metadata\n and len(thumbnail_doc_ids) < thumbnail_count\n ):\n thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n thumbnail_doc_ids.add(thumbnail_id)\n text_thumbnail_docs[thumbnail_id] = doc\n else:\n non_thumbnail_docs.append(doc)\n\n linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n print(\n \"thumbnail docs\",\n len(linked_thumbnail_docs),\n \"non-thumbnail docs\",\n len(non_thumbnail_docs),\n \"raw-thumbnail docs\",\n len(raw_thumbnail_docs),\n )\n additional_docs = []\n\n for thumbnail_doc in linked_thumbnail_docs:\n text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n doc_dict = thumbnail_doc.to_dict()\n doc_dict[\"_id\"] = text_doc.doc_id\n doc_dict[\"content\"] = text_doc.content\n doc_dict[\"metadata\"][\"type\"] = \"image\"\n for key in text_doc.metadata:\n if key not in doc_dict[\"metadata\"]:\n doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n result = additional_docs + non_thumbnail_docs\n\n if not result:\n # return output from raw retrieved thumbnails\n result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n return result\n
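Continuing the VectorIndexing sketch above (same index object), retrieval is a single call; retrieval_mode and top_k are the main knobs:

# `index` is the VectorIndexing pipeline built in the earlier sketch
retriever = index.to_retrieval_pipeline(retrieval_mode="vector")  # or "text" / "hybrid"
hits = retriever.run("which chunk talks about similarity?", top_k=2)
for doc in hits:
    print(f"{doc.score:.3f}", doc.text[:60])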
"},{"location":"reference/indices/base/","title":"Base","text":""},{"location":"reference/indices/base/#indices.base.DocTransformer","title":"DocTransformer","text":" Bases: BaseComponent
This is a base class for document transformers
A document transformer transforms a list of documents into another list of documents. Transforming can mean splitting a document into multiple documents, reducing a large list of documents into a smaller list of documents, or adding metadata to each document in a list of documents, etc.
Source code in libs/kotaemon/kotaemon/indices/base.py
class DocTransformer(BaseComponent):\n \"\"\"This is a base class for document transformers\n\n A document transformer transforms a list of documents into another list\n of documents. Transforming can mean splitting a document into multiple documents,\n reducing a large list of documents into a smaller list of documents, or adding\n metadata to each document in a list of documents, etc.\n \"\"\"\n\n @abstractmethod\n def run(\n self,\n documents: list[Document],\n **kwargs,\n ) -> list[Document]:\n ...\n
"},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin","title":"LlamaIndexDocTransformerMixin","text":"Allow automatically wrapping a Llama-index component into kotaemon component
Example:
class TokenSplitter(LlamaIndexMixin, BaseSplitter):
    def _get_li_class(self):
        from llama_index.core.text_splitter import TokenTextSplitter
        return TokenTextSplitter
To use this mixin, please:
1. Use this class as the 1st parent class, so that Python will prefer to use the attributes and methods of this class whenever possible.
2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.
libs/kotaemon/kotaemon/indices/base.py
class LlamaIndexDocTransformerMixin:\n \"\"\"Allow automatically wrapping a Llama-index component into kotaemon component\n\n Example:\n class TokenSplitter(LlamaIndexMixin, BaseSplitter):\n def _get_li_class(self):\n from llama_index.core.text_splitter import TokenTextSplitter\n return TokenTextSplitter\n\n To use this mixin, please:\n 1. Use this class as the 1st parent class, so that Python will prefer to use\n the attributes and methods of this class whenever possible.\n 2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.\n \"\"\"\n\n def _get_li_class(self) -> Type[NodeParser]:\n raise NotImplementedError(\n \"Please return the relevant LlamaIndex class in _get_li_class\"\n )\n\n def __init__(self, **params):\n self._li_cls = self._get_li_class()\n self._obj = self._li_cls(**params)\n self._kwargs = params\n super().__init__()\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name: str, value: Any) -> None:\n if name.startswith(\"_\") or name in self._protected_keywords():\n return super().__setattr__(name, value)\n\n self._kwargs[name] = value\n return setattr(self._obj, name, value)\n\n def __getattr__(self, name: str) -> Any:\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def run(\n self,\n documents: list[Document],\n **kwargs,\n ) -> list[Document]:\n \"\"\"Run Llama-index node parser and convert the output to Document from\n kotaemon\n \"\"\"\n docs = self._obj(documents, **kwargs) # type: ignore\n return [Document.from_dict(doc.to_dict()) for doc in docs]\n
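A runnable version of the docstring's TokenSplitter example; the BaseSplitter import path is an assumption, and constructor kwargs are forwarded to the wrapped LlamaIndex class:

from kotaemon.base import Document
from kotaemon.indices.base import LlamaIndexDocTransformerMixin
from kotaemon.indices.splitters import BaseSplitter  # assumed location


class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    def _get_li_class(self):
        from llama_index.core.text_splitter import TokenTextSplitter
        return TokenTextSplitter


splitter = TokenSplitter(chunk_size=128, chunk_overlap=16)
chunks = splitter([Document(text="a long document to split into token chunks")])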
"},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin.run","title":"run","text":"run(documents, **kwargs)\n
Run Llama-index node parser and convert the output to Document from kotaemon
Source code in libs/kotaemon/kotaemon/indices/base.py
def run(\n self,\n documents: list[Document],\n **kwargs,\n) -> list[Document]:\n \"\"\"Run Llama-index node parser and convert the output to Document from\n kotaemon\n \"\"\"\n docs = self._obj(documents, **kwargs) # type: ignore\n return [Document.from_dict(doc.to_dict()) for doc in docs]\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing","title":"BaseIndexing","text":" Bases: BaseComponent
Define the base interface for indexing pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
class BaseIndexing(BaseComponent):\n \"\"\"Define the base interface for indexing pipeline\"\"\"\n\n def to_retrieval_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n raise NotImplementedError\n\n def to_qa_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(**kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
def to_retrieval_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_qa_pipeline","title":"to_qa_pipeline","text":"to_qa_pipeline(**kwargs)\n
Convert the indexing pipeline to a QA pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
def to_qa_pipeline(self, **kwargs):\n \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/indices/base/#indices.base.BaseRetrieval","title":"BaseRetrieval","text":" Bases: BaseComponent
Define the base interface for retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/base.py
class BaseRetrieval(BaseComponent):\n \"\"\"Define the base interface for retrieval pipeline\"\"\"\n\n @abstractmethod\n def run(self, *args, **kwargs) -> list[RetrievedDocument]:\n ...\n
"},{"location":"reference/indices/vectorindex/","title":"Vectorindex","text":""},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing","title":"VectorIndexing","text":" Bases: BaseIndexing
Ingest the document, run through the embedding, and store the embedding in a vector store.
This pipeline supports the following set of inputs: a list of documents, or a list of texts. Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorIndexing(BaseIndexing):\n \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n vector store.\n\n This pipeline supports the following set of inputs:\n - List of documents\n - List of texts\n \"\"\"\n\n cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n vector_store: BaseVectorStore\n doc_store: Optional[BaseDocumentStore] = None\n embedding: BaseEmbeddings\n count_: int = 0\n\n def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n\n def to_qa_pipeline(self, *args, **kwargs):\n from .qa import CitationQAPipeline\n\n return TextVectorQA(\n retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n qa_pipeline=CitationQAPipeline(**kwargs),\n )\n\n def write_chunk_to_file(self, docs: list[Document]):\n # save the chunks content into markdown format\n if self.cache_dir:\n file_name = Path(docs[0].metadata[\"file_name\"])\n for i in range(len(docs)):\n markdown_content = \"\"\n if \"page_label\" in docs[i].metadata:\n page_label = str(docs[i].metadata[\"page_label\"])\n markdown_content += f\"Page label: {page_label}\"\n if \"file_name\" in docs[i].metadata:\n filename = docs[i].metadata[\"file_name\"]\n markdown_content += f\"\\nFile name: {filename}\"\n if \"section\" in docs[i].metadata:\n section = docs[i].metadata[\"section\"]\n markdown_content += f\"\\nSection: {section}\"\n if \"type\" in docs[i].metadata:\n if docs[i].metadata[\"type\"] == \"image\":\n image_origin = docs[i].metadata[\"image_origin\"]\n image_origin = f'<p><img src=\"{image_origin}\"></p>'\n markdown_content += f\"\\nImage origin: {image_origin}\"\n if docs[i].text:\n markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n \"w\",\n encoding=\"utf-8\",\n ) as f:\n f.write(markdown_content)\n\n def add_to_docstore(self, docs: list[Document]):\n if self.doc_store:\n print(\"Adding documents to doc store\")\n self.doc_store.add(docs)\n\n def add_to_vectorstore(self, docs: list[Document]):\n # in case we want to skip embedding\n if self.vector_store:\n print(f\"Getting embeddings for {len(docs)} nodes\")\n embeddings = self.embedding(docs)\n print(\"Adding embeddings to vector store\")\n self.vector_store.add(\n embeddings=embeddings,\n ids=[t.doc_id for t in docs],\n )\n\n def run(self, text: str | list[str] | Document | list[Document]):\n input_: list[Document] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in cast(list, text):\n if isinstance(item, str):\n input_.append(Document(text=item, id_=str(uuid.uuid4())))\n elif isinstance(item, Document):\n input_.append(item)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n self.add_to_vectorstore(input_)\n self.add_to_docstore(input_)\n self.write_chunk_to_file(input_)\n self.count_ += len(input_)\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"to_retrieval_pipeline(*args, **kwargs)\n
Convert the indexing pipeline to a retrieval pipeline
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def to_retrieval_pipeline(self, *args, **kwargs):\n \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n return VectorRetrieval(\n vector_store=self.vector_store,\n doc_store=self.doc_store,\n embedding=self.embedding,\n **kwargs,\n )\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval","title":"VectorRetrieval","text":" Bases: BaseRetrieval
Retrieve list of documents from vector store
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
class VectorRetrieval(BaseRetrieval):\n    \"\"\"Retrieve list of documents from vector store\"\"\"\n\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    rerankers: Sequence[BaseReranking] = []\n    top_k: int = 5\n    first_round_top_k_mult: int = 10\n    retrieval_mode: str = \"hybrid\"  # vector, text, hybrid\n\n    def _filter_docs(\n        self, documents: list[RetrievedDocument], top_k: int | None = None\n    ):\n        if top_k:\n            documents = documents[:top_k]\n        return documents\n\n    def run(\n        self, text: str | Document, top_k: Optional[int] = None, **kwargs\n    ) -> list[RetrievedDocument]:\n        \"\"\"Retrieve a list of documents from vector store\n\n        Args:\n            text: the text to retrieve similar documents\n            top_k: number of top similar documents to return\n\n        Returns:\n            list[RetrievedDocument]: list of retrieved documents\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n\n        do_extend = kwargs.pop(\"do_extend\", False)\n        thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n        if do_extend:\n            top_k_first_round = top_k * self.first_round_top_k_mult\n        else:\n            top_k_first_round = top_k\n\n        if self.doc_store is None:\n            raise ValueError(\n                \"doc_store is not provided. Please provide a doc_store to \"\n                \"retrieve the documents\"\n            )\n\n        result: list[RetrievedDocument] = []\n        # TODO: should declare scope directly in the run params\n        scope = kwargs.pop(\"scope\", None)\n        emb: list[float]\n\n        if self.retrieval_mode == \"vector\":\n            emb = self.embedding(text)[0].embedding\n            _, scores, ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            docs = self.doc_store.get(ids)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(docs, scores)\n            ]\n        elif self.retrieval_mode == \"text\":\n            query = text.text if isinstance(text, Document) else text\n            docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n            result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n        elif self.retrieval_mode == \"hybrid\":\n            # similarity search section\n            emb = self.embedding(text)[0].embedding\n            vs_docs: list[RetrievedDocument] = []\n            vs_ids: list[str] = []\n            vs_scores: list[float] = []\n\n            def query_vectorstore():\n                nonlocal vs_docs\n                nonlocal vs_scores\n                nonlocal vs_ids\n\n                assert self.doc_store is not None\n                _, vs_scores, vs_ids = self.vector_store.query(\n                    embedding=emb, top_k=top_k_first_round, **kwargs\n                )\n                if vs_ids:\n                    vs_docs = self.doc_store.get(vs_ids)\n\n            # full-text search section\n            ds_docs: list[RetrievedDocument] = []\n\n            def query_docstore():\n                nonlocal ds_docs\n\n                assert self.doc_store is not None\n                query = text.text if isinstance(text, Document) else text\n                ds_docs = self.doc_store.query(\n                    query, top_k=top_k_first_round, doc_ids=scope\n                )\n\n            vs_query_thread = threading.Thread(target=query_vectorstore)\n            ds_query_thread = threading.Thread(target=query_docstore)\n\n            vs_query_thread.start()\n            ds_query_thread.start()\n\n            vs_query_thread.join()\n            ds_query_thread.join()\n\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=-1.0)\n                for doc in ds_docs\n                if doc not in vs_ids\n            ]\n            result += [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(vs_docs, vs_scores)\n            ]\n            print(f\"Got {len(vs_docs)} from vectorstore\")\n            print(f\"Got {len(ds_docs)} from docstore\")\n\n        # use additional reranker to re-order the document list\n        if self.rerankers and text:\n            for reranker in self.rerankers:\n                # if reranker is LLMReranking, limit the document with top_k items only\n                if isinstance(reranker, LLMReranking):\n                    result = self._filter_docs(result, top_k=top_k)\n                result = reranker(documents=result, query=text)\n\n        result = self._filter_docs(result, top_k=top_k)\n        print(f\"Got raw {len(result)} retrieved documents\")\n\n        # add page thumbnails to the result if exists\n        thumbnail_doc_ids: set[str] = set()\n        # we should copy the text from retrieved text chunk\n        # to the thumbnail to get relevant LLM score correctly\n        text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n        non_thumbnail_docs = []\n        raw_thumbnail_docs = []\n        for doc in result:\n            if doc.metadata.get(\"type\") == \"thumbnail\":\n                # change type to image to display on UI\n                doc.metadata[\"type\"] = \"image\"\n                raw_thumbnail_docs.append(doc)\n                continue\n            if (\n                \"thumbnail_doc_id\" in doc.metadata\n                and len(thumbnail_doc_ids) < thumbnail_count\n            ):\n                thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n                thumbnail_doc_ids.add(thumbnail_id)\n                text_thumbnail_docs[thumbnail_id] = doc\n            else:\n                non_thumbnail_docs.append(doc)\n\n        linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n        print(\n            \"thumbnail docs\",\n            len(linked_thumbnail_docs),\n            \"non-thumbnail docs\",\n            len(non_thumbnail_docs),\n            \"raw-thumbnail docs\",\n            len(raw_thumbnail_docs),\n        )\n        additional_docs = []\n\n        for thumbnail_doc in linked_thumbnail_docs:\n            text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n            doc_dict = thumbnail_doc.to_dict()\n            doc_dict[\"_id\"] = text_doc.doc_id\n            doc_dict[\"content\"] = text_doc.content\n            doc_dict[\"metadata\"][\"type\"] = \"image\"\n            for key in text_doc.metadata:\n                if key not in doc_dict[\"metadata\"]:\n                    doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n            additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n        result = additional_docs + non_thumbnail_docs\n\n        if not result:\n            # return output from raw retrieved thumbnails\n            result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n        return result\n
"},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval.run","title":"run","text":"run(text, top_k=None, **kwargs)\n
Retrieve a list of documents from vector store
Parameters:
text (str | Document): the text to retrieve similar documents. Required.
top_k (Optional[int]): number of top similar documents to return. Default: None.
Returns:
list[RetrievedDocument]: list of retrieved documents.
Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
def run(\n    self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n    \"\"\"Retrieve a list of documents from vector store\n\n    Args:\n        text: the text to retrieve similar documents\n        top_k: number of top similar documents to return\n\n    Returns:\n        list[RetrievedDocument]: list of retrieved documents\n    \"\"\"\n    if top_k is None:\n        top_k = self.top_k\n\n    do_extend = kwargs.pop(\"do_extend\", False)\n    thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n    if do_extend:\n        top_k_first_round = top_k * self.first_round_top_k_mult\n    else:\n        top_k_first_round = top_k\n\n    if self.doc_store is None:\n        raise ValueError(\n            \"doc_store is not provided. Please provide a doc_store to \"\n            \"retrieve the documents\"\n        )\n\n    result: list[RetrievedDocument] = []\n    # TODO: should declare scope directly in the run params\n    scope = kwargs.pop(\"scope\", None)\n    emb: list[float]\n\n    if self.retrieval_mode == \"vector\":\n        emb = self.embedding(text)[0].embedding\n        _, scores, ids = self.vector_store.query(\n            embedding=emb, top_k=top_k_first_round, **kwargs\n        )\n        docs = self.doc_store.get(ids)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(docs, scores)\n        ]\n    elif self.retrieval_mode == \"text\":\n        query = text.text if isinstance(text, Document) else text\n        docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n    elif self.retrieval_mode == \"hybrid\":\n        # similarity search section\n        emb = self.embedding(text)[0].embedding\n        vs_docs: list[RetrievedDocument] = []\n        vs_ids: list[str] = []\n        vs_scores: list[float] = []\n\n        def query_vectorstore():\n            nonlocal vs_docs\n            nonlocal vs_scores\n            nonlocal vs_ids\n\n            assert self.doc_store is not None\n            _, vs_scores, vs_ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            if vs_ids:\n                vs_docs = self.doc_store.get(vs_ids)\n\n        # full-text search section\n        ds_docs: list[RetrievedDocument] = []\n\n        def query_docstore():\n            nonlocal ds_docs\n\n            assert self.doc_store is not None\n            query = text.text if isinstance(text, Document) else text\n            ds_docs = self.doc_store.query(\n                query, top_k=top_k_first_round, doc_ids=scope\n            )\n\n        vs_query_thread = threading.Thread(target=query_vectorstore)\n        ds_query_thread = threading.Thread(target=query_docstore)\n\n        vs_query_thread.start()\n        ds_query_thread.start()\n\n        vs_query_thread.join()\n        ds_query_thread.join()\n\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=-1.0)\n            for doc in ds_docs\n            if doc not in vs_ids\n        ]\n        result += [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(vs_docs, vs_scores)\n        ]\n        print(f\"Got {len(vs_docs)} from vectorstore\")\n        print(f\"Got {len(ds_docs)} from docstore\")\n\n    # use additional reranker to re-order the document list\n    if self.rerankers and text:\n        for reranker in self.rerankers:\n            # if reranker is LLMReranking, limit the document with top_k items only\n            if isinstance(reranker, LLMReranking):\n                result = self._filter_docs(result, top_k=top_k)\n            result = reranker(documents=result, query=text)\n\n    result = self._filter_docs(result, top_k=top_k)\n    print(f\"Got raw {len(result)} retrieved documents\")\n\n    # add page thumbnails to the result if exists\n    thumbnail_doc_ids: set[str] = set()\n    # we should copy the text from retrieved text chunk\n    # to the thumbnail to get relevant LLM score correctly\n    text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n    non_thumbnail_docs = []\n    raw_thumbnail_docs = []\n    for doc in result:\n        if doc.metadata.get(\"type\") == \"thumbnail\":\n            # change type to image to display on UI\n            doc.metadata[\"type\"] = \"image\"\n            raw_thumbnail_docs.append(doc)\n            continue\n        if (\n            \"thumbnail_doc_id\" in doc.metadata\n            and len(thumbnail_doc_ids) < thumbnail_count\n        ):\n            thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n            thumbnail_doc_ids.add(thumbnail_id)\n            text_thumbnail_docs[thumbnail_id] = doc\n        else:\n            non_thumbnail_docs.append(doc)\n\n    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n    print(\n        \"thumbnail docs\",\n        len(linked_thumbnail_docs),\n        \"non-thumbnail docs\",\n        len(non_thumbnail_docs),\n        \"raw-thumbnail docs\",\n        len(raw_thumbnail_docs),\n    )\n    additional_docs = []\n\n    for thumbnail_doc in linked_thumbnail_docs:\n        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n        doc_dict = thumbnail_doc.to_dict()\n        doc_dict[\"_id\"] = text_doc.doc_id\n        doc_dict[\"content\"] = text_doc.content\n        doc_dict[\"metadata\"][\"type\"] = \"image\"\n        for key in text_doc.metadata:\n            if key not in doc_dict[\"metadata\"]:\n                doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n    result = additional_docs + non_thumbnail_docs\n\n    if not result:\n        # return output from raw retrieved thumbnails\n        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n    return result\n
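For orientation, a minimal usage sketch (my_vector_store, my_doc_store, and my_embedding stand for whichever pre-built BaseVectorStore, BaseDocumentStore, and BaseEmbeddings instances you have configured; these names and the query string are illustrative):
# assumed: my_vector_store, my_doc_store, my_embedding are pre-built instances\nretrieval = VectorRetrieval(\n    vector_store=my_vector_store,\n    doc_store=my_doc_store,\n    embedding=my_embedding,\n    retrieval_mode=\"hybrid\",  # run vector and full-text search in parallel\n    top_k=5,\n)\n# returns at most top_k RetrievedDocument objects, re-ranked if rerankers are set\ndocs = retrieval.run(\"What is the warranty period?\")\nfor doc in docs:\n    print(doc.score, doc.metadata.get(\"type\"))\n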
"},{"location":"reference/indices/extractors/","title":"Extractors","text":""},{"location":"reference/indices/extractors/doc_parsers/","title":"Doc Parsers","text":""},{"location":"reference/indices/ingests/","title":"Ingests","text":""},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor","title":"DocumentIngestor","text":" Bases: BaseComponent
Ingest common office document types into Document for indexing
Document types: pdf; xlsx, xls; docx, doc
Parameters:
pdf_mode: mode for pdf extraction, one of "normal", "mathpix", "ocr" (normal: parse pdf text; mathpix: parse pdf text using mathpix; ocr: parse pdf image using flax). The source below also accepts "multimodal", which uses the AdobeReader. Required.
doc_parsers: list of document parsers to parse the document. Required.
text_splitter: splitter to split the document into text nodes. Required.
override_file_extractors: override file extractors for specific file extensions. The default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS. Required.
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
class DocumentIngestor(BaseComponent):\n \"\"\"Ingest common office document types into Document for indexing\n\n Document types:\n - pdf\n - xlsx, xls\n - docx, doc\n\n Args:\n pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n - normal: parse pdf text\n - mathpix: parse pdf text using mathpix\n - ocr: parse pdf image using flax\n doc_parsers: list of document parsers to parse the document\n text_splitter: splitter to split the document into text nodes\n override_file_extractors: override file extractors for specific file extensions\n The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n \"\"\"\n\n pdf_mode: str = \"normal\" # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n text_splitter: BaseSplitter = TokenSplitter.withx(\n chunk_size=1024,\n chunk_overlap=256,\n separator=\"\\n\\n\",\n backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n )\n override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n def _get_reader(self, input_files: list[str | Path]):\n \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n file_extractors: dict[str, BaseReader] = {\n ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n }\n for ext, cls in self.override_file_extractors.items():\n file_extractors[ext] = cls()\n\n if self.pdf_mode == \"normal\":\n file_extractors[\".pdf\"] = PDFReader()\n elif self.pdf_mode == \"ocr\":\n file_extractors[\".pdf\"] = OCRReader()\n elif self.pdf_mode == \"multimodal\":\n file_extractors[\".pdf\"] = AdobeReader()\n else:\n file_extractors[\".pdf\"] = MathpixPDFReader()\n\n main_reader = DirectoryReader(\n input_files=input_files,\n file_extractor=file_extractors,\n )\n\n return main_reader\n\n def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
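A short usage sketch (the file paths are illustrative; any of the document types listed above work the same way):
ingestor = DocumentIngestor(pdf_mode=\"normal\")  # parse pdf text directly\n# accepts a single path or a list of paths; returns text nodes ready for indexing\nnodes = ingestor.run([\"reports/annual_report.pdf\", \"data/sales.xlsx\"])\nprint(len(nodes))\n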
"},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor.run","title":"run","text":"run(file_paths)\n
Ingest the file paths into Document
Parameters:
file_paths (list[str | Path] | str | Path): list of file paths or a single file path. Required.
Returns:
list[Document]: list of parsed Documents.
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/ingests/files/","title":"Files","text":""},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor","title":"DocumentIngestor","text":" Bases: BaseComponent
Ingest common office document types into Document for indexing
Document types: pdf; xlsx, xls; docx, doc
Parameters:
pdf_mode: mode for pdf extraction, one of "normal", "mathpix", "ocr" (normal: parse pdf text; mathpix: parse pdf text using mathpix; ocr: parse pdf image using flax). The source below also accepts "multimodal", which uses the AdobeReader. Required.
doc_parsers: list of document parsers to parse the document. Required.
text_splitter: splitter to split the document into text nodes. Required.
override_file_extractors: override file extractors for specific file extensions. The default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS. Required.
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
class DocumentIngestor(BaseComponent):\n \"\"\"Ingest common office document types into Document for indexing\n\n Document types:\n - pdf\n - xlsx, xls\n - docx, doc\n\n Args:\n pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n - normal: parse pdf text\n - mathpix: parse pdf text using mathpix\n - ocr: parse pdf image using flax\n doc_parsers: list of document parsers to parse the document\n text_splitter: splitter to split the document into text nodes\n override_file_extractors: override file extractors for specific file extensions\n The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n \"\"\"\n\n pdf_mode: str = \"normal\" # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n text_splitter: BaseSplitter = TokenSplitter.withx(\n chunk_size=1024,\n chunk_overlap=256,\n separator=\"\\n\\n\",\n backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n )\n override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n def _get_reader(self, input_files: list[str | Path]):\n \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n file_extractors: dict[str, BaseReader] = {\n ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n }\n for ext, cls in self.override_file_extractors.items():\n file_extractors[ext] = cls()\n\n if self.pdf_mode == \"normal\":\n file_extractors[\".pdf\"] = PDFReader()\n elif self.pdf_mode == \"ocr\":\n file_extractors[\".pdf\"] = OCRReader()\n elif self.pdf_mode == \"multimodal\":\n file_extractors[\".pdf\"] = AdobeReader()\n else:\n file_extractors[\".pdf\"] = MathpixPDFReader()\n\n main_reader = DirectoryReader(\n input_files=input_files,\n file_extractor=file_extractors,\n )\n\n return main_reader\n\n def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor.run","title":"run","text":"run(file_paths)\n
Ingest the file paths into Document
Parameters:
file_paths (list[str | Path] | str | Path): list of file paths or a single file path. Required.
Returns:
list[Document]: list of parsed Documents.
Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n \"\"\"Ingest the file paths into Document\n\n Args:\n file_paths: list of file paths or a single file path\n\n Returns:\n list of parsed Documents\n \"\"\"\n if not isinstance(file_paths, list):\n file_paths = [file_paths]\n\n documents = self._get_reader(input_files=file_paths)()\n print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n nodes = self.text_splitter(documents)\n print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n # document parsers call\n if self.doc_parsers:\n for parser in self.doc_parsers:\n nodes = parser(nodes)\n\n return nodes\n
"},{"location":"reference/indices/qa/","title":"Qa","text":""},{"location":"reference/indices/qa/#indices.qa.CitationPipeline","title":"CitationPipeline","text":" Bases: BaseComponent
Citation pipeline to extract cited evidence from the source (based on the input question)
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CitationPipeline(BaseComponent):\n \"\"\"Citation pipeline to extract cited evidences from source\n (based on input question)\"\"\"\n\n llm: BaseLLM\n\n def run(self, context: str, question: str):\n return self.invoke(context, question)\n\n def prepare_llm(self, context: str, question: str):\n schema = CiteEvidence.schema()\n function = {\n \"name\": schema[\"title\"],\n \"description\": schema[\"description\"],\n \"parameters\": schema,\n }\n llm_kwargs = {\n \"tools\": [{\"type\": \"function\", \"function\": function}],\n \"tool_choice\": \"required\",\n \"tools_pydantic\": [CiteEvidence],\n }\n messages = [\n SystemMessage(\n content=(\n \"You are a world class algorithm to answer \"\n \"questions with correct and exact citations.\"\n )\n ),\n HumanMessage(\n content=(\n \"Answer question using the following context. \"\n \"Use the provided function CiteEvidence() to cite your sources.\"\n )\n ),\n HumanMessage(content=context),\n HumanMessage(content=f\"Question: {question}\"),\n HumanMessage(\n content=(\n \"Tips: Make sure to cite your sources, \"\n \"and use the exact words from the context.\"\n )\n ),\n ]\n return messages, llm_kwargs\n\n def invoke(self, context: str, question: str):\n messages, llm_kwargs = self.prepare_llm(context, question)\n try:\n print(\"CitationPipeline: invoking LLM\")\n llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n print(\"CitationPipeline: finish invoking LLM\")\n if not llm_output.additional_kwargs.get(\"tool_calls\"):\n return None\n\n first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n if \"function\" in first_func:\n # openai and cohere format\n function_output = first_func[\"function\"][\"arguments\"]\n else:\n # anthropic format\n function_output = first_func[\"args\"]\n\n print(\"CitationPipeline:\", function_output)\n\n if isinstance(function_output, str):\n output = CiteEvidence.parse_raw(function_output)\n else:\n output = CiteEvidence.parse_obj(function_output)\n except Exception as e:\n print(e)\n return None\n\n return output\n\n async def ainvoke(self, context: str, question: str):\n raise NotImplementedError()\n
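A hedged usage sketch (llm stands for any configured BaseLLM that supports tool calling, since the pipeline sends the CiteEvidence schema as a function tool; the context and question strings are illustrative):
citation = CitationPipeline(llm=llm)\noutput = citation.run(\n    context=\"The warranty period is 24 months from the date of purchase.\",\n    question=\"How long is the warranty?\",\n)\n# output is a CiteEvidence instance, or None if the LLM made no tool call\nif output is not None:\n    print(output.evidences)\n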
"},{"location":"reference/indices/qa/#indices.qa.CitationQAPipeline","title":"CitationQAPipeline","text":" Bases: BaseComponent
Answering questions from a text corpus with citations
Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
class CitationQAPipeline(BaseComponent):\n \"\"\"Answering question from a text corpus with citation\"\"\"\n\n qa_prompt_template: PromptTemplate = PromptTemplate(\n 'Answer the following question: \"{question}\". '\n \"The context is: \\n{context}\\nAnswer: \"\n )\n llm: BaseLLM = LCAzureChatOpenAI.withx(\n azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n openai_api_version=\"2023-07-01-preview\",\n deployment_name=\"dummy-q2-16k\",\n temperature=0,\n request_timeout=60,\n )\n citation_pipeline: CitationPipeline = Node(\n default_callback=lambda self: CitationPipeline(llm=self.llm)\n )\n\n def _format_doc_text(self, text: str) -> str:\n \"\"\"Format the text of each document\"\"\"\n return text.replace(\"\\n\", \" \")\n\n def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n \"\"\"Format the texts between all documents\"\"\"\n matched_texts: list[str] = [\n self._format_doc_text(doc.text) for doc in documents\n ]\n return \"\\n\\n\".join(matched_texts)\n\n def run(\n self,\n question: str,\n documents: list[RetrievedDocument],\n use_citation: bool = False,\n **kwargs\n ) -> Document:\n # retrieve relevant documents as context\n context = self._format_retrieved_context(documents)\n self.log_progress(\".context\", context=context)\n\n # generate the answer\n prompt = self.qa_prompt_template.populate(\n context=context,\n question=question,\n )\n self.log_progress(\".prompt\", prompt=prompt)\n answer_text = self.llm(prompt).text\n if use_citation:\n citation = self.citation_pipeline(context=context, question=question)\n else:\n citation = None\n\n answer = Document(text=answer_text, metadata={\"citation\": citation})\n return answer\n
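A minimal sketch of answering over retrieved documents (llm and retrieved_docs are assumed: any configured BaseLLM and a list[RetrievedDocument] from a retriever such as VectorRetrieval; the Azure deployment in the source's default llm is a placeholder, so you would normally pass your own):
qa = CitationQAPipeline(llm=llm)\nanswer = qa.run(\n    question=\"How long is the warranty?\",\n    documents=retrieved_docs,\n    use_citation=True,  # additionally runs CitationPipeline over the same context\n)\nprint(answer.text)\nprint(answer.metadata[\"citation\"])\n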
"},{"location":"reference/indices/qa/citation/","title":"Citation","text":""},{"location":"reference/indices/qa/citation/#indices.qa.citation.CiteEvidence","title":"CiteEvidence","text":" Bases: BaseModel
List of evidence strings (maximum 5) to support the answer.
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CiteEvidence(BaseModel):\n \"\"\"List of evidences (maximum 5) to support the answer.\"\"\"\n\n evidences: List[str] = Field(\n ...,\n description=(\n \"Each source should be a direct quote from the context, \"\n \"as a substring of the original content (max 15 words).\"\n ),\n )\n
"},{"location":"reference/indices/qa/citation/#indices.qa.citation.CitationPipeline","title":"CitationPipeline","text":" Bases: BaseComponent
Citation pipeline to extract cited evidence from the source (based on the input question)
Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
class CitationPipeline(BaseComponent):\n \"\"\"Citation pipeline to extract cited evidences from source\n (based on input question)\"\"\"\n\n llm: BaseLLM\n\n def run(self, context: str, question: str):\n return self.invoke(context, question)\n\n def prepare_llm(self, context: str, question: str):\n schema = CiteEvidence.schema()\n function = {\n \"name\": schema[\"title\"],\n \"description\": schema[\"description\"],\n \"parameters\": schema,\n }\n llm_kwargs = {\n \"tools\": [{\"type\": \"function\", \"function\": function}],\n \"tool_choice\": \"required\",\n \"tools_pydantic\": [CiteEvidence],\n }\n messages = [\n SystemMessage(\n content=(\n \"You are a world class algorithm to answer \"\n \"questions with correct and exact citations.\"\n )\n ),\n HumanMessage(\n content=(\n \"Answer question using the following context. \"\n \"Use the provided function CiteEvidence() to cite your sources.\"\n )\n ),\n HumanMessage(content=context),\n HumanMessage(content=f\"Question: {question}\"),\n HumanMessage(\n content=(\n \"Tips: Make sure to cite your sources, \"\n \"and use the exact words from the context.\"\n )\n ),\n ]\n return messages, llm_kwargs\n\n def invoke(self, context: str, question: str):\n messages, llm_kwargs = self.prepare_llm(context, question)\n try:\n print(\"CitationPipeline: invoking LLM\")\n llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n print(\"CitationPipeline: finish invoking LLM\")\n if not llm_output.additional_kwargs.get(\"tool_calls\"):\n return None\n\n first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n if \"function\" in first_func:\n # openai and cohere format\n function_output = first_func[\"function\"][\"arguments\"]\n else:\n # anthropic format\n function_output = first_func[\"args\"]\n\n print(\"CitationPipeline:\", function_output)\n\n if isinstance(function_output, str):\n output = CiteEvidence.parse_raw(function_output)\n else:\n output = CiteEvidence.parse_obj(function_output)\n except Exception as e:\n print(e)\n return None\n\n return output\n\n async def ainvoke(self, context: str, question: str):\n raise NotImplementedError()\n
"},{"location":"reference/indices/qa/text_based/","title":"Text Based","text":""},{"location":"reference/indices/qa/text_based/#indices.qa.text_based.CitationQAPipeline","title":"CitationQAPipeline","text":" Bases: BaseComponent
Answering questions from a text corpus with citations
Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
class CitationQAPipeline(BaseComponent):\n \"\"\"Answering question from a text corpus with citation\"\"\"\n\n qa_prompt_template: PromptTemplate = PromptTemplate(\n 'Answer the following question: \"{question}\". '\n \"The context is: \\n{context}\\nAnswer: \"\n )\n llm: BaseLLM = LCAzureChatOpenAI.withx(\n azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n openai_api_version=\"2023-07-01-preview\",\n deployment_name=\"dummy-q2-16k\",\n temperature=0,\n request_timeout=60,\n )\n citation_pipeline: CitationPipeline = Node(\n default_callback=lambda self: CitationPipeline(llm=self.llm)\n )\n\n def _format_doc_text(self, text: str) -> str:\n \"\"\"Format the text of each document\"\"\"\n return text.replace(\"\\n\", \" \")\n\n def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n \"\"\"Format the texts between all documents\"\"\"\n matched_texts: list[str] = [\n self._format_doc_text(doc.text) for doc in documents\n ]\n return \"\\n\\n\".join(matched_texts)\n\n def run(\n self,\n question: str,\n documents: list[RetrievedDocument],\n use_citation: bool = False,\n **kwargs\n ) -> Document:\n # retrieve relevant documents as context\n context = self._format_retrieved_context(documents)\n self.log_progress(\".context\", context=context)\n\n # generate the answer\n prompt = self.qa_prompt_template.populate(\n context=context,\n question=question,\n )\n self.log_progress(\".prompt\", prompt=prompt)\n answer_text = self.llm(prompt).text\n if use_citation:\n citation = self.citation_pipeline(context=context, question=question)\n else:\n citation = None\n\n answer = Document(text=answer_text, metadata={\"citation\": citation})\n return answer\n
"},{"location":"reference/indices/rankings/","title":"Rankings","text":""},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking","title":"BaseReranking","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
class BaseReranking(BaseComponent):\n @abstractmethod\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
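Since BaseReranking only fixes the run(documents, query) contract, a custom reranker is a small subclass. A sketch (the keyword-overlap heuristic is purely illustrative; the import paths follow the reference locations on this page):
from kotaemon.base import Document\nfrom kotaemon.indices.rankings import BaseReranking\n\nclass KeywordOverlapReranking(BaseReranking):\n    \"\"\"Illustrative reranker: order documents by naive keyword overlap\"\"\"\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        terms = set(query.lower().split())\n        return sorted(\n            documents,\n            key=lambda d: len(terms & set(d.get_content().lower().split())),\n            reverse=True,\n        )\n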
"},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking.run","title":"run abstractmethod
","text":"run(documents, query)\n
Main method to transform a list of documents (re-ranking, filtering, etc.)
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
@abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking","title":"CohereReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
class CohereReranking(BaseReranking):\n model_name: str = \"rerank-multilingual-v2.0\"\n cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n use_key_from_ktem: bool = False\n\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking.run","title":"run","text":"run(documents, query)\n
Use the Cohere Reranker model to re-order documents by their relevance score
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
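Usage sketch (requires the cohere package and a valid API key; as the source above shows, a missing key makes the component return the documents unchanged):
reranker = CohereReranking()  # reads COHERE_API_KEY from the environment by default\nreranked = reranker.run(documents=docs, query=\"What is the warranty period?\")\nfor doc in reranked:\n    print(doc.metadata[\"cohere_reranking_score\"])\n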
"},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking","title":"LLMReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
class LLMReranking(BaseReranking):\n llm: BaseLLM\n prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n top_k: int = 3\n concurrent: bool = True\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
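Usage sketch (llm is any configured BaseLLM; the component keeps only documents the LLM judges relevant and falls back to the first top_k when everything is filtered out):
reranker = LLMReranking(llm=llm, top_k=3, concurrent=True)\nkept = reranker.run(documents=docs, query=\"What is the warranty period?\")\nprint(len(kept))  # never 0 when docs is non-empty, thanks to the top_k fallback\n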
"},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring","title":"LLMScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
class LLMScoring(LLMReranking):\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
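The score is the geometric mean of the token probabilities of the LLM's yes/no reply, i.e. np.exp(np.average(logprobs)). A small worked check (the logprob values are illustrative):
import numpy as np\n\nlogprobs = [-0.1, -0.2]  # per-token log-probabilities of the reply\nscore = np.exp(np.average(logprobs))  # exp(-0.15), the geometric-mean probability\nprint(round(float(score), 3))  # 0.861; an excluded document would get 1 - 0.861\n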
"},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring","title":"LLMTrulensScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
class LLMTrulensScoring(LLMReranking):\n llm: BaseLLM\n system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n concurrent: bool = True\n normalize: float = 10\n trim_func: TokenSplitter = TokenSplitter.withx(\n chunk_size=MAX_CONTEXT_LEN,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
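Usage sketch (llm is any configured chat BaseLLM; each document is rated 0-10 and the rating is divided by normalize=10, so scores land in [0, 1]):
scorer = LLMTrulensScoring(llm=llm)\nscored = scorer.run(documents=docs, query=\"What is the warranty period?\")\n# documents come back sorted by llm_trulens_score, best first\nfor doc in scored:\n    print(doc.metadata[\"llm_trulens_score\"])  # e.g. 0.8 for a rating of 8\n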
"},{"location":"reference/indices/rankings/base/","title":"Base","text":""},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking","title":"BaseReranking","text":" Bases: BaseComponent
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
class BaseReranking(BaseComponent):\n @abstractmethod\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking.run","title":"run abstractmethod
","text":"run(documents, query)\n
Main method to transform a list of documents (re-ranking, filtering, etc.)
Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
@abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Main method to transform list of documents\n (re-ranking, filtering, etc)\"\"\"\n ...\n
"},{"location":"reference/indices/rankings/cohere/","title":"Cohere","text":""},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking","title":"CohereReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
class CohereReranking(BaseReranking):\n model_name: str = \"rerank-multilingual-v2.0\"\n cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n use_key_from_ktem: bool = False\n\n def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking.run","title":"run","text":"run(documents, query)\n
Use the Cohere Reranker model to re-order documents by their relevance score
Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
def run(self, documents: list[Document], query: str) -> list[Document]:\n \"\"\"Use Cohere Reranker model to re-order documents\n with their relevance score\"\"\"\n try:\n import cohere\n except ImportError:\n raise ImportError(\n \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n )\n\n # try to get COHERE_API_KEY from embeddings\n if not self.cohere_api_key and self.use_key_from_ktem:\n try:\n from ktem.embeddings.manager import (\n embedding_models_manager as embeddings,\n )\n\n cohere_model = embeddings.get(\"cohere\")\n ktem_cohere_api_key = cohere_model._kwargs.get( # type: ignore\n \"cohere_api_key\"\n )\n if ktem_cohere_api_key != \"your-key\":\n self.cohere_api_key = ktem_cohere_api_key\n except Exception as e:\n print(\"Cannot get Cohere API key from `ktem`\", e)\n\n if not self.cohere_api_key:\n print(\"Cohere API key not found. Skipping reranking.\")\n return documents\n\n cohere_client = cohere.Client(self.cohere_api_key)\n compressed_docs: list[Document] = []\n\n if not documents: # to avoid empty api call\n return compressed_docs\n\n _docs = [d.content for d in documents]\n response = cohere_client.rerank(\n model=self.model_name, query=query, documents=_docs\n )\n # print(\"Cohere score\", [r.relevance_score for r in response.results])\n for r in response.results:\n doc = documents[r.index]\n doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n compressed_docs.append(doc)\n\n return compressed_docs\n
"},{"location":"reference/indices/rankings/llm/","title":"Llm","text":""},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking","title":"LLMReranking","text":" Bases: BaseReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
class LLMReranking(BaseReranking):\n llm: BaseLLM\n prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n top_k: int = 3\n concurrent: bool = True\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [output_parser.parse(result) for result in results]\n for include_doc, doc in zip(results, documents):\n if include_doc:\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_scoring/","title":"Llm Scoring","text":""},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring","title":"LLMScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
class LLMScoring(LLMReranking):\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs: list[Document] = []\n output_parser = BooleanOutputParser()\n\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n _prompt = self.prompt_template.populate(\n question=query, context=doc.get_content()\n )\n results.append(self.llm(_prompt))\n\n for result, doc in zip(results, documents):\n score = np.exp(np.average(result.logprobs))\n include_doc = output_parser.parse(result.text)\n if include_doc:\n doc.metadata[\"llm_reranking_score\"] = score\n else:\n doc.metadata[\"llm_reranking_score\"] = 1 - score\n filtered_docs.append(doc)\n\n # prevent returning empty result\n if len(filtered_docs) == 0:\n filtered_docs = documents[: self.top_k]\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/","title":"Llm Trulens","text":""},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.PATTERN_INTEGER","title":"PATTERN_INTEGER module-attribute
","text":"PATTERN_INTEGER = compile('([+-]?[1-9][0-9]*|0)')\n
Regex that matches integers.
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring","title":"LLMTrulensScoring","text":" Bases: LLMReranking
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
class LLMTrulensScoring(LLMReranking):\n llm: BaseLLM\n system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n concurrent: bool = True\n normalize: float = 10\n trim_func: TokenSplitter = TokenSplitter.withx(\n chunk_size=MAX_CONTEXT_LEN,\n chunk_overlap=0,\n separator=\" \",\n tokenizer=partial(\n tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n allowed_special=set(),\n disallowed_special=\"all\",\n ),\n )\n\n def run(\n self,\n documents: list[Document],\n query: str,\n ) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring.run","title":"run","text":"run(documents, query)\n
Filter down documents based on their relevance to the query.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def run(\n self,\n documents: list[Document],\n query: str,\n) -> list[Document]:\n \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n filtered_docs = []\n\n documents = sorted(documents, key=lambda doc: doc.get_content())\n if self.concurrent:\n with ThreadPoolExecutor() as executor:\n futures = []\n for doc in documents:\n chunked_doc_content = self.trim_func(\n [\n Document(content=doc.get_content())\n # skip metadata which cause troubles\n ]\n )[0].text\n\n messages = []\n messages.append(\n SystemMessage(self.system_prompt_template.populate())\n )\n messages.append(\n HumanMessage(\n self.user_prompt_template.populate(\n question=query, context=chunked_doc_content\n )\n )\n )\n\n def llm_call():\n return self.llm(messages).text\n\n futures.append(executor.submit(llm_call))\n\n results = [future.result() for future in futures]\n else:\n results = []\n for doc in documents:\n messages = []\n messages.append(SystemMessage(self.system_prompt_template.populate()))\n messages.append(\n SystemMessage(\n self.user_prompt_template.populate(\n question=query, context=doc.get_content()\n )\n )\n )\n results.append(self.llm(messages).text)\n\n # use Boolean parser to extract relevancy output from LLM\n results = [\n (r_idx, float(re_0_10_rating(result)) / self.normalize)\n for r_idx, result in enumerate(results)\n ]\n results.sort(key=lambda x: x[1], reverse=True)\n\n for r_idx, score in results:\n doc = documents[r_idx]\n doc.metadata[\"llm_trulens_score\"] = score\n filtered_docs.append(doc)\n\n print(\n \"LLM rerank scores\",\n [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n )\n\n return filtered_docs\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.validate_rating","title":"validate_rating","text":"validate_rating(rating)\n
Validate a rating is between 0 and 10.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def validate_rating(rating) -> int:\n \"\"\"Validate a rating is between 0 and 10.\"\"\"\n\n if not 0 <= rating <= 10:\n raise ValueError(\"Rating must be between 0 and 10\")\n\n return rating\n
"},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.re_0_10_rating","title":"re_0_10_rating","text":"re_0_10_rating(s)\n
Extract a 0-10 rating from a string.
If the string does not match an integer or matches an integer outside the 0-10 range, raises an error instead. If multiple numbers are found within the expected 0-10 range, the smallest is returned.
Parameters:
s (str): String to extract rating from. Required.
Returns:
int: Extracted rating.
Raises:
ParseError: If no integers between 0 and 10 are found in the string.
Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
def re_0_10_rating(s: str) -> int:\n \"\"\"Extract a 0-10 rating from a string.\n\n If the string does not match an integer or matches an integer outside the\n 0-10 range, raises an error instead. If multiple numbers are found within\n the expected 0-10 range, the smallest is returned.\n\n Args:\n s: String to extract rating from.\n\n Returns:\n int: Extracted rating.\n\n Raises:\n ParseError: If no integers between 0 and 10 are found in the string.\n \"\"\"\n\n matches = PATTERN_INTEGER.findall(s)\n if not matches:\n raise AssertionError\n\n vals = set()\n for match in matches:\n try:\n vals.add(validate_rating(int(match)))\n except ValueError:\n pass\n\n if not vals:\n raise AssertionError\n\n # Min to handle cases like \"The rating is 8 out of 10.\"\n return min(vals)\n
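Worked examples of the parsing rule (the min() is what handles replies that mention the scale itself):
print(re_0_10_rating(\"8\"))  # 8\nprint(re_0_10_rating(\"The rating is 8 out of 10.\"))  # 8: both 8 and 10 match, min wins\nprint(re_0_10_rating(\"12, maybe 9\"))  # 9: 12 fails validate_rating and is dropped\n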
"},{"location":"reference/indices/splitters/","title":"Splitters","text":""},{"location":"reference/indices/splitters/#indices.splitters.BaseSplitter","title":"BaseSplitter","text":" Bases: DocTransformer
Represent base splitter class
Source code in libs/kotaemon/kotaemon/indices/splitters/__init__.py
class BaseSplitter(DocTransformer):\n \"\"\"Represent base splitter class\"\"\"\n\n ...\n
"},{"location":"reference/llms/","title":"LLMs","text":""},{"location":"reference/llms/#llms.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":" Bases: SimpleBranchingPipeline
A simple gated branching pipeline for executing multiple branches based on a condition.
This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example:
from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class GatedBranchingPipeline(SimpleBranchingPipeline):\n \"\"\"\n A simple gated branching pipeline for executing multiple branches based on a\n condition.\n\n This class extends the SimpleBranchingPipeline class and adds the ability to execute\n the branches until a branch returns a non-empty output based on a condition.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = GatedBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n ```\n \"\"\"\n\n def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.GatedBranchingPipeline.run","title":"run","text":"run(*, condition_text=None, **prompt_kwargs)\n
Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.
Parameters:
condition_text (str): The condition text to evaluate for each branch. Defaults to None.
**prompt_kwargs: Keyword arguments for the branches.
Returns:
Union[OutputType, None]: The output of the first branch that satisfies the condition, or None if no branch satisfies the condition.
Raises:
ValueError: If condition_text is None.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":" Bases: BaseComponent
A simple branching pipeline for executing multiple branches.
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example:
from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class SimpleBranchingPipeline(BaseComponent):\n \"\"\"\n A simple branching pipeline for executing multiple branches.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = SimpleBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n print(pipeline(condition_text=\"12\"))\n ```\n \"\"\"\n\n branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n\n def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"add_branch(component)\n
Add a new branch to the pipeline.
Parameters:
component (BaseComponent): The branch component to be added. Required.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n
"},{"location":"reference/llms/#llms.SimpleBranchingPipeline.run","title":"run","text":"run(**prompt_kwargs)\n
Execute the pipeline by running each branch and return the outputs as a list.
Parameters:
**prompt_kwargs: Keyword arguments for the branches.
Returns:
List: The outputs of each branch as a list.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/#llms.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
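A minimal construction sketch; every value below is a placeholder, and the parameter names mirror the class definition above (the call should return an LLMInterface-like result that exposes the generated text via .text):
llm = AzureChatOpenAI(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",\n    azure_deployment=\"<your-deployment-name>\",\n    api_version=\"<api-version>\",\n    api_key=\"<your-api-key>\",\n)\nprint(llm(\"Hello!\").text)  # plain strings are accepted as a single user message\n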
"},{"location":"reference/llms/#llms.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/#llms.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/#llms.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
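Similarly for ChatOpenAI, a hedged sketch; base_url only needs to be set when targeting a self-hosted OpenAI-compatible server, and every value below is a placeholder:
llm = ChatOpenAI(\n    api_key=\"<your-api-key>\",\n    model=\"<model-name>\",\n    # base_url=\"http://<host>:<port>/v1/\",  # optional: an OpenAI-compatible server\n)\nprint(llm(\"Hello!\").text)\n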
"},{"location":"reference/llms/#llms.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
async_version (bool): Whether to get the async version of the client. Defaults to False.
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/#llms.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/#llms.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that uses an endpoint to generate responses. It expects an OpenAI-API-compatible endpoint.
Attributes:
endpoint_url (str): The URL of an OpenAI-API-compatible endpoint.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of a OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
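A short usage sketch; the URL is a placeholder, and any server that returns the OpenAI chat-completions response shape (with choices and usage fields, which run() reads directly) will do:
llm = EndpointChatLLM(endpoint_url=\"http://<host>:<port>/v1/chat/completions\")\nresult = llm(\"Hello!\")  # plain strings are wrapped in a HumanMessage\nprint(result.content, result.total_tokens)\n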
"},{"location":"reference/llms/#llms.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate response from messages.
Args:
messages (str | BaseMessage | list[BaseMessage]): History of messages to generate the response from.
**kwargs: Additional arguments to pass to the OpenAI API.
Returns:
LLMInterface: The generated response.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/#llms.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
"},{"location":"reference/llms/#llms.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around llama-cpp-python's Llama model
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/#llms.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
"},{"location":"reference/llms/#llms.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
"},{"location":"reference/llms/#llms.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
"},{"location":"reference/llms/#llms.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
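These Langchain completion wrappers share the same construction pattern; a minimal sketch with placeholder credentials (note that the default model_name above points at a legacy completions model, so you will likely want to override it):
llm = OpenAI(\n    openai_api_key=\"<your-api-key>\",\n    model_name=\"gpt-3.5-turbo-instruct\",\n    temperature=0,\n)\nprint(llm(\"The capital of France is\").text)\n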
"},{"location":"reference/llms/#llms.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":" Bases: BaseComponent
Perform sequential chain-of-thought with manual pre-defined prompts
This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought. Please refer to that section for details on Thought; this section is about chaining thoughts together.
Usage:
Create and run a chain of thought without \"+\" operator:
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>> prompt=\"Word {word} in {language} is \",\n>>> post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>> prompt=\"Translate {translated} to Japanese\",\n>>> post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Create and run a chain of thought with the "+" operator: Please refer to the kotaemon.pipelines.cot.Thought section for examples.
This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
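A sketch of that termination hook, reusing thought1, thought2, and llm from the example above; the callback receives the merged dictionary of inputs and outputs after each step:
chain = ManualSequentialChainOfThought(\n    thoughts=[thought1, thought2],\n    llm=llm,\n    terminate=lambda inputs: \"translated\" in inputs,\n)\nchain(word=\"hello\", language=\"French\")  # stops after thought1 sets \"translated\"\n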
Source code in libs/kotaemon/kotaemon/llms/cot.py
class ManualSequentialChainOfThought(BaseComponent):\n \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n This method supports variable number of steps. Each step corresponds to a\n `kotaemon.pipelines.cot.Thought`. Please refer that section for\n Thought's detail. This section is about chaining thought together.\n\n _**Usage:**_\n\n **Create and run a chain of thought without \"+\" operator:**\n\n ```pycon\n >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n >>> llm = LCAzureChatOpenAI(...)\n >>> thought1 = Thought(\n >>> prompt=\"Word {word} in {language} is \",\n >>> post_process=lambda string: {\"translated\": string},\n >>> )\n >>> thought2 = Thought(\n >>> prompt=\"Translate {translated} to Japanese\",\n >>> post_process=lambda string: {\"output\": string},\n >>> )\n >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n >>> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n **Create and run a chain of thought without \"+\" operator:** Please refer the\n `kotaemon.pipelines.cot.Thought` section for examples.\n\n This chain-of-thought optionally takes a termination check callback function.\n This function will be called after each thought is executed. It takes in a\n dictionary of all thought outputs so far, and it returns True or False. If\n True, the chain-of-thought will terminate. If unset, the default callback always\n returns False.\n \"\"\"\n\n thoughts: List[Thought] = Param(\n default_callback=lambda *_: [], help=\"List of Thought\"\n )\n llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n terminate: Callable = Param(\n default=lambda _: False,\n help=\"Callback on terminate condition. Default to always return False\",\n )\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n\n def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=self.thoughts + [next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/#llms.ManualSequentialChainOfThought.run","title":"run","text":"run(**kwargs)\n
Run the manual chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n
"},{"location":"reference/llms/#llms.Thought","title":"Thought","text":" Bases: BaseComponent
A thought in the chain of thought
Input: **kwargs pairs, where each key is a placeholder in the prompt and each value is the value to substitute. Output: an output dictionary.
Create and run a thought:
>> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n
Basically, when a thought is run, it will: (1) populate the prompt template with the input **kwargs; (2) run the LLM model with the populated prompt; and (3) post-process the LLM output with the post-processor.
This Thought allows chaining sequentially with the + operator. For example:
>> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n>> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Under the hood, when the + operator is used, a ManualSequentialChainOfThought is created.
libs/kotaemon/kotaemon/llms/cot.py
class Thought(BaseComponent):\n \"\"\"A thought in the chain of thought\n\n - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n value is the value.\n - Output: an output dictionary\n\n _**Usage:**_\n\n Create and run a thought:\n\n ```python\n >> from kotaemon.pipelines.cot import Thought\n >> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n >> output = thought(action=\"install\", object=\"python\")\n >> print(output)\n {'tutorial': 'As an AI language model,...'}\n ```\n\n Basically, when a thought is run, it will:\n\n 1. Populate the prompt template with the input `**kwargs`.\n 2. Run the LLM model with the populated prompt.\n 3. Post-process the LLM output with the post-processor.\n\n This `Thought` allows chaining sequentially with the + operator. For example:\n\n ```python\n >> llm = LCAzureChatOpenAI(...)\n >> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n >> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n >> thought = thought1 + thought2\n >> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n is created.\n \"\"\"\n\n prompt: str = Param(\n help=(\n \"The prompt template string. This prompt template has Python-like variable\"\n \" placeholders, that then will be substituted with real values when this\"\n \" component is executed\"\n )\n )\n llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n post_process: Function = Node(\n help=(\n \"The function post-processor that post-processes LLM output prediction .\"\n \"It should take a string as input (this is the LLM output text) and return \"\n \"a dictionary, where the key should\"\n )\n )\n\n @Node.auto(depends_on=\"prompt\")\n def prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n\n def get_variables(self) -> List[str]:\n return []\n\n def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=[self, next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/#llms.Thought.prompt_template","title":"prompt_template","text":"prompt_template()\n
Automatically wraps the prompt param in a BasePromptComponent. Can be ignored.
Source code in libs/kotaemon/kotaemon/llms/cot.py
@Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n
"},{"location":"reference/llms/#llms.Thought.run","title":"run","text":"run(**kwargs)\n
Run the chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n
"},{"location":"reference/llms/#llms.GatedLinearPipeline","title":"GatedLinearPipeline","text":" Bases: SimpleLinearPipeline
A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.
Attributes:
condition (Callable[[IO_Type], Any]): A callable function that represents the condition.
Example Usage:
from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class GatedLinearPipeline(SimpleLinearPipeline):\n \"\"\"\n A pipeline that extends the SimpleLinearPipeline class and adds a condition\n attribute.\n\n Attributes:\n condition (Callable[[IO_Type], Any]): A callable function that represents the\n condition.\n\n Usage:\n ```{.py3 title=\"Example Usage\"}\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n ```\n \"\"\"\n\n condition: Callable[[IO_Type], Any]\n\n def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.GatedLinearPipeline.run","title":"run","text":"run(\n *,\n condition_text=None,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the pipeline with the given arguments and return the final output as a Document object.
Parameters:
condition_text (str): The condition text to evaluate. Defaults to None.
llm_kwargs (dict): Additional keyword arguments for the language model call. Defaults to {}.
post_processor_kwargs (dict): Additional keyword arguments for the post-processor. Defaults to {}.
**prompt_kwargs: Keyword arguments for populating the prompt.
Returns:
Document: The final output of the pipeline as a Document object.
Raises:
ValueError: If condition_text is None.
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/#llms.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":" Bases: BaseComponent
A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.
Attributes:
prompt (BasePromptComponent): The prompt component used to generate the initial input.
llm (Union[ChatLLM, LLM]): The language model component used to generate the output.
post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional post-processor component or function.
Example Usage:
from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class SimpleLinearPipeline(BaseComponent):\n \"\"\"\n A simple pipeline for running a function with a prompt, a language model, and an\n optional post-processor.\n\n Attributes:\n prompt (BasePromptComponent): The prompt component used to generate the initial\n input.\n llm (Union[ChatLLM, LLM]): The language model component used to generate the\n output.\n post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n post-processor component or function.\n\n Example Usage:\n ```python\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(word=\"lone\"))\n ```\n \"\"\"\n\n prompt: BasePromptComponent\n llm: Union[ChatLLM, LLM]\n post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/#llms.SimpleLinearPipeline.run","title":"run","text":"run(\n *,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the function with the given arguments and return the final output as a Document object.
Parameters:
llm_kwargs (dict): Keyword arguments for the llm call. Defaults to {}.
post_processor_kwargs (dict): Keyword arguments for the post_processor. Defaults to {}.
**prompt_kwargs: Keyword arguments for populating the prompt.
Returns:
Document: The final output of the function as a Document object.
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/#llms.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
template (PromptTemplate): The prompt template. Required.
**kwargs: Any additional keyword arguments that will be used to populate the given template.
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Raises:\n ValueError: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the 
object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
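A small usage sketch of the populate flow implemented above (placeholder values may be a str, int, Document, or a callable producing one):
prompt = BasePromptComponent(template=\"what is {word} in Japanese?\")\ndoc = prompt(word=\"hello\")  # __call__ routes to run()\nprint(doc.text)      # \"what is hello in Japanese?\"\nprint(doc.metadata)  # {'origin': 'PromptComponent'}\n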
"},{"location":"reference/llms/#llms.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
kwargs (dict): A dictionary with the attribute names as keys and the new values as values.
Returns:
None
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/#llms.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
**kwargs: The keyword arguments to pass to the function.
Returns:
The result of calling the populate method of the template object with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
"},{"location":"reference/llms/#llms.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the 
concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
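A short sketch of the template mechanics above (illustrative only; the template strings are placeholders):
from kotaemon.llms import PromptTemplate\n\nfirst = PromptTemplate(\"Word {word} in {language} is \")\nsecond = PromptTemplate(\"Translate {translated} to Japanese\")\n\nprint(sorted(first.placeholders))  # ['language', 'word']\n\n# __add__ joins the two template strings with a newline\ncombined = first + second\nprint(sorted(combined.placeholders))  # ['language', 'translated', 'word']\n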
"},{"location":"reference/llms/#llms.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks if all the expected placeholders in the template are set as attributes of the object. If any placeholders are missing, a ValueError
is raised with the names of the missing keys.
Returns:
Type Description None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/#llms.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check for keyword arguments that are not present in the template.
This function compares the provided keyword arguments against the template's placeholders. If any redundant keys are provided, a UserWarning
is issued with the names of the redundant keys.
Returns:
Type Description None
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check for keyword arguments that are not present in the template.\n\n    This function compares the provided keyword arguments against the template's\n    placeholders. If any redundant keys are provided, a `UserWarning` is issued\n    with the names of the redundant keys.\n\n    Parameters:\n        **kwargs (dict): A dictionary of keyword arguments.\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
"},{"location":"reference/llms/#llms.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
Name Type Description Default **kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Type Description str
The populated template.
Raises:
Type Description ValueError
If an unknown placeholder is provided.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/#llms.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
Name Type Description Default **kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Name Type Description str
The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
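To make the contrast with the strict populate method concrete, a small sketch (illustrative values only):
from kotaemon.llms import PromptTemplate\n\ntemplate = PromptTemplate(\"Dear {name}, your order {order_id} has shipped.\")\n\n# populate() is strict: omitting a placeholder raises ValueError\nprint(template.populate(name=\"Alice\", order_id=\"A-42\"))\n\n# partial_populate() keeps unset placeholders in the output text\nprint(template.partial_populate(name=\"Alice\"))\n# -> Dear Alice, your order {order_id:} has shipped.\n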
"},{"location":"reference/llms/base/","title":"Base","text":""},{"location":"reference/llms/branching/","title":"Branching","text":""},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":" Bases: BaseComponent
A simple branching pipeline for executing multiple branches.
Attributes:
Name Type Description branches
List[BaseComponent]
The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class SimpleBranchingPipeline(BaseComponent):\n \"\"\"\n A simple branching pipeline for executing multiple branches.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = SimpleBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n print(pipeline(condition_text=\"12\"))\n ```\n \"\"\"\n\n branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n\n def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"add_branch(component)\n
Add a new branch to the pipeline.
Parameters:
Name Type Description Default component
BaseComponent
The branch component to be added.
required Source code in libs/kotaemon/kotaemon/llms/branching.py
def add_branch(self, component: BaseComponent):\n \"\"\"\n Add a new branch to the pipeline.\n\n Args:\n component (BaseComponent): The branch component to be added.\n \"\"\"\n self.branches.append(component)\n
"},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.run","title":"run","text":"run(**prompt_kwargs)\n
Execute the pipeline by running each branch and return the outputs as a list.
Parameters:
Name Type Description Default **prompt_kwargs
Keyword arguments for the branches.
{}
Returns:
Name Type Description List
The outputs of each branch as a list.
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the outputs as a list.\n\n Args:\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n List: The outputs of each branch as a list.\n \"\"\"\n output = []\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output.append(branch(**prompt_kwargs))\n\n return output\n
"},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":" Bases: SimpleBranchingPipeline
A simple gated branching pipeline for executing multiple branches based on a condition.
This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.
Attributes:
Name Type Description branches
List[BaseComponent]
The list of branches to be executed.
Examplefrom kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\nfor i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
Source code in libs/kotaemon/kotaemon/llms/branching.py
class GatedBranchingPipeline(SimpleBranchingPipeline):\n \"\"\"\n A simple gated branching pipeline for executing multiple branches based on a\n condition.\n\n This class extends the SimpleBranchingPipeline class and adds the ability to execute\n the branches until a branch returns a non-empty output based on a condition.\n\n Attributes:\n branches (List[BaseComponent]): The list of branches to be executed.\n\n Example:\n ```python\n from kotaemon.llms import (\n LCAzureChatOpenAI,\n BasePromptComponent,\n GatedLinearPipeline,\n )\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n pipeline = GatedBranchingPipeline()\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n for i in range(3):\n pipeline.add_branch(\n GatedLinearPipeline(\n prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n condition=RegexExtractor(pattern=f\"{i}\"),\n llm=llm,\n post_processor=identity,\n )\n )\n print(pipeline(condition_text=\"1\"))\n print(pipeline(condition_text=\"2\"))\n ```\n \"\"\"\n\n def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline.run","title":"run","text":"run(*, condition_text=None, **prompt_kwargs)\n
Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.
Parameters:
Name Type Description Default condition_text
str
The condition text to evaluate for each branch. Defaults to None.
None
**prompt_kwargs
Keyword arguments for the branches.
{}
Returns:
Type Description Union[OutputType, None]: The output of the first branch that satisfies the
condition, or None if no branch satisfies the condition.
Raises:
Type Description ValueError
If condition_text is None
Source code in libs/kotaemon/kotaemon/llms/branching.py
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n \"\"\"\n Execute the pipeline by running each branch and return the output of the first\n branch that returns a non-empty output based on the provided condition.\n\n Args:\n condition_text (str): The condition text to evaluate for each branch.\n Default to None.\n **prompt_kwargs: Keyword arguments for the branches.\n\n Returns:\n Union[OutputType, None]: The output of the first branch that satisfies the\n condition, or None if no branch satisfies the condition.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided.\")\n\n for i, branch in enumerate(self.branches):\n self._prepare_child(branch, name=f\"branch-{i}\")\n output = branch(condition_text=condition_text, **prompt_kwargs)\n if output:\n return output\n\n return Document(None)\n
"},{"location":"reference/llms/cot/","title":"Cot","text":""},{"location":"reference/llms/cot/#llms.cot.Thought","title":"Thought","text":" Bases: BaseComponent
A thought in the chain of thought
Input: **kwargs pairs, where key is the placeholder in the prompt, and value is the value.
Output: an output dictionary
Create and run a thought:
>> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n
Basically, when a thought is run, it will:
Populate the prompt template with the input **kwargs.
Run the LLM model with the populated prompt.
Post-process the LLM output with the post-processor.
This Thought allows chaining sequentially with the + operator. For example:
>> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n>> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Under the hood, when the +
operator is used, a ManualSequentialChainOfThought
is created.
libs/kotaemon/kotaemon/llms/cot.py
class Thought(BaseComponent):\n \"\"\"A thought in the chain of thought\n\n - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n value is the value.\n - Output: an output dictionary\n\n _**Usage:**_\n\n Create and run a thought:\n\n ```python\n >> from kotaemon.pipelines.cot import Thought\n >> thought = Thought(\n prompt=\"How to {action} {object}?\",\n llm=LCAzureChatOpenAI(...),\n post_process=lambda string: {\"tutorial\": string},\n )\n >> output = thought(action=\"install\", object=\"python\")\n >> print(output)\n {'tutorial': 'As an AI language model,...'}\n ```\n\n Basically, when a thought is run, it will:\n\n 1. Populate the prompt template with the input `**kwargs`.\n 2. Run the LLM model with the populated prompt.\n 3. Post-process the LLM output with the post-processor.\n\n This `Thought` allows chaining sequentially with the + operator. For example:\n\n ```python\n >> llm = LCAzureChatOpenAI(...)\n >> thought1 = Thought(\n prompt=\"Word {word} in {language} is \",\n llm=llm,\n post_process=lambda string: {\"translated\": string},\n )\n >> thought2 = Thought(\n prompt=\"Translate {translated} to Japanese\",\n llm=llm,\n post_process=lambda string: {\"output\": string},\n )\n\n >> thought = thought1 + thought2\n >> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n is created.\n \"\"\"\n\n prompt: str = Param(\n help=(\n \"The prompt template string. This prompt template has Python-like variable\"\n \" placeholders, that then will be substituted with real values when this\"\n \" component is executed\"\n )\n )\n llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n post_process: Function = Node(\n help=(\n \"The function post-processor that post-processes LLM output prediction .\"\n \"It should take a string as input (this is the LLM output text) and return \"\n \"a dictionary, where the key should\"\n )\n )\n\n @Node.auto(depends_on=\"prompt\")\n def prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n\n def get_variables(self) -> List[str]:\n return []\n\n def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=[self, next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/cot/#llms.cot.Thought.prompt_template","title":"prompt_template","text":"prompt_template()\n
Automatically wrap around param prompt. Can ignore
Source code in libs/kotaemon/kotaemon/llms/cot.py
@Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n return BasePromptComponent(template=self.prompt)\n
"},{"location":"reference/llms/cot/#llms.cot.Thought.run","title":"run","text":"run(**kwargs)\n
Run the chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the chain of thought\"\"\"\n prompt = self.prompt_template(**kwargs).text\n response = self.llm(prompt).text\n response = self.post_process(response)\n\n return Document(response)\n
"},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":" Bases: BaseComponent
Perform sequential chain-of-thought with manual pre-defined prompts
This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought. Please refer to that section for details on Thought; this section is about chaining thoughts together.
Usage:
Create and run a chain of thought without \"+\" operator:
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>> prompt=\"Word {word} in {language} is \",\n>>> post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>> prompt=\"Translate {translated} to Japanese\",\n>>> post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n
Create and run a chain of thought with the \"+\" operator: Please refer to the kotaemon.pipelines.cot.Thought
section for examples.
This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
Source code in libs/kotaemon/kotaemon/llms/cot.py
class ManualSequentialChainOfThought(BaseComponent):\n \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n This method supports variable number of steps. Each step corresponds to a\n `kotaemon.pipelines.cot.Thought`. Please refer that section for\n Thought's detail. This section is about chaining thought together.\n\n _**Usage:**_\n\n **Create and run a chain of thought without \"+\" operator:**\n\n ```pycon\n >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n >>> llm = LCAzureChatOpenAI(...)\n >>> thought1 = Thought(\n >>> prompt=\"Word {word} in {language} is \",\n >>> post_process=lambda string: {\"translated\": string},\n >>> )\n >>> thought2 = Thought(\n >>> prompt=\"Translate {translated} to Japanese\",\n >>> post_process=lambda string: {\"output\": string},\n >>> )\n >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n >>> thought(word=\"hello\", language=\"French\")\n {'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n ```\n\n **Create and run a chain of thought without \"+\" operator:** Please refer the\n `kotaemon.pipelines.cot.Thought` section for examples.\n\n This chain-of-thought optionally takes a termination check callback function.\n This function will be called after each thought is executed. It takes in a\n dictionary of all thought outputs so far, and it returns True or False. If\n True, the chain-of-thought will terminate. If unset, the default callback always\n returns False.\n \"\"\"\n\n thoughts: List[Thought] = Param(\n default_callback=lambda *_: [], help=\"List of Thought\"\n )\n llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n terminate: Callable = Param(\n default=lambda _: False,\n help=\"Callback on terminate condition. Default to always return False\",\n )\n\n def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n\n def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n return ManualSequentialChainOfThought(\n thoughts=self.thoughts + [next_thought], llm=self.llm\n )\n
"},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought.run","title":"run","text":"run(**kwargs)\n
Run the manual chain of thought
Source code in libs/kotaemon/kotaemon/llms/cot.py
def run(self, **kwargs) -> Document:\n \"\"\"Run the manual chain of thought\"\"\"\n\n inputs = deepcopy(kwargs)\n for idx, thought in enumerate(self.thoughts):\n if self.llm:\n thought.llm = self.llm\n self._prepare_child(thought, f\"thought{idx}\")\n\n output = thought(**inputs)\n inputs.update(output.content)\n if self.terminate(inputs):\n break\n\n return Document(inputs)\n
"},{"location":"reference/llms/linear/","title":"Linear","text":""},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":" Bases: BaseComponent
A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.
Attributes:
Name Type Description prompt
BasePromptComponent
The prompt component used to generate the initial input.
llm
Union[ChatLLM, LLM]
The language model component used to generate the output.
post_processor
Union[BaseComponent, Callable[[IO_Type], IO_Type]]
An optional post-processor component or function.
Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n return x\n\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class SimpleLinearPipeline(BaseComponent):\n \"\"\"\n A simple pipeline for running a function with a prompt, a language model, and an\n optional post-processor.\n\n Attributes:\n prompt (BasePromptComponent): The prompt component used to generate the initial\n input.\n llm (Union[ChatLLM, LLM]): The language model component used to generate the\n output.\n post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n post-processor component or function.\n\n Example Usage:\n ```python\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = SimpleLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(word=\"lone\"))\n ```\n \"\"\"\n\n prompt: BasePromptComponent\n llm: Union[ChatLLM, LLM]\n post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline.run","title":"run","text":"run(\n *,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the function with the given arguments and return the final output as a Document object.
Parameters:
Name Type Description Default llm_kwargs
dict
Keyword arguments for the llm call.
{}
post_processor_kwargs
dict
Keyword arguments for the post_processor.
{}
**prompt_kwargs
Keyword arguments for populating the prompt.
{}
Returns:
Name Type Description Document
The final output of the function as a Document object.
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n):\n \"\"\"\n Run the function with the given arguments and return the final output as a\n Document object.\n\n Args:\n llm_kwargs (dict): Keyword arguments for the llm call.\n post_processor_kwargs (dict): Keyword arguments for the post_processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the function as a Document object.\n \"\"\"\n prompt = self.prompt(**prompt_kwargs)\n llm_output = self.llm(prompt.text, **llm_kwargs)\n if self.post_processor is not None:\n final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n else:\n final_output = llm_output\n\n return Document(final_output)\n
"},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline","title":"GatedLinearPipeline","text":" Bases: SimpleLinearPipeline
A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.
Attributes:
Name Type Description condition
Callable[[IO_Type], Any]
A callable function that represents the condition.
Usage Example Usagefrom kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n return x\n\nllm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
Source code in libs/kotaemon/kotaemon/llms/linear.py
class GatedLinearPipeline(SimpleLinearPipeline):\n \"\"\"\n A pipeline that extends the SimpleLinearPipeline class and adds a condition\n attribute.\n\n Attributes:\n condition (Callable[[IO_Type], Any]): A callable function that represents the\n condition.\n\n Usage:\n ```{.py3 title=\"Example Usage\"}\n from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n from kotaemon.parsers import RegexExtractor\n\n def identity(x):\n return x\n\n llm = LCAzureChatOpenAI(\n openai_api_base=\"your openai api base\",\n openai_api_key=\"your openai api key\",\n openai_api_version=\"your openai api version\",\n deployment_name=\"dummy-q2-gpt35\",\n temperature=0,\n request_timeout=600,\n )\n\n pipeline = GatedLinearPipeline(\n prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n condition=RegexExtractor(pattern=\"some pattern\"),\n llm=llm,\n post_processor=identity,\n )\n print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n ```\n \"\"\"\n\n condition: Callable[[IO_Type], Any]\n\n def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n ) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline.run","title":"run","text":"run(\n *,\n condition_text=None,\n llm_kwargs={},\n post_processor_kwargs={},\n **prompt_kwargs\n)\n
Run the pipeline with the given arguments and return the final output as a Document object.
Parameters:
Name Type Description Default condition_text
str
The condition text to evaluate. Defaults to None.
None
llm_kwargs
dict
Additional keyword arguments for the language model call.
{}
post_processor_kwargs
dict
Additional keyword arguments for the post-processor.
{}
**prompt_kwargs
Keyword arguments for populating the prompt.
{}
Returns:
Name Type Description Document
Document
The final output of the pipeline as a Document object.
Raises:
Type Description ValueError
If condition_text is None
Source code in libs/kotaemon/kotaemon/llms/linear.py
def run(\n self,\n *,\n condition_text: Optional[str] = None,\n llm_kwargs: Optional[dict] = {},\n post_processor_kwargs: Optional[dict] = {},\n **prompt_kwargs,\n) -> Document:\n \"\"\"\n Run the pipeline with the given arguments and return the final output as a\n Document object.\n\n Args:\n condition_text (str): The condition text to evaluate. Default to None.\n llm_kwargs (dict): Additional keyword arguments for the language model call.\n post_processor_kwargs (dict): Additional keyword arguments for the\n post-processor.\n **prompt_kwargs: Keyword arguments for populating the prompt.\n\n Returns:\n Document: The final output of the pipeline as a Document object.\n\n Raises:\n ValueError: If condition_text is None\n \"\"\"\n if condition_text is None:\n raise ValueError(\"`condition_text` must be provided\")\n\n if self.condition(condition_text)[0]:\n return super().run(\n llm_kwargs=llm_kwargs,\n post_processor_kwargs=post_processor_kwargs,\n **prompt_kwargs,\n )\n\n return Document(None)\n
"},{"location":"reference/llms/chats/","title":"Chats","text":""},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.
Attributes:
Name Type Description endpoint_url
str
The URL of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The url of a OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
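A minimal usage sketch (the URL is a placeholder for any OpenAI API compatible chat completions endpoint, such as one served by llama-cpp-python; the import path follows this page's section path and is an assumption):
from kotaemon.llms.chats import EndpointChatLLM\n\n# placeholder URL of a local OpenAI API compatible server\nllm = EndpointChatLLM(endpoint_url=\"http://localhost:8000/v1/chat/completions\")\nreply = llm(\"Hello, who are you?\")\nprint(reply.content)\n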
"},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate response from messages.
Args: messages (str | BaseMessage | list[BaseMessage]): history of messages to generate response from. **kwargs: additional arguments to pass to the OpenAI API.
Returns: LLMInterface: generated response.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"\n Generate response from messages\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n content = candidates[0]\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
"},{"location":"reference/llms/chats/#llms.chats.LCChatMixin","title":"LCChatMixin","text":"Mixin for langchain based chat models
Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n \"\"\"Mixin for langchain based chat models\"\"\"\n\n def _get_lc_class(self):\n raise NotImplementedError(\n \"Please return the relevant Langchain class in in _get_lc_class\"\n )\n\n def _get_tool_call_kwargs(self):\n return {}\n\n def __init__(self, stream: bool = False, **params):\n self._lc_class = self._get_lc_class()\n self._obj = self._lc_class(**params)\n self._kwargs: dict = params\n self._stream = stream\n\n super().__init__()\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n if self._stream:\n return self.stream(messages, **kwargs) # type: ignore\n return self.invoke(messages, **kwargs)\n\n def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n return input_\n\n def prepare_response(self, pred):\n all_text = [each.text for each in pred.generations[0]]\n all_messages = [each.message for each in pred.generations[0]]\n\n completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n try:\n if pred.llm_output is not None:\n completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n except Exception:\n pass\n\n return LLMInterface(\n text=all_text[0] if len(all_text) > 0 else \"\",\n candidates=all_text,\n completion_tokens=completion_tokens,\n total_tokens=total_tokens,\n prompt_tokens=prompt_tokens,\n messages=all_messages,\n logits=[],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n input_ = self.prepare_message(messages)\n pred = await self._obj.agenerate(messages=[input_], **kwargs)\n return self.prepare_response(pred)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n for response in self._obj.stream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n async for response in self._obj.astream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n def to_langchain_format(self):\n return self._obj\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n 
kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name, value):\n if name == \"_lc_class\":\n return super().__setattr__(name, value)\n\n if name in self._lc_class.__fields__:\n self._kwargs[name] = value\n self._obj = self._lc_class(**self._kwargs)\n else:\n super().__setattr__(name, value)\n\n def __getattr__(self, name):\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def specs(self, path: str):\n path = path.strip(\".\")\n if \".\" in path:\n raise ValueError(\"path should not contain '.'\")\n\n if path in self._lc_class.__fields__:\n return {\n \"__type__\": \"theflow.base.ParamAttr\",\n \"refresh_on_set\": True,\n \"strict_type\": True,\n }\n\n raise ValueError(f\"Invalid param {path}\")\n
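Concrete wrappers are expected to return the relevant langchain class from _get_lc_class. A hedged sketch of how such a wrapper could be declared (the class name LCChatOpenAI and the import paths are illustrative assumptions, not part of kotaemon's documented API):
from kotaemon.llms import ChatLLM  # assumed export path\nfrom kotaemon.llms.chats.langchain_based import LCChatMixin\n\n\nclass LCChatOpenAI(LCChatMixin, ChatLLM):  # hypothetical wrapper name\n    def _get_lc_class(self):\n        # imported lazily so langchain is only required when the wrapper is used\n        from langchain_openai import ChatOpenAI\n\n        return ChatOpenAI\n\n\n# llm = LCChatOpenAI(model=\"gpt-4o-mini\")  # remaining kwargs go to the langchain class\n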
"},{"location":"reference/llms/chats/#llms.chats.LCChatMixin.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Generate response from messages
Parameters:
Name Type Description Default messages
str | BaseMessage | list[BaseMessage]
history of messages to generate response from
required**kwargs
additional arguments to pass to the langchain chat model
{}
Returns:
Name Type DescriptionLLMInterface
LLMInterface
generated response
Source code inlibs/kotaemon/kotaemon/llms/chats/langchain_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n
"},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around the llama-cpp-python's Llama model
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n model_path: Optional[str] = Param(\n help=\"Path to the model file. This is required to load the model.\",\n )\n repo_id: Optional[str] = Param(\n help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n )\n filename: Optional[str] = Param(\n help=\"A filename or glob pattern to match the model file in the repo.\"\n )\n chat_format: str = Param(\n help=(\n \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n \"list of supported formats. If blank, the chat format will be auto-\"\n \"inferred.\"\n ),\n required=True,\n )\n lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n n_gpu_layers: Optional[int] = Param(\n 0,\n help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n )\n use_mmap: Optional[bool] = Param(\n True,\n help=(),\n )\n vocab_only: Optional[bool] = Param(\n False,\n help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n )\n\n _role_mapper: dict[str, str] = {\n \"human\": \"user\",\n \"system\": \"system\",\n \"ai\": \"assistant\",\n }\n\n @Param.auto()\n def client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. 
\"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[dict]:\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n output_ = [\n {\"role\": self._role_mapper[each.type], \"content\": each.content}\n for each in input_\n ]\n\n return output_\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n\n pred: \"CCCR\" = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=False,\n )\n\n return LLMInterface(\n content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n candidates=[\n c[\"message\"][\"content\"]\n for c in pred[\"choices\"]\n if c[\"message\"][\"content\"]\n ],\n completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n total_tokens=pred[\"usage\"][\"total_tokens\"],\n prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n )\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n pred = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=True,\n )\n for chunk in pred:\n if not chunk[\"choices\"]:\n continue\n\n if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n continue\n\n yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code inlibs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
Source code inlibs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
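A minimal usage sketch (API key and model name are placeholders; an OpenAI-compatible base_url may also be supplied):
from kotaemon.llms.chats import ChatOpenAI\n\nllm = ChatOpenAI(api_key='<openai-key>', model='gpt-4o-mini')  # model name is illustrative\nresult = llm.invoke('Say hello in one word.')\nprint(result.content)\n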
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/base/","title":"Base","text":""},{"location":"reference/llms/chats/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM","title":"EndpointChatLLM","text":" Bases: ChatLLM
A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.
Attributes:
Name Type Descriptionendpoint_url
str
The URL of an OpenAI API compatible endpoint.
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
class EndpointChatLLM(ChatLLM):\n \"\"\"\n A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n compatible endpoint.\n\n Attributes:\n endpoint_url (str): The URL of an OpenAI API compatible endpoint.\n \"\"\"\n\n endpoint_url: str = Param(\n help=\"URL of the OpenAI API compatible endpoint\", required=True\n )\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n # guard against responses whose choices all have empty content\n content = candidates[0] if candidates else \"\"\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n return self.invoke(messages, **kwargs)\n
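For illustration, a sketch of pointing this class at a locally hosted OpenAI-compatible endpoint (the URL is a placeholder):
from kotaemon.llms.chats.endpoint_based import EndpointChatLLM\n\n# any server implementing the OpenAI chat-completions schema should work here\nllm = EndpointChatLLM(endpoint_url='http://localhost:8000/v1/chat/completions')\nprint(llm.run('Hello').content)\n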
"},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.run","title":"run","text":"run(messages, **kwargs)\n
Generate response from messages
Parameters:
Name Type Description Defaultmessages
str | BaseMessage | list[BaseMessage]
history of messages to generate response from
required**kwargs
additional arguments to pass to the OpenAI API
{}
Returns:
Name Type DescriptionLLMInterface
LLMInterface
generated response
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages (str | BaseMessage | list[BaseMessage]): history of messages to\n generate response from\n **kwargs: additional arguments to pass to the OpenAI API\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n def decide_role(message: BaseMessage):\n if isinstance(message, SystemMessage):\n return \"system\"\n elif isinstance(message, AIMessage):\n return \"assistant\"\n else:\n return \"user\"\n\n request_json = {\n \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n }\n\n response = requests.post(self.endpoint_url, json=request_json).json()\n\n content = \"\"\n candidates = []\n if response[\"choices\"]:\n candidates = [\n each[\"message\"][\"content\"]\n for each in response[\"choices\"]\n if each[\"message\"][\"content\"]\n ]\n # guard against responses whose choices all have empty content\n content = candidates[0] if candidates else \"\"\n\n return LLMInterface(\n content=content,\n candidates=candidates,\n completion_tokens=response[\"usage\"][\"completion_tokens\"],\n total_tokens=response[\"usage\"][\"total_tokens\"],\n prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n )\n
"},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Same as run
Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Same as run\"\"\"\n return self.run(messages, **kwargs)\n
"},{"location":"reference/llms/chats/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin","title":"LCChatMixin","text":"Mixin for langchain based chat models
Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n \"\"\"Mixin for langchain based chat models\"\"\"\n\n def _get_lc_class(self):\n raise NotImplementedError(\n \"Please return the relevant Langchain class in _get_lc_class\"\n )\n\n def _get_tool_call_kwargs(self):\n return {}\n\n def __init__(self, stream: bool = False, **params):\n self._lc_class = self._get_lc_class()\n self._obj = self._lc_class(**params)\n self._kwargs: dict = params\n self._stream = stream\n\n super().__init__()\n\n def run(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n if self._stream:\n return self.stream(messages, **kwargs) # type: ignore\n return self.invoke(messages, **kwargs)\n\n def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n return input_\n\n def prepare_response(self, pred):\n all_text = [each.text for each in pred.generations[0]]\n all_messages = [each.message for each in pred.generations[0]]\n\n completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n try:\n if pred.llm_output is not None:\n completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n except Exception:\n pass\n\n return LLMInterface(\n text=all_text[0] if len(all_text) > 0 else \"\",\n candidates=all_text,\n completion_tokens=completion_tokens,\n total_tokens=total_tokens,\n prompt_tokens=prompt_tokens,\n messages=all_messages,\n logits=[],\n )\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n input_ = self.prepare_message(messages)\n pred = await self._obj.agenerate(messages=[input_], **kwargs)\n return self.prepare_response(pred)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n for response in self._obj.stream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n async for response in self._obj.astream(input=messages, **kwargs):\n yield LLMInterface(content=response.content)\n\n def to_langchain_format(self):\n return self._obj\n\n def __repr__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = repr(value_obj)\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __str__(self):\n kwargs = []\n for key, value_obj in self._kwargs.items():\n value = str(value_obj)\n if len(value) > 20:\n value = f\"{value[:15]}...\"\n kwargs.append(f\"{key}={value}\")\n kwargs_repr = \", \".join(kwargs)\n return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n def __setattr__(self, name, value):\n if name == \"_lc_class\":\n return super().__setattr__(name, value)\n\n if name in self._lc_class.__fields__:\n self._kwargs[name] = value\n self._obj = self._lc_class(**self._kwargs)\n else:\n super().__setattr__(name, value)\n\n def __getattr__(self, name):\n if name in self._kwargs:\n return self._kwargs[name]\n return getattr(self._obj, name)\n\n def dump(self, *args, **kwargs):\n from theflow.utils.modules import serialize\n\n params = {key: serialize(value) for key, value in self._kwargs.items()}\n return {\n \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n **params,\n }\n\n def specs(self, path: str):\n path = path.strip(\".\")\n if \".\" in path:\n raise ValueError(\"path should not contain '.'\")\n\n if path in self._lc_class.__fields__:\n return {\n \"__type__\": \"theflow.base.ParamAttr\",\n \"refresh_on_set\": True,\n \"strict_type\": True,\n }\n\n raise ValueError(f\"Invalid param {path}\")\n
"},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin.invoke","title":"invoke","text":"invoke(messages, **kwargs)\n
Generate response from messages
Parameters:
Name Type Description Defaultmessages
str | BaseMessage | list[BaseMessage]
history of messages to generate response from
required**kwargs
additional arguments to pass to the langchain chat model
{}
Returns:
Name Type DescriptionLLMInterface
LLMInterface
generated response
Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n \"\"\"Generate response from messages\n\n Args:\n messages: history of messages to generate response from\n **kwargs: additional arguments to pass to the langchain chat model\n\n Returns:\n LLMInterface: generated response\n \"\"\"\n input_ = self.prepare_message(messages)\n\n if \"tools_pydantic\" in kwargs:\n tools = kwargs.pop(\n \"tools_pydantic\",\n )\n lc_tool_call = self._obj.bind_tools(tools)\n pred = lc_tool_call.invoke(\n input_,\n **self._get_tool_call_kwargs(),\n )\n if pred.tool_calls:\n tool_calls = pred.tool_calls\n else:\n tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n output = LLMInterface(\n content=\"\",\n additional_kwargs={\"tool_calls\": tool_calls},\n )\n else:\n pred = self._obj.generate(messages=[input_], **kwargs)\n output = self.prepare_response(pred)\n\n return output\n
"},{"location":"reference/llms/chats/llamacpp/","title":"Llamacpp","text":""},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat","title":"LlamaCppChat","text":" Bases: ChatLLM
Wrapper around llama-cpp-python's Llama model
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n \"\"\"Wrapper around llama-cpp-python's Llama model\"\"\"\n\n model_path: Optional[str] = Param(\n help=\"Path to the model file. This is required to load the model.\",\n )\n repo_id: Optional[str] = Param(\n help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n )\n filename: Optional[str] = Param(\n help=\"A filename or glob pattern to match the model file in the repo.\"\n )\n chat_format: str = Param(\n help=(\n \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n \"list of supported formats. If blank, the chat format will be auto-\"\n \"inferred.\"\n ),\n required=True,\n )\n lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n n_gpu_layers: Optional[int] = Param(\n 0,\n help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n )\n use_mmap: Optional[bool] = Param(\n True,\n help=(),\n )\n vocab_only: Optional[bool] = Param(\n False,\n help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n )\n\n _role_mapper: dict[str, str] = {\n \"human\": \"user\",\n \"system\": \"system\",\n \"ai\": \"assistant\",\n }\n\n @Param.auto()\n def client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[dict]:\n input_: list[BaseMessage] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n output_ = [\n {\"role\": self._role_mapper[each.type], \"content\": each.content}\n for each in input_\n ]\n\n return output_\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> LLMInterface:\n\n pred: \"CCCR\" = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=False,\n )\n\n return LLMInterface(\n content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n candidates=[\n c[\"message\"][\"content\"]\n for c in pred[\"choices\"]\n if c[\"message\"][\"content\"]\n ],\n completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n total_tokens=pred[\"usage\"][\"total_tokens\"],\n prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n )\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n ) -> Iterator[LLMInterface]:\n pred = self.client_object.create_chat_completion(\n messages=self.prepare_message(messages),\n stream=True,\n )\n for chunk in pred:\n if not chunk[\"choices\"]:\n continue\n\n if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n continue\n\n yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
"},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat.client_object","title":"client_object","text":"client_object()\n
Get the llama-cpp-python client object
Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
@Param.auto()\ndef client_object(self) -> \"Llama\":\n \"\"\"Get the llama-cpp-python client object\"\"\"\n try:\n from llama_cpp import Llama\n except ImportError:\n raise ImportError(\n \"llama-cpp-python is not installed. \"\n \"Please install it using `pip install llama-cpp-python`\"\n )\n\n errors = []\n if not self.model_path and (not self.repo_id or not self.filename):\n errors.append(\n \"- `model_path` or `repo_id` and `filename` are required to load the\"\n \" model\"\n )\n\n if not self.chat_format:\n errors.append(\n \"- `chat_format` is required to know how to format the chat messages. \"\n \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n \"formats.\"\n )\n if errors:\n raise ValueError(\"\\n\".join(errors))\n\n if self.model_path:\n return Llama(\n model_path=cast(str, self.model_path),\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n else:\n return Llama.from_pretrained(\n repo_id=self.repo_id,\n filename=self.filename,\n chat_format=self.chat_format,\n lora_base=self.lora_base,\n n_ctx=self.n_ctx,\n n_gpu_layers=self.n_gpu_layers,\n use_mmap=self.use_mmap,\n vocab_only=self.vocab_only,\n )\n
"},{"location":"reference/llms/chats/openai/","title":"Openai","text":""},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI","title":"BaseChatOpenAI","text":" Bases: ChatLLM
Base interface for OpenAI chat model, using the openai library
This class exposes the parameters in resources.Chat. To subclass this class:
- Implement the `prepare_client` method to return the OpenAI client\n- Implement the `openai_response` method to return the OpenAI response\n- Implement the params related to the OpenAI client\n
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class BaseChatOpenAI(ChatLLM):\n \"\"\"Base interface for OpenAI chat model, using the openai library\n\n This class exposes the parameters in resources.Chat. To subclass this class:\n\n - Implement the `prepare_client` method to return the OpenAI client\n - Implement the `openai_response` method to return the OpenAI response\n - Implement the params related to the OpenAI client\n \"\"\"\n\n _dependencies = [\"openai\"]\n _capabilities = [\"chat\", \"text\"] # consider as mixin\n\n api_key: str = Param(help=\"API key\", required=True)\n timeout: Optional[float] = Param(None, help=\"Timeout for the API request\")\n max_retries: Optional[int] = Param(\n None, help=\"Maximum number of retries for the API request\"\n )\n\n temperature: Optional[float] = Param(\n None,\n help=(\n \"Number between 0 and 2 that controls the randomness of the generated \"\n \"tokens. Lower values make the model more deterministic, while higher \"\n \"values make the model more random.\"\n ),\n )\n max_tokens: Optional[int] = Param(\n None,\n help=(\n \"Maximum number of tokens to generate. The total length of input tokens \"\n \"and generated tokens is limited by the model's context length.\"\n ),\n )\n n: int = Param(\n 1,\n help=(\n \"Number of completions to generate. The API will generate n completions \"\n \"for each prompt.\"\n ),\n )\n stop: Optional[str | list[str]] = Param(\n None,\n help=(\n \"Stop sequence. If a stop sequence is detected, generation will stop \"\n \"at that point. If not specified, generation will continue until the \"\n \"maximum token length is reached.\"\n ),\n )\n frequency_penalty: Optional[float] = Param(\n None,\n help=(\n \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n \"based on their existing frequency in the text so far, decreasing the \"\n \"model's likelihood of repeating the same text.\"\n ),\n )\n presence_penalty: Optional[float] = Param(\n None,\n help=(\n \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n \"based on their existing presence in the text so far, decreasing the \"\n \"model's likelihood of repeating the same text.\"\n ),\n )\n tool_choice: Optional[str] = Param(\n None,\n help=(\n \"Choice of tool to use for the completion. Available choices are: \"\n \"auto, default.\"\n ),\n )\n tools: Optional[list[str]] = Param(\n None,\n help=\"List of tools to use for the completion.\",\n )\n logprobs: Optional[bool] = Param(\n None,\n help=(\n \"Include log probabilities on the logprobs most likely tokens, \"\n \"as well as the chosen token.\"\n ),\n )\n logit_bias: Optional[dict] = Param(\n None,\n help=(\n \"Dictionary of logit bias values to add to the logits of the tokens \"\n \"in the vocabulary.\"\n ),\n )\n top_logprobs: Optional[int] = Param(\n None,\n help=(\n \"An integer between 0 and 5 specifying the number of most likely tokens \"\n \"to return at each token position, each with an associated log \"\n \"probability. `logprobs` must also be set to `true` if this parameter \"\n \"is used.\"\n ),\n )\n top_p: Optional[float] = Param(\n None,\n help=(\n \"An alternative to sampling with temperature, called nucleus sampling, \"\n \"where the model considers the results of the token with top_p \"\n \"probability mass. So 0.1 means that only the tokens comprising the \"\n \"top 10% probability mass are considered.\"\n ),\n )\n\n @Param.auto(depends_on=[\"max_retries\"])\n def max_retries_(self):\n if self.max_retries is None:\n from openai._constants import DEFAULT_MAX_RETRIES\n\n return DEFAULT_MAX_RETRIES\n return self.max_retries\n\n def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n ) -> list[\"ChatCompletionMessageParam\"]:\n \"\"\"Prepare the message into OpenAI format\n\n Returns:\n list[dict]: List of messages in OpenAI format\n \"\"\"\n input_: list[BaseMessage] = []\n output_: list[\"ChatCompletionMessageParam\"] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n for message in input_:\n output_.append(message.to_openai_format())\n\n return output_\n\n def prepare_output(self, resp: dict) -> LLMInterface:\n \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n additional_kwargs = {}\n if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n \"tool_calls\"\n ]\n\n if resp[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n logprobs = (\n [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n )\n\n output = LLMInterface(\n candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n total_tokens=resp[\"usage\"][\"total_tokens\"],\n prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n additional_kwargs=additional_kwargs,\n messages=[\n AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n for _ in resp[\"choices\"]\n ],\n logprobs=logprobs,\n )\n\n return output\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n\n def invoke(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> LLMInterface:\n client = self.prepare_client(async_version=False)\n input_messages = self.prepare_message(messages)\n resp = self.openai_response(\n client, messages=input_messages, stream=False, **kwargs\n ).dict()\n return self.prepare_output(resp)\n\n async def ainvoke(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> LLMInterface:\n client = self.prepare_client(async_version=True)\n input_messages = self.prepare_message(messages)\n # await the async client call before converting the response to a dict\n resp = (\n await self.openai_response(\n client, messages=input_messages, stream=False, **kwargs\n )\n ).dict()\n\n return self.prepare_output(resp)\n\n def stream(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> Iterator[LLMInterface]:\n client = self.prepare_client(async_version=False)\n input_messages = self.prepare_message(messages)\n resp = self.openai_response(\n client, messages=input_messages, stream=True, **kwargs\n )\n\n for c in resp:\n chunk = c.dict()\n if not chunk[\"choices\"]:\n continue\n if chunk[\"choices\"][0][\"delta\"][\"content\"] is not None:\n if chunk[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n logprobs = [\n logprob[\"logprob\"]\n for logprob in chunk[\"choices\"][0][\"logprobs\"].get(\n \"content\", []\n )\n ]\n\n yield LLMInterface(\n content=chunk[\"choices\"][0][\"delta\"][\"content\"], logprobs=logprobs\n )\n\n async def astream(\n self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n ) -> AsyncGenerator[LLMInterface, None]:\n client = self.prepare_client(async_version=True)\n input_messages = self.prepare_message(messages)\n # the async client call must be awaited to obtain the async stream\n resp = await self.openai_response(\n client, messages=input_messages, stream=True, **kwargs\n )\n\n async for chunk in resp:\n if not chunk.choices:\n continue\n if chunk.choices[0].delta.content is not None:\n yield LLMInterface(content=chunk.choices[0].delta.content)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_message","title":"prepare_message","text":"prepare_message(messages)\n
Prepare the message into OpenAI format
Returns:
Type Descriptionlist[ChatCompletionMessageParam]
list[dict]: List of messages in OpenAI format
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_message(\n self, messages: str | BaseMessage | list[BaseMessage]\n) -> list[\"ChatCompletionMessageParam\"]:\n \"\"\"Prepare the message into OpenAI format\n\n Returns:\n list[dict]: List of messages in OpenAI format\n \"\"\"\n input_: list[BaseMessage] = []\n output_: list[\"ChatCompletionMessageParam\"] = []\n\n if isinstance(messages, str):\n input_ = [HumanMessage(content=messages)]\n elif isinstance(messages, BaseMessage):\n input_ = [messages]\n else:\n input_ = messages\n\n for message in input_:\n output_.append(message.to_openai_format())\n\n return output_\n
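For example (a sketch; the exact dict layout comes from BaseMessage.to_openai_format, and the constructor values are placeholders):
from kotaemon.llms.chats.openai import ChatOpenAI\n\nllm = ChatOpenAI(api_key='<key>', model='<model>')\nprint(llm.prepare_message('Hello'))  # typically [{'role': 'user', 'content': 'Hello'}]\n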
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_output","title":"prepare_output","text":"prepare_output(resp)\n
Convert the OpenAI response into LLMInterface
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_output(self, resp: dict) -> LLMInterface:\n \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n additional_kwargs = {}\n if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n \"tool_calls\"\n ]\n\n if resp[\"choices\"][0].get(\"logprobs\") is None:\n logprobs = []\n else:\n all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n logprobs = (\n [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n )\n\n output = LLMInterface(\n candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n total_tokens=resp[\"usage\"][\"total_tokens\"],\n prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n additional_kwargs=additional_kwargs,\n messages=[\n AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n for _ in resp[\"choices\"]\n ],\n logprobs=logprobs,\n )\n\n return output\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n raise NotImplementedError\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n raise NotImplementedError\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI","title":"ChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class ChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model\"\"\"\n\n base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n model: str = Param(help=\"OpenAI model\", required=True)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"api_key\": self.api_key,\n \"organization\": self.organization,\n \"base_url\": self.base_url,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncOpenAI\n\n return AsyncOpenAI(**params)\n\n from openai import OpenAI\n\n return OpenAI(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.model,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI","title":"AzureChatOpenAI","text":" Bases: BaseChatOpenAI
OpenAI chat model provided by Microsoft Azure
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class AzureChatOpenAI(BaseChatOpenAI):\n \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n azure_endpoint: str = Param(\n help=(\n \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n \"azure_deployment, and api_version parameters are used to construct \"\n \"the full URL for the Azure OpenAI model.\"\n ),\n required=True,\n )\n azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n api_version: str = Param(help=\"Azure model version\", required=True)\n azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n def azure_ad_token_provider_(self):\n if isinstance(self.azure_ad_token_provider, str):\n return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n\n def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"prepare_client(async_version=False)\n
Get the OpenAI client
Parameters:
Name Type Description Defaultasync_version
bool
Whether to get the async version of the client
False
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def prepare_client(self, async_version: bool = False):\n \"\"\"Get the OpenAI client\n\n Args:\n async_version (bool): Whether to get the async version of the client\n \"\"\"\n params = {\n \"azure_endpoint\": self.azure_endpoint,\n \"api_version\": self.api_version,\n \"api_key\": self.api_key,\n \"azure_ad_token\": self.azure_ad_token,\n \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n \"timeout\": self.timeout,\n \"max_retries\": self.max_retries_,\n }\n if async_version:\n from openai import AsyncAzureOpenAI\n\n return AsyncAzureOpenAI(**params)\n\n from openai import AzureOpenAI\n\n return AzureOpenAI(**params)\n
"},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.openai_response","title":"openai_response","text":"openai_response(client, **kwargs)\n
Get the openai response
Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
def openai_response(self, client, **kwargs):\n \"\"\"Get the openai response\"\"\"\n if \"tools_pydantic\" in kwargs:\n kwargs.pop(\"tools_pydantic\")\n\n params_ = {\n \"model\": self.azure_deployment,\n \"temperature\": self.temperature,\n \"max_tokens\": self.max_tokens,\n \"n\": self.n,\n \"stop\": self.stop,\n \"frequency_penalty\": self.frequency_penalty,\n \"presence_penalty\": self.presence_penalty,\n \"tool_choice\": self.tool_choice,\n \"tools\": self.tools,\n \"logprobs\": self.logprobs,\n \"logit_bias\": self.logit_bias,\n \"top_logprobs\": self.top_logprobs,\n \"top_p\": self.top_p,\n }\n params = {k: v for k, v in params_.items() if v is not None}\n params.update(kwargs)\n\n return client.chat.completions.create(**params)\n
"},{"location":"reference/llms/completions/","title":"Completions","text":""},{"location":"reference/llms/completions/#llms.completions.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
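A minimal construction sketch (all values are placeholders for your own Azure resource):
from kotaemon.llms.completions import AzureOpenAI\n\nllm = AzureOpenAI(\n    azure_endpoint='https://<resource-name>.openai.azure.com/',\n    deployment_name='<deployment-name>',\n    openai_api_version='2024-02-01',\n    openai_api_key='<azure-openai-key>',\n)\n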
"},{"location":"reference/llms/completions/#llms.completions.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
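A minimal construction sketch (the model path is a placeholder for a local GGUF file):
from kotaemon.llms.completions import LlamaCpp\n\nllm = LlamaCpp(\n    model_path='/path/to/model.gguf',  # placeholder\n    n_ctx=2048,\n    n_gpu_layers=-1,  # offload all layers when a GPU build is available\n)\n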
"},{"location":"reference/llms/completions/#llms.completions.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
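A minimal construction sketch (key and model name are placeholders; any completion-style model name accepted by your endpoint works):
from kotaemon.llms.completions import OpenAI\n\nllm = OpenAI(\n    openai_api_key='<openai-key>',\n    model_name='gpt-3.5-turbo-instruct',  # illustrative completion model\n    temperature=0.0,\n)\n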
"},{"location":"reference/llms/completions/base/","title":"Base","text":""},{"location":"reference/llms/completions/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.OpenAI","title":"OpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's OpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class OpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n openai_api_key: Optional[str] = None,\n openai_api_base: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n openai_api_key=openai_api_key,\n openai_api_base=openai_api_base,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import OpenAI\n except ImportError:\n from langchain.llms import OpenAI\n\n return OpenAI\n
"},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.AzureOpenAI","title":"AzureOpenAI","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's AzureOpenAI class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class AzureOpenAI(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n azure_endpoint: Optional[str] = None,\n deployment_name: Optional[str] = None,\n openai_api_version: str = \"\",\n openai_api_key: Optional[str] = None,\n model_name: str = \"text-davinci-003\",\n temperature: float = 0.7,\n max_tokens: int = 256,\n top_p: float = 1,\n frequency_penalty: float = 0,\n n: int = 1,\n best_of: int = 1,\n request_timeout: Optional[float] = None,\n max_retries: int = 2,\n streaming: bool = False,\n **params,\n ):\n super().__init__(\n azure_endpoint=azure_endpoint,\n deployment_name=deployment_name,\n openai_api_version=openai_api_version,\n openai_api_key=openai_api_key,\n model_name=model_name,\n temperature=temperature,\n max_tokens=max_tokens,\n top_p=top_p,\n frequency_penalty=frequency_penalty,\n n=n,\n best_of=best_of,\n request_timeout=request_timeout,\n max_retries=max_retries,\n streaming=streaming,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_openai import AzureOpenAI\n except ImportError:\n from langchain.llms import AzureOpenAI\n\n return AzureOpenAI\n
"},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.LlamaCpp","title":"LlamaCpp","text":" Bases: LCCompletionMixin
, LLM
Wrapper around Langchain's LlamaCpp class, focusing on key parameters
Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
class LlamaCpp(LCCompletionMixin, LLM):\n \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n def __init__(\n self,\n model_path: str,\n lora_base: Optional[str] = None,\n n_ctx: int = 512,\n n_gpu_layers: Optional[int] = None,\n use_mmap: bool = True,\n **params,\n ):\n super().__init__(\n model_path=model_path,\n lora_base=lora_base,\n n_ctx=n_ctx,\n n_gpu_layers=n_gpu_layers,\n use_mmap=use_mmap,\n **params,\n )\n\n def _get_lc_class(self):\n try:\n from langchain_community.llms import LlamaCpp\n except ImportError:\n from langchain.llms import LlamaCpp\n\n return LlamaCpp\n
"},{"location":"reference/llms/prompts/","title":"Prompts","text":""},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
Name Type Description Defaulttemplate
PromptTemplate
The prompt template.
required**kwargs
Any additional keyword arguments that will be used to populate the given template.
{}
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Warns:\n UserWarning: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
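For example, a small sketch of populating a template through the component (import path assumed from this page's module location llms.prompts):
from kotaemon.llms.prompts import BasePromptComponent\n\nprompt = BasePromptComponent(template='Translate {text} into {lang}.')\ndoc = prompt.run(text='xin chao', lang='English')\nprint(doc.text)  # Translate xin chao into English.\n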
"},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set
but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
Name Type Description Defaultkwargs
dict
A dictionary with the attribute names as keys and the new values as values.
{}
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to pass to the function.
{}
Returns:
Type DescriptionThe result of calling the populate
method of the template
object
with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks that every placeholder expected by the template is\n provided in the keyword arguments. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n **kwargs: The keyword arguments to check against the template's placeholders.\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for provided keyword arguments that are not placeholders in the template.\n\n This function compares the provided keyword arguments against the template's\n placeholders. If any redundant keys are found, a `UserWarning` is issued\n listing those keys.\n\n Parameters:\n **kwargs: The keyword arguments to check against the template's placeholders.\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks if all the expected placeholders in the template are set as attributes of the object. If any placeholders are missing, a ValueError
is raised with the names of the missing keys.
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check whether any provided keyword arguments are redundant.
This function checks whether any provided keyword arguments do not correspond to a placeholder in the template. If redundant keys are found, a UserWarning is issued with the names of the redundant keys.
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check whether any provided keyword arguments are redundant.\n\n This function checks whether any provided keyword arguments do not correspond\n to a placeholder in the template. If redundant keys are found, a `UserWarning`\n is issued with the names of the redundant keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Type Descriptionstr
The populated template.
Raises:
Type DescriptionValueError
If a template placeholder is missing from the provided keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Name Type Descriptionstr
The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
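Example (a minimal sketch of strict vs. partial population; assumes PromptTemplate is exported from kotaemon.llms, and the template string is illustrative):
from kotaemon.llms import PromptTemplate\n\ntemplate = PromptTemplate(\"Hello {name}, you are {age} years old\")\n# strict: raises ValueError if any placeholder is left unset\ntext = template.populate(name=\"Alice\", age=30)\n# partial: fills what it can and keeps the remaining placeholders\ndraft = template.partial_populate(name=\"Alice\")\n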
"},{"location":"reference/llms/prompts/base/","title":"Base","text":""},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent","title":"BasePromptComponent","text":" Bases: BaseComponent
Base class for prompt components.
Parameters:
Name Type Description Defaulttemplate
PromptTemplate
The prompt template.
required**kwargs
Any additional keyword arguments that will be used to populate the given template.
{}
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
class BasePromptComponent(BaseComponent):\n \"\"\"\n Base class for prompt components.\n\n Args:\n template (PromptTemplate): The prompt template.\n **kwargs: Any additional keyword arguments that will be used to populate the\n given template.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n allow_extra = True\n\n template: str | PromptTemplate\n\n @Param.auto(depends_on=\"template\")\n def template__(self):\n return (\n self.template\n if isinstance(self.template, PromptTemplate)\n else PromptTemplate(self.template)\n )\n\n def __init__(self, **kwargs):\n super().__init__(**kwargs)\n self.__set(**kwargs)\n\n def __check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check for redundant keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments.\n\n Raises:\n ValueError: If any keys provided are not in the template.\n\n Returns:\n None\n \"\"\"\n self.template__.check_redundant_kwargs(**kwargs)\n\n def __check_unset_placeholders(self):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n self.template__.check_missing_kwargs(**self.__dict__)\n\n def __validate_value_type(self, **kwargs):\n \"\"\"\n Validates the value types of the given keyword arguments.\n\n Parameters:\n **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n Raises:\n ValueError: If any of the values in the kwargs dictionary have an\n unsupported type.\n\n Returns:\n None\n \"\"\"\n type_error = []\n for k, v in kwargs.items():\n if k.startswith(\"template\"):\n continue\n if not isinstance(v, (str, int, Document, Callable)): # type: ignore\n type_error.append((k, type(v)))\n\n if type_error:\n raise ValueError(\n \"Type of values must be either int, str, Document, Callable, \"\n f\"found unsupported type for (key, type): {type_error}\"\n )\n\n def __set(self, **kwargs):\n \"\"\"\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__check_redundant_kwargs(**kwargs)\n self.__validate_value_type(**kwargs)\n\n self.__dict__.update(kwargs)\n\n def __prepare_value(self):\n \"\"\"\n Generate a dictionary of keyword arguments based on the template's placeholders\n and the current instance's attributes.\n\n Returns:\n dict: A dictionary of keyword arguments.\n \"\"\"\n\n def __prepare(key, value):\n if isinstance(value, str):\n return value\n if isinstance(value, (int, Document)):\n return str(value)\n\n raise ValueError(\n f\"Unsupported type {type(value)} for template value of key {key}\"\n )\n\n kwargs = {}\n for k in self.template__.placeholders:\n v = getattr(self, k)\n\n # if get a callable, execute to get its output\n if isinstance(v, Callable): # type: ignore[arg-type]\n v = v()\n\n if isinstance(v, list):\n v = str([__prepare(k, each) for each in v])\n elif isinstance(v, (str, int, Document)):\n v = __prepare(k, v)\n else:\n raise ValueError(\n f\"Unsupported type {type(v)} for template value of key `{k}`\"\n )\n kwargs[k] = v\n\n return kwargs\n\n def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the 
object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n\n def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n def flow(self):\n return self.__call__()\n
"},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.set_value","title":"set_value","text":"set_value(**kwargs)\n
Similar to __set
but for external use.
Set the values of the attributes in the object based on the provided keyword arguments.
Parameters:
Name Type Description Defaultkwargs
dict
A dictionary with the attribute names as keys and the new values as values.
{}
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def set_value(self, **kwargs):\n \"\"\"\n Similar to `__set` but for external use.\n\n Set the values of the attributes in the object based on the provided keyword\n arguments.\n\n Args:\n kwargs (dict): A dictionary with the attribute names as keys and the new\n values as values.\n\n Returns:\n None\n \"\"\"\n self.__set(**kwargs)\n
"},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.run","title":"run","text":"run(**kwargs)\n
Run the function with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to pass to the function.
{}
Returns:
Type DescriptionThe result of calling the populate
method of the template
object
with the given keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
def run(self, **kwargs):\n \"\"\"\n Run the function with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to pass to the function.\n\n Returns:\n The result of calling the `populate` method of the `template` object\n with the given keyword arguments.\n \"\"\"\n self.__set(**kwargs)\n self.__check_unset_placeholders()\n prepared_kwargs = self.__prepare_value()\n\n text = self.template__.populate(**prepared_kwargs)\n return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
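Example (a minimal sketch; assumes BasePromptComponent is exported from kotaemon.llms, and the template and question are illustrative):
from kotaemon.llms import BasePromptComponent\n\nprompt = BasePromptComponent(template=\"Answer briefly: {question}\")\ndoc = prompt.run(question=\"What is kotaemon?\")\nprint(doc.text)  # the populated prompt, wrapped in a Document\n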
"},{"location":"reference/llms/prompts/template/","title":"Template","text":""},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate","title":"PromptTemplate","text":"Base class for prompt templates.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
class PromptTemplate:\n \"\"\"\n Base class for prompt templates.\n \"\"\"\n\n def __init__(self, template: str, ignore_invalid=True):\n template = template\n formatter = Formatter()\n parsed_template = list(formatter.parse(template))\n\n placeholders = set()\n for _, key, _, _ in parsed_template:\n if key is None:\n continue\n if not key.isidentifier():\n if ignore_invalid:\n warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n else:\n raise ValueError(\n \"Placeholder name must be a valid Python identifier, found:\"\n f\" {key}.\"\n )\n placeholders.add(key)\n\n self.template = template\n self.placeholders = placeholders\n self.__formatter = formatter\n self.__parsed_template = parsed_template\n\n def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n\n def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n\n def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n\n def __add__(self, other):\n \"\"\"\n Create a new PromptTemplate object by concatenating the template of the current\n object with the template of another PromptTemplate object.\n\n Parameters:\n other (PromptTemplate): Another PromptTemplate object.\n\n Returns:\n PromptTemplate: A new PromptTemplate object with the 
concatenated templates.\n \"\"\"\n return PromptTemplate(self.template + \"\\n\" + other.template)\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"check_missing_kwargs(**kwargs)\n
Check if all the placeholders in the template are set.
This function checks if all the expected placeholders in the template are set as attributes of the object. If any placeholders are missing, a ValueError
is raised with the names of the missing keys.
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_missing_kwargs(self, **kwargs):\n \"\"\"\n Check if all the placeholders in the template are set.\n\n This function checks if all the expected placeholders in the template are set as\n attributes of the object. If any placeholders are missing, a `ValueError`\n is raised with the names of the missing keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n missing_keys = self.placeholders.difference(kwargs.keys())\n if missing_keys:\n raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"check_redundant_kwargs(**kwargs)\n
Check whether any provided keyword arguments are redundant.
This function checks whether any provided keyword arguments do not correspond to a placeholder in the template. If redundant keys are found, a UserWarning is issued with the names of the redundant keys.
Returns:
Type DescriptionNone
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def check_redundant_kwargs(self, **kwargs):\n \"\"\"\n Check whether any provided keyword arguments are redundant.\n\n This function checks whether any provided keyword arguments do not correspond\n to a placeholder in the template. If redundant keys are found, a `UserWarning`\n is issued with the names of the redundant keys.\n\n Parameters:\n None\n\n Returns:\n None\n \"\"\"\n provided_keys = set(kwargs.keys())\n redundant_keys = provided_keys - self.placeholders\n\n if redundant_keys:\n warnings.warn(\n f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n UserWarning,\n )\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.populate","title":"populate","text":"populate(**kwargs)\n
Strictly populate the template with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Type Descriptionstr
The populated template.
Raises:
Type DescriptionValueError
If a template placeholder is missing from the provided keyword arguments.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def populate(self, **kwargs) -> str:\n \"\"\"\n Strictly populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n The populated template.\n\n Raises:\n ValueError: If an unknown placeholder is provided.\n \"\"\"\n self.check_missing_kwargs(**kwargs)\n\n return self.partial_populate(**kwargs)\n
"},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.partial_populate","title":"partial_populate","text":"partial_populate(**kwargs)\n
Partially populate the template with the given keyword arguments.
Parameters:
Name Type Description Default**kwargs
The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.
{}
Returns:
Name Type Descriptionstr
The populated template.
Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
def partial_populate(self, **kwargs):\n \"\"\"\n Partially populate the template with the given keyword arguments.\n\n Args:\n **kwargs: The keyword arguments to populate the template.\n Each keyword corresponds to a placeholder in the template.\n\n Returns:\n str: The populated template.\n \"\"\"\n self.check_redundant_kwargs(**kwargs)\n\n prompt = []\n for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n prompt.append(literal_text)\n\n if field_name is None:\n continue\n\n if field_name not in kwargs:\n if conversion:\n value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n else:\n value = f\"{{{field_name}:{format_spec}}}\"\n else:\n value = kwargs[field_name]\n if conversion is not None:\n value = self.__formatter.convert_field(value, conversion)\n if format_spec is not None:\n value = self.__formatter.format_field(value, format_spec)\n\n prompt.append(value)\n\n return \"\".join(prompt)\n
"},{"location":"reference/loaders/","title":"Loaders","text":""},{"location":"reference/loaders/#loaders.AdobeReader","title":"AdobeReader","text":" Bases: BaseReader
Read PDFs using Adobe's PDF Services. Able to extract text, tables, and figures with high accuracy
Example>> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Args: endpoint: URL to the Vision Language Model endpoint. If not provided, will use the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT
max_figures_to_caption: an int that decides how many figures will be captioned.\nThe rest will be ignored (indexed without captions).\n
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n \"\"\"Read PDF using the Adobe's PDF Services.\n Be able to extract text, table, and figure with high accuracy\n\n Example:\n ```python\n >> from kotaemon.loaders import AdobeReader\n >> reader = AdobeReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n Args:\n endpoint: URL to the Vision Language Model endpoint. If not provided,\n will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n max_figures_to_caption: an int decides how many figured will be captioned.\n The rest will be ignored (are indexed without captions).\n \"\"\"\n\n def __init__(\n self,\n vlm_endpoint: Optional[str] = None,\n max_figures_to_caption: int = 100,\n *args: Any,\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params\"\"\"\n super().__init__(*args)\n self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n self.max_figures_to_caption = max_figures_to_caption\n\n def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/#loaders.AdobeReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, **kwargs)\n
Load data by calling to the Adobe's API
Parameters:
Name Type Description Defaultfile
Path
Path to the PDF file
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the PDF file, includes 3 types: text, table, and image
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":" Bases: BaseReader
Utilize Azure AI Document Intelligence to parse documents
As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx and html.
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n heif, docx, xlsx, pptx and html.\n \"\"\"\n\n _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n endpoint: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n help=\"Endpoint of Azure AI Document Intelligence\",\n )\n credential: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n help=\"Credential of Azure AI Document Intelligence\",\n )\n model: str = Param(\n \"prebuilt-layout\",\n help=(\n \"Model to use for document analysis. Default is prebuilt-layout. \"\n \"As of April 24, you can view the supported models [here]\"\n \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n \"#model-analysis-features)\"\n ),\n )\n output_content_format: str = Param(\n \"markdown\",\n help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n )\n vlm_endpoint: str = Param(\n help=(\n \"Default VLM endpoint for figure captioning. If not provided, will not \"\n \"caption the figures\"\n )\n )\n figure_friendly_filetypes: list[str] = Param(\n [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n help=(\n \"File types that we can reliably open and extract figures. \"\n \"For files like .docx or .html, the visual layout may be different \"\n \"when viewed from different tools, hence we cannot use Azure DI \"\n \"location to extract figures.\"\n ),\n )\n cache_dir: str = Param(\n None,\n help=\"Directory to cache the downloaded files. Default is None\",\n )\n\n @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n def client_(self):\n try:\n from azure.ai.documentintelligence import DocumentIntelligenceClient\n from azure.core.credentials import AzureKeyCredential\n except ImportError:\n raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n return DocumentIntelligenceClient(\n self.endpoint, AzureKeyCredential(self.credential)\n )\n\n def run(\n self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = 
[polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
"},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Extract the input file, allowing multi-modal extraction
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = [polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
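Example (a minimal sketch; the endpoint and key are placeholders, and the parameters mirror the Params described above):
from kotaemon.loaders import AzureAIDocumentIntelligenceLoader\n\nloader = AzureAIDocumentIntelligenceLoader(\n    endpoint=\"https://<your-resource>.cognitiveservices.azure.com/\",\n    credential=\"<your-api-key>\",\n    output_content_format=\"markdown\",\n)\n# returns the main text Document plus any figure/table Documents\ndocuments = loader.run(\"report.pdf\")\n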
"},{"location":"reference/loaders/#loaders.AutoReader","title":"AutoReader","text":" Bases: BaseReader
General auto reader for a variety of files. (based on llama-hub)
Source code in libs/kotaemon/kotaemon/loaders/base.py
class AutoReader(BaseReader):\n \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n if isinstance(reader_type, str):\n from llama_index.core import download_loader\n\n self._reader = download_loader(reader_type)()\n else:\n self._reader = reader_type()\n super().__init__()\n\n def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(file=file, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n return self.load_data(file=file, **kwargs)\n
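Example (a minimal sketch; "PDFReader" is an assumed llama-hub reader identifier, shown for illustration only):
from kotaemon.loaders import AutoReader\n\nreader = AutoReader(\"PDFReader\")  # downloads the named reader from llama-hub\ndocuments = reader.load_data(file=\"path/to/file.pdf\")\n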
"},{"location":"reference/loaders/#loaders.BaseReader","title":"BaseReader","text":" Bases: BaseComponent
The base class for all readers
Source code in libs/kotaemon/kotaemon/loaders/base.py
class BaseReader(BaseComponent):\n \"\"\"The base class for all readers\"\"\"\n\n ...\n
"},{"location":"reference/loaders/#loaders.DirectoryReader","title":"DirectoryReader","text":" Bases: LIReaderMixin
, BaseReader
Wrap around llama-index SimpleDirectoryReader
Parameters:
Name Type Description Defaultinput_dir
str
Path to the directory.
requiredinput_files
List
List of file paths to read (Optional; overrides input_dir, exclude)
requiredexclude
List
glob of python file paths to exclude (Optional)
requiredexclude_hidden
bool
Whether to exclude hidden files (dotfiles).
requiredencoding
str
Encoding of the files. Default is utf-8.
requirederrors
str
how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open
requiredrecursive
bool
Whether to recursively search in subdirectories. False by default.
requiredfilename_as_id
bool
Whether to use the filename as the document id. False by default.
requiredrequired_exts
Optional[List[str]]
List of required extensions. Default is None.
requiredfile_extractor
Optional[Dict[str, BaseReader]]
A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
requirednum_files_limit
Optional[int]
Maximum number of files to read. Default is None.
requiredfile_metadata
Optional[Callable[str, Dict]]
A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.
required Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
class DirectoryReader(LIReaderMixin, BaseReader):\n \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n Args:\n input_dir (str): Path to the directory.\n input_files (List): List of file paths to read\n (Optional; overrides input_dir, exclude)\n exclude (List): glob of python file paths to exclude (Optional)\n exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n encoding (str): Encoding of the files.\n Default is utf-8.\n errors (str): how encoding and decoding errors are to be handled,\n see https://docs.python.org/3/library/functions.html#open\n recursive (bool): Whether to recursively search in subdirectories.\n False by default.\n filename_as_id (bool): Whether to use the filename as the document id.\n False by default.\n required_exts (Optional[List[str]]): List of required extensions.\n Default is None.\n file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n extension to a BaseReader class that specifies how to convert that file\n to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n num_files_limit (Optional[int]): Maximum number of files to read.\n Default is None.\n file_metadata (Optional[Callable[str, Dict]]): A function that takes\n in a filename and returns a Dict of metadata for the Document.\n Default is None.\n \"\"\"\n\n input_dir: Optional[str] = None\n input_files: Optional[List] = None\n exclude: Optional[List] = None\n exclude_hidden: bool = True\n errors: str = \"ignore\"\n recursive: bool = False\n encoding: str = \"utf-8\"\n filename_as_id: bool = False\n required_exts: Optional[list[str]] = None\n file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n num_files_limit: Optional[int] = None\n file_metadata: Optional[Callable[[str], dict]] = None\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n from llama_index.core import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n
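Example (a minimal sketch; the directory path and extensions are illustrative, and it assumes the wrapped llama-index reader is invoked via run()):
from kotaemon.loaders import DirectoryReader\n\nreader = DirectoryReader(input_dir=\"docs/\", recursive=True, required_exts=[\".md\", \".txt\"])\ndocuments = reader.run()\n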
"},{"location":"reference/loaders/#loaders.DocxReader","title":"DocxReader","text":" Bases: BaseReader
Read Docx files with table support, using the python-docx library
Reader behavior: all paragraphs are extracted as a single Document; each table is extracted as a Document rendered as a CSV string; the output is a list of Documents concatenating the above (tables + paragraphs). Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
class DocxReader(BaseReader):\n \"\"\"Read Docx files that respect table, using python-docx library\n\n Reader behavior:\n - All paragraphs are extracted as a Document\n - Each table is extracted as a Document, rendered as a CSV string\n - The output is a list of Documents, concatenating the above\n (tables + paragraphs)\n \"\"\"\n\n def __init__(self, *args, **kwargs):\n try:\n import docx # noqa\n except ImportError:\n raise ImportError(\n \"docx is not installed. \"\n \"Please install it using `pip install python-docx`\"\n )\n\n def _load_single_table(self, table) -> List[List[str]]:\n \"\"\"Extract content from tables. Return a list of columns: list[str]\n Some merged cells will share duplicated content.\n \"\"\"\n n_row = len(table.rows)\n n_col = len(table.columns)\n\n arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n for i, row in enumerate(table.rows):\n for j, cell in enumerate(row.cells):\n arrays[j][i] = cell.text\n\n return arrays\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.DocxReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Docx reader
Parameters:
Name Type Description Defaultfile_path
Path
Path to .docx file
requiredReturns:
Type DescriptionList[Document]
List[Document]: list of documents extracted from the Docx file
Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the Docx file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
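Example (a minimal sketch; the file name is illustrative):
from kotaemon.loaders import DocxReader\n\nreader = DocxReader()\ndocuments = reader.load_data(\"report.docx\")\n# table Documents carry CSV text; the remaining paragraphs form one text Document\n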
"},{"location":"reference/loaders/#loaders.ExcelReader","title":"ExcelReader","text":" Bases: BaseReader
Spreadsheet reader respecting multiple worksheets
Parses Excel workbooks with the pandas read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class ExcelReader(BaseReader):\n r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
"},{"location":"reference/loaders/#loaders.ExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=True,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the file and extract the values from its worksheets.
Parameters:
Name Type Description Defaultfile
Path
The path to the Excel file to read.
requiredinclude_sheetname
bool
Whether to include the sheet name in the output.
True
sheet_name
Union[str, int, None]
The specific sheet to read from, default is None which reads all sheets.
None
Returns:
Type DescriptionList[Document]
List[Document]: A list of Document objects containing the values extracted from the Excel file.
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
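Example (a minimal sketch; the file and sheet names are illustrative):
from kotaemon.loaders import ExcelReader\n\nreader = ExcelReader()\n# sheet_name=None (default) reads every sheet, one Document per sheet\ndocuments = reader.load_data(\"data.xlsx\", sheet_name=\"Sheet1\", include_sheetname=True)\n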
"},{"location":"reference/loaders/#loaders.PandasExcelReader","title":"PandasExcelReader","text":" Bases: BaseReader
Pandas-based Excel parser.
Parses Excel workbooks with the pandas read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class PandasExcelReader(BaseReader):\n r\"\"\"Pandas-based CSV parser.\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
"},{"location":"reference/loaders/#loaders.PandasExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=False,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the file and extract the values from its worksheets.
Parameters:
Name Type Description Defaultfile
Path
The path to the Excel file to read.
requiredinclude_sheetname
bool
Whether to include the sheet name in the output.
False
sheet_name
Union[str, int, None]
The specific sheet to read from, default is None which reads all sheets.
None
Returns:
Type DescriptionList[Document]
List[Document]: A list of Document objects containing the values extracted from the Excel file.
Source code inlibs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse the Excel file and extract its cell values.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of `Document` objects containing the\n values from the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
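Example: a minimal usage sketch for this reader. The spreadsheet path is illustrative, and the import from `kotaemon.loaders` is assumed by analogy with the other readers on this page:
from pathlib import Path\nfrom kotaemon.loaders import PandasExcelReader # assumed export path\n\n# \"data/report.xlsx\" is a placeholder; point it at any local spreadsheet\nreader = PandasExcelReader(pandas_config={\"header\": 0})\ndocs = reader.load_data(Path(\"data/report.xlsx\"), include_sheetname=True)\nprint(docs[0].text[:200]) # rows joined with row_joiner, cells with col_joiner\n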
"},{"location":"reference/loaders/#loaders.HtmlReader","title":"HtmlReader","text":" Bases: BaseReader
Read HTML using html2text
Reader behavior: HTML is read with html2text. The text is split by page_break_pattern. Each page is extracted as a Document. The output is a list of Documents.
Parameters:
Name Type Description Default
page_break_pattern
str
Pattern to split the HTML into pages
None
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class HtmlReader(BaseReader):\n \"\"\"Read HTML using html2text\n\n Reader behavior:\n - HTML is read with html2text.\n - All of the texts will be split by `page_break_pattern`\n - Each page is extracted as a Document\n - The output is a list of Documents\n\n Args:\n page_break_pattern (str): Pattern to split the HTML into pages\n \"\"\"\n\n def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n try:\n import html2text # noqa\n except ImportError:\n raise ImportError(\n \"html2text is not installed. \"\n \"Please install it using `pip install html2text`\"\n )\n\n self._page_break_pattern: Optional[str] = page_break_pattern\n super().__init__()\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
"},{"location":"reference/loaders/#loaders.HtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Html reader
Parameters:
Name Type Description Default
file_path
Path | str
path to HTML file
required
extra_info
Optional[dict]
extra information passed to this reader during extracting data
None
Returns:
Type Description
list[Document]
list[Document]: list of documents extracted from the HTML file
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
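Example: a minimal usage sketch (the file name is illustrative; the import path is assumed by analogy with the other readers on this page):
from kotaemon.loaders import HtmlReader # assumed export path\n\n# split pages on form feeds if the HTML contains them; None keeps a single page\nreader = HtmlReader(page_break_pattern=\"\\f\")\ndocs = reader.load_data(\"page.html\", extra_info={\"source\": \"page.html\"})\nfor doc in docs:\n print(doc.metadata[\"page_label\"], len(doc.text))\n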
"},{"location":"reference/loaders/#loaders.MhtmlReader","title":"MhtmlReader","text":" Bases: BaseReader
Parse MHTML files with BeautifulSoup.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class MhtmlReader(BaseReader):\n \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n def __init__(\n self,\n cache_dir: Optional[str] = getattr(\n flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n ),\n open_encoding: Optional[str] = None,\n bs_kwargs: Optional[dict] = None,\n get_text_separator: str = \"\",\n ) -> None:\n \"\"\"Initialize with path, and optionally, file encoding to use, and any kwargs\n to pass to the BeautifulSoup object.\n\n Args:\n cache_dir: Path for markdown format.\n file_path: Path to file to load.\n open_encoding: The encoding to use when opening the file.\n bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n get_text_separator: The separator to use when getting the text\n from the soup.\n \"\"\"\n try:\n import bs4 # noqa:F401\n except ImportError:\n raise ImportError(\n \"beautifulsoup4 package not found, please install it with \"\n \"`pip install beautifulsoup4`\"\n )\n\n self.cache_dir = cache_dir\n self.open_encoding = open_encoding\n if bs_kwargs is None:\n bs_kwargs = {\"features\": \"lxml\"}\n self.bs_kwargs = bs_kwargs\n self.get_text_separator = get_text_separator\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
"},{"location":"reference/loaders/#loaders.MhtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load MHTML document into document objects.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
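Example: a minimal usage sketch (file name illustrative; import path assumed). When `cache_dir` is set, the reader also writes a markdown copy of the page there, so the directory must already exist:
from kotaemon.loaders import MhtmlReader # assumed export path\n\nreader = MhtmlReader(cache_dir=\"./markdown_cache\") # directory must already exist\ndocs = reader.load_data(\"saved_page.mhtml\")\nprint(docs[0].metadata[\"title\"], len(docs[0].text))\n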
"},{"location":"reference/loaders/#loaders.MathpixPDFReader","title":"MathpixPDFReader","text":" Bases: BaseReader
Load PDF files using Mathpix service.
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n def __init__(\n self,\n processed_file_format: str = \"md\",\n max_wait_time_seconds: int = 500,\n should_clean_pdf: bool = True,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize with a file path.\n\n Args:\n processed_file_format: a format of the processed file. Default is \"md\".\n max_wait_time_seconds: a maximum time to wait for the response from\n the server. Default is 500.\n should_clean_pdf: a flag to clean the PDF file. Default is True.\n **kwargs: additional keyword arguments.\n \"\"\"\n self.mathpix_api_key = get_from_dict_or_env(\n kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n )\n self.mathpix_api_id = get_from_dict_or_env(\n kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n )\n self.processed_file_format = processed_file_format\n self.max_wait_time_seconds = max_wait_time_seconds\n self.should_clean_pdf = should_clean_pdf\n super().__init__()\n\n @property\n def _mathpix_headers(self) -> Dict[str, str]:\n return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n @property\n def url(self) -> str:\n return \"https://api.mathpix.com/v3/pdf\"\n\n @property\n def data(self) -> dict:\n options = {\n \"conversion_formats\": {self.processed_file_format: True},\n \"enable_tables_fallback\": True,\n }\n return {\"options_json\": json.dumps(options)}\n\n def send_pdf(self, file_path) -> str:\n with open(file_path, \"rb\") as f:\n files = {\"file\": f}\n response = requests.post(\n self.url, headers=self._mathpix_headers, files=files, data=self.data\n )\n response_data = response.json()\n if \"pdf_id\" in response_data:\n pdf_id = response_data[\"pdf_id\"]\n return pdf_id\n else:\n raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n\n def get_processed_pdf(self, pdf_id: str) -> str:\n self.wait_for_processing(pdf_id)\n url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n response = requests.get(url, headers=self._mathpix_headers)\n return response.content.decode(\"utf-8\")\n\n def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n if \"response_content\" in kwargs:\n # overriding response content if specified\n content = kwargs[\"response_content\"]\n else:\n # call original API\n pdf_id = self.send_pdf(file_path)\n content = self.get_processed_pdf(pdf_id)\n\n if self.should_clean_pdf:\n content = self.clean_pdf(content)\n tables, texts = parse_markdown_text_to_tables(content)\n documents = []\n for table in tables:\n text = strip_special_chars_markdown(table)\n metadata = {\n \"table_origin\": table,\n \"type\": \"table\",\n }\n if extra_info:\n metadata.update(extra_info)\n documents.append(\n Document(\n text=text,\n metadata=metadata,\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n for text in texts:\n metadata = {\"source\": file_path.name, \"type\": \"text\"}\n documents.append(Document(text=text, metadata=metadata))\n\n return documents\n
"},{"location":"reference/loaders/#loaders.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"wait_for_processing(pdf_id)\n
Wait for processing to complete.
Parameters:
Name Type Description Default
pdf_id
str
a PDF id.
required
Returns: None
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n
"},{"location":"reference/loaders/#loaders.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"clean_pdf(contents)\n
Clean the PDF file.
Parameters:
Name Type Description Default
contents
str
a PDF file contents.
required
Returns:
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n
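Example: a minimal usage sketch. Credentials can come from the `MATHPIX_API_ID`/`MATHPIX_API_KEY` environment variables or the matching constructor kwargs; the values and the PDF path below are placeholders:
import os\nfrom pathlib import Path\nfrom kotaemon.loaders import MathpixPDFReader # assumed export path\n\nos.environ[\"MATHPIX_API_ID\"] = \"your-app-id\" # placeholder credentials\nos.environ[\"MATHPIX_API_KEY\"] = \"your-app-key\"\n\nreader = MathpixPDFReader(should_clean_pdf=True)\ndocs = reader.load_data(Path(\"paper.pdf\"))\ntables = [d for d in docs if d.metadata.get(\"type\") == \"table\"]\n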
"},{"location":"reference/loaders/#loaders.ImageReader","title":"ImageReader","text":" Bases: BaseReader
Read PDF using OCR, with high focus on table extraction
Example
>> from kotaemon.loaders import ImageReader\n>> reader = ImageReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
Name Type Description Default
endpoint
Optional[str]
URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT
or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT
(http://127.0.0.1:8000/v2/ai/infer/)
None
use_ocr
whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted.
required
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class ImageReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from kotaemon.loaders import ImageReader\n >> reader = ImageReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/#loaders.ImageReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
Name Type Description Default
file_path
Path
Path to PDF file
required
debug_path
Path
Path to store debug image output
required
artifact_path
Path
Path to OCR endpoints artifacts directory
required
Returns:
Type Description
List[Document]
List[Document]: list of documents extracted from the PDF file
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
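Example: a minimal usage sketch. It assumes a FullOCR service is already running at the documented default endpoint, and the PDF path is a placeholder:
from pathlib import Path\nfrom kotaemon.loaders import ImageReader # assumed export path\n\n# the endpoint below is the documented default; point it at your OCR service\nreader = ImageReader(endpoint=\"http://127.0.0.1:8000/v2/ai/infer/\")\ndocs = reader.load_data(Path(\"scanned.pdf\"))\nprint(len(docs), \"OCR results\")\n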
"},{"location":"reference/loaders/#loaders.OCRReader","title":"OCRReader","text":" Bases: BaseReader
Read PDF using OCR, with high focus on table extraction
Example
>> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
Name Type Description Default
endpoint
Optional[str]
URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT
or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT
(http://127.0.0.1:8000/v2/ai/infer/)
None
use_ocr
whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted.
True
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class OCRReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from kotaemon.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n self.use_ocr = use_ocr\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.OCRReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
Name Type Description Default
file_path
Path
Path to PDF file
required
debug_path
Path
Path to store debug image output
required
artifact_path
Path
Path to OCR endpoints artifacts directory
required
Returns:
Type Description
List[Document]
List[Document]: list of documents extracted from the PDF file
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
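Example: a minimal usage sketch mirroring the docstring example above; it likewise assumes a running FullOCR endpoint, and the PDF path is a placeholder:
from pathlib import Path\nfrom kotaemon.loaders import OCRReader\n\nreader = OCRReader(use_ocr=False) # tables only; use_ocr=True also reads free text\ndocs = reader.load_data(Path(\"report.pdf\"), extra_info={\"source\": \"report.pdf\"})\ntables = [d for d in docs if d.metadata.get(\"type\") == \"table\"]\n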
"},{"location":"reference/loaders/#loaders.PDFThumbnailReader","title":"PDFThumbnailReader","text":" Bases: PDFReader
PDF parser with thumbnail for each page.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
class PDFThumbnailReader(PDFReader):\n \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"\n Initialize PDFReader.\n \"\"\"\n super().__init__(return_full_document=False)\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n ) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/#loaders.PDFThumbnailReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, fs=None)\n
Parse file.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
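Example: a minimal usage sketch (PDF path illustrative; import path assumed):
from pathlib import Path\nfrom kotaemon.loaders import PDFThumbnailReader # assumed export path\n\nreader = PDFThumbnailReader()\ndocs = reader.load_data(Path(\"slides.pdf\"))\nthumbnails = [d for d in docs if d.metadata.get(\"type\") == \"thumbnail\"]\nprint(len(thumbnails), \"page thumbnails\")\n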
"},{"location":"reference/loaders/#loaders.UnstructuredReader","title":"UnstructuredReader","text":" Bases: BaseReader
General unstructured text reader for a variety of files.
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n def __init__(self, *args: Any, **kwargs: Any) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args) # not passing kwargs to parent bc it cannot accept it\n\n self.api = False # we default to local\n if \"url\" in kwargs:\n self.server_url = str(kwargs[\"url\"])\n self.api = True # is url was set, switch to api\n else:\n self.server_url = \"http://localhost:8000\"\n\n if \"api\" in kwargs:\n self.api = kwargs[\"api\"]\n\n self.api_key = \"\"\n if \"api_key\" in kwargs:\n self.api_key = kwargs[\"api_key\"]\n\n \"\"\" Loads data using Unstructured.io\n\n Depending on the construction if url is set or api = True\n it'll parse file using API call, else parse it locally\n additional_metadata is extended by the returned metadata if\n split_documents is True\n\n Returns list of documents\n \"\"\"\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n ) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
"},{"location":"reference/loaders/#loaders.UnstructuredReader.load_data","title":"load_data","text":"load_data(\n file, extra_info=None, split_documents=False, **kwargs\n)\n
If api is set, parse through api
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
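Example: a minimal usage sketch (file path illustrative; import path assumed). Local parsing is the default; pass url=... or api=True at construction to use a hosted Unstructured API instead:
from pathlib import Path\nfrom kotaemon.loaders import UnstructuredReader # assumed export path\n\nreader = UnstructuredReader() # parses locally by default\ndocs = reader.load_data(Path(\"contract.docx\"), split_documents=True)\nprint(len(docs), \"elements;\", docs[0].metadata[\"file_name\"])\n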
"},{"location":"reference/loaders/adobe_loader/","title":"Adobe Loader","text":""},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader","title":"AdobeReader","text":" Bases: BaseReader
Read PDF using Adobe's PDF Services. Able to extract text, tables, and figures with high accuracy.
Example
>> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Args:
endpoint: URL to the Vision Language Model endpoint. If not provided, will use the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT
max_figures_to_caption: an int that decides how many figures will be captioned.\nThe rest will be ignored (indexed without captions).\n
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n \"\"\"Read PDF using Adobe's PDF Services.\n Able to extract text, tables, and figures with high accuracy\n\n Example:\n ```python\n >> from kotaemon.loaders import AdobeReader\n >> reader = AdobeReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n Args:\n endpoint: URL to the Vision Language Model endpoint. If not provided,\n will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n max_figures_to_caption: an int that decides how many figures will be captioned.\n The rest will be ignored (indexed without captions).\n \"\"\"\n\n def __init__(\n self,\n vlm_endpoint: Optional[str] = None,\n max_figures_to_caption: int = 100,\n *args: Any,\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params\"\"\"\n super().__init__(*args)\n self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n self.max_figures_to_caption = max_figures_to_caption\n\n def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
"},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, **kwargs)\n
Load data by calling to the Adobe's API
Parameters:
Name Type Description Default
file
Path
Path to the PDF file
required
Returns:
Type Description
List[Document]
List[Document]: list of documents extracted from the PDF file, includes 3 types: text, table, and image
Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data by calling to the Adobe's API\n\n Args:\n file (Path): Path to the PDF file\n\n Returns:\n List[Document]: list of documents extracted from the PDF file,\n includes 3 types: text, table, and image\n\n \"\"\"\n from .utils.adobe import (\n generate_figure_captions,\n load_json,\n parse_figure_paths,\n parse_table_paths,\n request_adobe_service,\n )\n\n filename = file.name\n filepath = str(Path(file).resolve())\n output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n results_path = os.path.join(output_path, \"structuredData.json\")\n\n if not os.path.exists(results_path):\n logger.exception(\"Fail to parse the document.\")\n return []\n\n data = load_json(results_path)\n\n texts = defaultdict(list)\n tables = []\n figures = []\n\n elements = data[\"elements\"]\n for item_id, item in enumerate(elements):\n page_number = item.get(\"Page\", -1) + 1\n item_path = item[\"Path\"]\n item_text = item.get(\"Text\", \"\")\n\n file_paths = [\n Path(output_path) / path for path in item.get(\"filePaths\", [])\n ]\n prev_item = elements[item_id - 1]\n title = prev_item.get(\"Text\", \"\")\n\n if re.search(self.table_regex, item_path):\n table_content = parse_table_paths(file_paths)\n if not table_content:\n continue\n table_caption = (\n table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n + f\"\\n(Table in Page {page_number}. {title})\"\n )\n tables.append((page_number, table_content, table_caption))\n\n elif re.search(self.figure_regex, item_path):\n figure_caption = (\n item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n )\n figure_content = parse_figure_paths(file_paths)\n if not figure_content:\n continue\n figures.append([page_number, figure_content, figure_caption])\n\n else:\n if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n texts[page_number].append(item_text)\n\n # get figure caption using GPT-4V\n figure_captions = generate_figure_captions(\n self.vlm_endpoint,\n [item[1] for item in figures],\n self.max_figures_to_caption,\n )\n for item, caption in zip(figures, figure_captions):\n # update figure caption\n item[2] += \" \" + caption\n\n # Wrap elements with Document\n documents = []\n\n # join plain text elements\n for page_number, txts in texts.items():\n documents.append(\n Document(\n text=\"\\n\".join(txts),\n metadata={\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n )\n )\n\n # table elements\n for page_number, table_content, table_caption in tables:\n documents.append(\n Document(\n text=table_content,\n metadata={\n \"table_origin\": table_content,\n \"type\": \"table\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n # figure elements\n for page_number, figure_content, figure_caption in figures:\n documents.append(\n Document(\n text=figure_caption,\n metadata={\n \"image_origin\": figure_content,\n \"type\": \"image\",\n \"page_label\": page_number,\n \"file_name\": filename,\n \"file_path\": filepath,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n return documents\n
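Example: a minimal usage sketch mirroring the docstring example; it assumes Adobe PDF Services credentials are already configured for request_adobe_service, and the PDF path is a placeholder:
from pathlib import Path\nfrom kotaemon.loaders import AdobeReader\n\nreader = AdobeReader(max_figures_to_caption=10)\ndocs = reader.load_data(Path(\"annual_report.pdf\"))\nfigures = [d for d in docs if d.metadata.get(\"type\") == \"image\"]\n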
"},{"location":"reference/loaders/azureai_document_intelligence_loader/","title":"Azureai Document Intelligence Loader","text":""},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":" Bases: BaseReader
Utilize Azure AI Document Intelligence to parse documents
As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx and html.
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n \"\"\"Utilize Azure AI Document Intelligence to parse documents\n\n As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n heif, docx, xlsx, pptx and html.\n \"\"\"\n\n _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n endpoint: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n help=\"Endpoint of Azure AI Document Intelligence\",\n )\n credential: str = Param(\n os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n help=\"Credential of Azure AI Document Intelligence\",\n )\n model: str = Param(\n \"prebuilt-layout\",\n help=(\n \"Model to use for document analysis. Default is prebuilt-layout. \"\n \"As of April 2024, you can view the supported models [here]\"\n \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n \"#model-analysis-features)\"\n ),\n )\n output_content_format: str = Param(\n \"markdown\",\n help=\"Output content format. Can be 'markdown' or 'text'. Default is markdown\",\n )\n vlm_endpoint: str = Param(\n help=(\n \"Default VLM endpoint for figure captioning. If not provided, will not \"\n \"caption the figures\"\n )\n )\n figure_friendly_filetypes: list[str] = Param(\n [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n help=(\n \"File types that we can reliably open and extract figures. \"\n \"For files like .docx or .html, the visual layout may be different \"\n \"when viewed from different tools, hence we cannot use Azure DI \"\n \"location to extract figures.\"\n ),\n )\n cache_dir: str = Param(\n None,\n help=\"Directory to cache the downloaded files. Default is None\",\n )\n\n @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n def client_(self):\n try:\n from azure.ai.documentintelligence import DocumentIntelligenceClient\n from azure.core.credentials import AzureKeyCredential\n except ImportError:\n raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n return DocumentIntelligenceClient(\n self.endpoint, AzureKeyCredential(self.credential)\n )\n\n def run(\n self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = [polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
"},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Extract the input file, allowing multi-modal extraction
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n metadata = extra_info or {}\n file_name = Path(file_path)\n with open(file_path, \"rb\") as fi:\n poller = self.client_.begin_analyze_document(\n self.model,\n analyze_request=fi,\n content_type=\"application/octet-stream\",\n output_content_format=self.output_content_format,\n )\n result = poller.result()\n\n # the total text content of the document in `output_content_format` format\n text_content = result.content\n removed_spans: list[dict] = []\n\n # extract the figures\n figures = []\n for figure_desc in result.get(\"figures\", []):\n if not self.vlm_endpoint:\n continue\n if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n continue\n\n # read & crop the image\n page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n page_width = result.pages[page_number - 1][\"width\"]\n page_height = result.pages[page_number - 1][\"height\"]\n polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n xs = [polygon[i] for i in range(0, len(polygon), 2)]\n ys = [polygon[i] for i in range(1, len(polygon), 2)]\n bbox = [\n min(xs) / page_width,\n min(ys) / page_height,\n max(xs) / page_width,\n max(ys) / page_height,\n ]\n img = crop_image(file_path, bbox, page_number - 1)\n\n # convert the image into base64\n img_bytes = BytesIO()\n img.save(img_bytes, format=\"PNG\")\n img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n # caption the image\n caption = generate_single_figure_caption(\n figure=img_base64, vlm_endpoint=self.vlm_endpoint\n )\n\n # store the image into document\n figure_metadata = {\n \"image_origin\": img_base64,\n \"type\": \"image\",\n \"page_label\": page_number,\n }\n figure_metadata.update(metadata)\n\n figures.append(\n Document(\n text=caption,\n metadata=figure_metadata,\n )\n )\n removed_spans += figure_desc[\"spans\"]\n\n # extract the tables\n tables = []\n for table_desc in result.get(\"tables\", []):\n if not table_desc[\"spans\"]:\n continue\n\n # convert the tables into markdown format\n boundingRegions = table_desc[\"boundingRegions\"]\n if boundingRegions:\n page_number = boundingRegions[0][\"pageNumber\"]\n else:\n page_number = 1\n\n # store the tables into document\n offset = table_desc[\"spans\"][0][\"offset\"]\n length = table_desc[\"spans\"][0][\"length\"]\n table_metadata = {\n \"type\": \"table\",\n \"page_label\": page_number,\n \"table_origin\": text_content[offset : offset + length],\n }\n table_metadata.update(metadata)\n\n tables.append(\n Document(\n text=text_content[offset : offset + length],\n metadata=table_metadata,\n )\n )\n removed_spans += table_desc[\"spans\"]\n # save the text content into markdown format\n if self.cache_dir is not None:\n with open(\n Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n ) as f:\n f.write(text_content)\n\n removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n for span in removed_spans:\n text_content = (\n text_content[: span[\"offset\"]]\n + text_content[span[\"offset\"] + span[\"length\"] :]\n )\n\n return [Document(content=text_content, metadata=metadata)] + figures + tables\n
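Example: a minimal usage sketch (endpoint, key, and file path are placeholders; the import path is assumed, as is passing the params as constructor kwargs the way other kotaemon components accept them):
from pathlib import Path\nfrom kotaemon.loaders import AzureAIDocumentIntelligenceLoader # assumed export path\n\n# endpoint/credential can also come from the AZUREAI_DOCUMENT_INTELLIGENT_* env vars\nloader = AzureAIDocumentIntelligenceLoader(\n endpoint=\"https://<your-resource>.cognitiveservices.azure.com/\", # placeholder\n credential=\"<api-key>\", # placeholder\n output_content_format=\"markdown\",\n)\ndocs = loader.load_data(Path(\"invoice.pdf\"))\n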
"},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.crop_image","title":"crop_image","text":"crop_image(file_path, bbox, page_number=0)\n
Crop the image based on the bounding box
Parameters:
Name Type Description Default
file_path
Path
path to the image file
required
bbox
list[float]
bounding box of the image (in percentage [x0, y0, x1, y1])
required
page_number
int
page number of the image. Defaults to 0.
0
Returns:
Type Description
Image
Image.Image: cropped image
Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:\n \"\"\"Crop the image based on the bounding box\n\n Args:\n file_path (Path): path to the image file\n bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1])\n page_number (int, optional): page number of the image. Defaults to 0.\n\n Returns:\n Image.Image: cropped image\n \"\"\"\n left, upper, right, lower = bbox\n\n img: Image.Image\n suffix = file_path.suffix.lower()\n if suffix == \".pdf\":\n try:\n import fitz\n except ImportError:\n raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n doc = fitz.open(file_path)\n page = doc.load_page(page_number)\n pm = page.get_pixmap(dpi=150)\n img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n elif suffix in [\".tif\", \".tiff\"]:\n img = Image.open(file_path)\n img.seek(page_number)\n else:\n img = Image.open(file_path)\n\n return img.crop(\n (\n int(left * img.width),\n int(upper * img.height),\n int(right * img.width),\n int(lower * img.height),\n )\n )\n
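Example: a minimal usage sketch of this helper (the scan path is a placeholder; bbox is fractional [x0, y0, x1, y1]):
from pathlib import Path\nfrom kotaemon.loaders.azureai_document_intelligence_loader import crop_image\n\n# crop the top-left quadrant of the first page of a PDF (requires PyMuPDF)\nimg = crop_image(Path(\"scan.pdf\"), [0.0, 0.0, 0.5, 0.5], page_number=0)\nimg.save(\"figure.png\")\n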
"},{"location":"reference/loaders/base/","title":"Base","text":""},{"location":"reference/loaders/base/#loaders.base.BaseReader","title":"BaseReader","text":" Bases: BaseComponent
The base class for all readers
Source code in libs/kotaemon/kotaemon/loaders/base.py
class BaseReader(BaseComponent):\n \"\"\"The base class for all readers\"\"\"\n\n ...\n
"},{"location":"reference/loaders/base/#loaders.base.AutoReader","title":"AutoReader","text":" Bases: BaseReader
General auto reader for a variety of files. (based on llama-hub)
Source code in libs/kotaemon/kotaemon/loaders/base.py
class AutoReader(BaseReader):\n \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n if isinstance(reader_type, str):\n from llama_index.core import download_loader\n\n self._reader = download_loader(reader_type)()\n else:\n self._reader = reader_type()\n super().__init__()\n\n def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(file=file, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n return self.load_data(file=file, **kwargs)\n
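Example: a minimal usage sketch. The loader name \"UnstructuredReader\" is illustrative; any name accepted by llama-index's download_loader (or a reader class) can be passed instead, and the import path is assumed:
from kotaemon.loaders import AutoReader # assumed export path\n\nreader = AutoReader(\"UnstructuredReader\") # illustrative llama-hub loader name\ndocs = reader.load_data(file=\"notes.pdf\")\n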
"},{"location":"reference/loaders/base/#loaders.base.LIReaderMixin","title":"LIReaderMixin","text":" Bases: BaseComponent
Base wrapper around llama-index reader
To use the LIBaseReader, you need to implement the _get_wrapped_class method to return the relevant llama-index reader class that you want to wrap.
Example:
```python\nclass DirectoryReader(LIBaseReader):\n def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n from llama_index import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n```\n
Source code in libs/kotaemon/kotaemon/loaders/base.py
class LIReaderMixin(BaseComponent):\n \"\"\"Base wrapper around llama-index reader\n\n To use the LIBaseReader, you need to implement the _get_wrapped_class method to\n return the relevant llama-index reader class that you want to wrap.\n\n Example:\n\n ```python\n class DirectoryReader(LIBaseReader):\n def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n from llama_index import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n ```\n \"\"\"\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n raise NotImplementedError(\n \"Please return the relevant llama-index class in in _get_wrapped_class\"\n )\n\n def __init__(self, *args, **kwargs):\n self._reader_class = self._get_wrapped_class()\n self._reader = self._reader_class(*args, **kwargs)\n super().__init__()\n\n def __setattr__(self, name: str, value: Any) -> None:\n if name.startswith(\"_\"):\n return super().__setattr__(name, value)\n\n return setattr(self._reader, name, value)\n\n def __getattr__(self, name: str) -> Any:\n return getattr(self._reader, name)\n\n def load_data(self, *args, **kwargs: Any) -> List[Document]:\n documents = self._reader.load_data(*args, **kwargs)\n\n # convert Document to new base class from kotaemon\n converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n return converted_documents\n\n def run(self, *args, **kwargs: Any) -> List[Document]:\n return self.load_data(*args, **kwargs)\n
"},{"location":"reference/loaders/composite_loader/","title":"Composite Loader","text":""},{"location":"reference/loaders/composite_loader/#loaders.composite_loader.DirectoryReader","title":"DirectoryReader","text":" Bases: LIReaderMixin
, BaseReader
Wrap around llama-index SimpleDirectoryReader
Parameters:
input_dir (str): Path to the directory.
input_files (List): List of file paths to read (optional; overrides input_dir and exclude).
exclude (List): glob of python file paths to exclude (optional).
exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
encoding (str): Encoding of the files. Default is utf-8.
errors (str): how encoding and decoding errors are to be handled; see https://docs.python.org/3/library/functions.html#open
recursive (bool): Whether to recursively search in subdirectories. False by default.
filename_as_id (bool): Whether to use the filename as the document id. False by default.
required_exts (Optional[List[str]]): List of required extensions. Default is None.
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
num_files_limit (Optional[int]): Maximum number of files to read. Default is None.
file_metadata (Optional[Callable[str, Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.
Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
class DirectoryReader(LIReaderMixin, BaseReader):\n \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n Args:\n input_dir (str): Path to the directory.\n input_files (List): List of file paths to read\n (Optional; overrides input_dir, exclude)\n exclude (List): glob of python file paths to exclude (Optional)\n exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n encoding (str): Encoding of the files.\n Default is utf-8.\n errors (str): how encoding and decoding errors are to be handled,\n see https://docs.python.org/3/library/functions.html#open\n recursive (bool): Whether to recursively search in subdirectories.\n False by default.\n filename_as_id (bool): Whether to use the filename as the document id.\n False by default.\n required_exts (Optional[List[str]]): List of required extensions.\n Default is None.\n file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n extension to a BaseReader class that specifies how to convert that file\n to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n num_files_limit (Optional[int]): Maximum number of files to read.\n Default is None.\n file_metadata (Optional[Callable[str, Dict]]): A function that takes\n in a filename and returns a Dict of metadata for the Document.\n Default is None.\n \"\"\"\n\n input_dir: Optional[str] = None\n input_files: Optional[List] = None\n exclude: Optional[List] = None\n exclude_hidden: bool = True\n errors: str = \"ignore\"\n recursive: bool = False\n encoding: str = \"utf-8\"\n filename_as_id: bool = False\n required_exts: Optional[list[str]] = None\n file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n num_files_limit: Optional[int] = None\n file_metadata: Optional[Callable[[str], dict]] = None\n\n def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n from llama_index.core import SimpleDirectoryReader\n\n return SimpleDirectoryReader\n
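A usage sketch over a hypothetical directory, assuming DirectoryReader is re-exported from kotaemon.loaders:
```python\nfrom kotaemon.loaders import DirectoryReader\n\n# recursively read only .md and .txt files under ./docs\nreader = DirectoryReader(input_dir=\"docs\", recursive=True, required_exts=[\".md\", \".txt\"])\ndocuments = reader.run()\n```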
"},{"location":"reference/loaders/docx_loader/","title":"Docx Loader","text":""},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader","title":"DocxReader","text":" Bases: BaseReader
Read .docx files, with table support, using the python-docx library
Reader behavior: all paragraphs are extracted as one Document; each table is extracted as its own Document, rendered as a CSV string; the output is a list of Documents concatenating the above (tables + paragraphs).
Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
class DocxReader(BaseReader):\n \"\"\"Read Docx files that respect table, using python-docx library\n\n Reader behavior:\n - All paragraphs are extracted as a Document\n - Each table is extracted as a Document, rendered as a CSV string\n - The output is a list of Documents, concatenating the above\n (tables + paragraphs)\n \"\"\"\n\n def __init__(self, *args, **kwargs):\n try:\n import docx # noqa\n except ImportError:\n raise ImportError(\n \"docx is not installed. \"\n \"Please install it using `pip install python-docx`\"\n )\n\n def _load_single_table(self, table) -> List[List[str]]:\n \"\"\"Extract content from tables. Return a list of columns: list[str]\n Some merged cells will share duplicated content.\n \"\"\"\n n_row = len(table.rows)\n n_col = len(table.columns)\n\n arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n for i, row in enumerate(table.rows):\n for j, cell in enumerate(row.cells):\n arrays[j][i] = cell.text\n\n return arrays\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
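A usage sketch (hypothetical file; requires `pip install python-docx`), assuming DocxReader is re-exported from kotaemon.loaders:
```python\nfrom kotaemon.loaders import DocxReader\n\nreader = DocxReader()\ndocs = reader.load_data(\"meeting_notes.docx\")  # hypothetical file\n# table Documents carry metadata[\"type\"] == \"table\"; the rest is paragraph text\nfor doc in docs:\n    print(doc.metadata.get(\"type\", \"text\"), len(doc.text))\n```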
"},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Docx reader
Parameters:
file_path (Path): Path to the .docx file. Required.
Returns:
List[Document]: list of documents extracted from the .docx file
Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using Docx reader\n\n Args:\n file_path (Path): Path to .docx file\n\n Returns:\n List[Document]: list of documents extracted from the HTML file\n \"\"\"\n import docx\n\n file_path = Path(file_path).resolve()\n\n doc = docx.Document(str(file_path))\n all_text = \"\\n\".join(\n [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n )\n pages = [all_text] # 1 page only\n\n tables = []\n for t in doc.tables:\n # return list of columns: list of string\n arrays = self._load_single_table(t)\n\n tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=table.to_csv(\n index=False\n ).strip(), # strip_special_chars_markdown()\n metadata={\n \"table_origin\": table.to_csv(index=False),\n \"type\": \"table\",\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for table in tables # page_id\n ]\n\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text.strip(),\n metadata={\"page_label\": 1, **extra_info},\n )\n for _, non_table_text in enumerate(pages)\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/excel_loader/","title":"Excel Loader","text":"Pandas Excel reader.
Pandas parser for .xlsx files.
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader","title":"PandasExcelReader","text":" Bases: BaseReader
Pandas-based Excel parser.
Parses Excel files with the Pandas read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class PandasExcelReader(BaseReader):\n r\"\"\"Pandas-based CSV parser.\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
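A usage sketch with a hypothetical workbook; pandas_config is forwarded verbatim to pandas.read_excel, and the header option below is purely illustrative:
```python\nfrom kotaemon.loaders import PandasExcelReader\n\nreader = PandasExcelReader(pandas_config={\"header\": 0})\ndocs = reader.load_data(\"metrics.xlsx\", include_sheetname=True)  # hypothetical file\nprint(docs[0].text[:200])  # all sheets are joined into a single Document\n```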
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=False,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the Excel file and extract its cell values.
Parameters:
file (Path): The path to the Excel file to read. Required.
include_sheetname (bool): Whether to include the sheet name in the output. Default: False.
sheet_name (Union[str, int, None]): The specific sheet to read from. Default: None (reads all sheets).
Returns:
List[Document]: a list of Document objects containing the values from the sheets of the Excel file.
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = False,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n import itertools\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n df_sheets = []\n\n for key in sheet_names:\n sheet = []\n if include_sheetname:\n sheet.append([key])\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key].fillna(\"\", inplace=True)\n sheet.extend(dfs[key].values.astype(str).tolist())\n df_sheets.append(sheet)\n\n text_list = list(\n itertools.chain.from_iterable(df_sheets)\n ) # flatten list of lists\n\n output = [\n Document(\n text=self._row_joiner.join(\n self._col_joiner.join(sublist) for sublist in text_list\n ),\n metadata=extra_info or {},\n )\n ]\n\n return output\n
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader","title":"ExcelReader","text":" Bases: BaseReader
Spreadsheet reader respecting multiple worksheets
Parses Excel files with the Pandas read_excel function. If special parameters are required, use the pandas_config dict.
Args:
pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
class ExcelReader(BaseReader):\n r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n Parses CSVs using the separator detection from Pandas `read_csv` function.\n If special parameters are required, use the `pandas_config` dict.\n\n Args:\n\n pandas_config (dict): Options for the `pandas.read_excel` function call.\n Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n for more information. Set to empty dict by default,\n this means defaults will be used.\n\n \"\"\"\n\n def __init__(\n self,\n *args: Any,\n pandas_config: Optional[dict] = None,\n row_joiner: str = \"\\n\",\n col_joiner: str = \" \",\n **kwargs: Any,\n ) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args, **kwargs)\n self._pandas_config = pandas_config or {}\n self._row_joiner = row_joiner if row_joiner else \"\\n\"\n self._col_joiner = col_joiner if col_joiner else \" \"\n\n def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n ) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
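A usage sketch reading one worksheet of a hypothetical workbook; this reader emits one Document per sheet, with sheet_name and page_label recorded in the metadata:
```python\nfrom kotaemon.loaders import ExcelReader\n\nreader = ExcelReader()\ndocs = reader.load_data(\"metrics.xlsx\", sheet_name=\"Q1\")  # hypothetical file and sheet\nfor doc in docs:\n    print(doc.metadata[\"sheet_name\"], doc.metadata[\"page_label\"])\n```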
"},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader.load_data","title":"load_data","text":"load_data(\n file,\n include_sheetname=True,\n sheet_name=None,\n extra_info=None,\n **kwargs\n)\n
Parse the Excel file and extract its cell values.
Parameters:
file (Path): The path to the Excel file to read. Required.
include_sheetname (bool): Whether to include the sheet name in the output. Default: True.
sheet_name (Union[str, int, None]): The specific sheet to read from. Default: None (reads all sheets).
Returns:
List[Document]: a list of Document objects, one per worksheet, containing the values from the Excel file.
Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n self,\n file: Path,\n include_sheetname: bool = True,\n sheet_name: Optional[Union[str, int, list]] = None,\n extra_info: Optional[dict] = None,\n **kwargs,\n) -> List[Document]:\n \"\"\"Parse file and extract values from a specific column.\n\n Args:\n file (Path): The path to the Excel file to read.\n include_sheetname (bool): Whether to include the sheet name in the output.\n sheet_name (Union[str, int, None]): The specific sheet to read from,\n default is None which reads all sheets.\n\n Returns:\n List[Document]: A list of`Document objects containing the\n values from the specified column in the Excel file.\n \"\"\"\n\n try:\n import pandas as pd\n except ImportError:\n raise ImportError(\n \"install pandas using `pip3 install pandas` to use this loader\"\n )\n\n if sheet_name is not None:\n sheet_name = (\n [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n )\n\n # clean up input\n file = Path(file)\n extra_info = extra_info or {}\n\n dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n sheet_names = dfs.keys()\n output = []\n\n for idx, key in enumerate(sheet_names):\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n dfs[key] = dfs[key].astype(\"object\")\n dfs[key].fillna(\"\", inplace=True)\n\n rows = dfs[key].values.astype(str).tolist()\n content = self._row_joiner.join(\n self._col_joiner.join(row).strip() for row in rows\n ).strip()\n if include_sheetname:\n content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n output.append(Document(text=content, metadata=metadata))\n\n return output\n
"},{"location":"reference/loaders/html_loader/","title":"Html Loader","text":""},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader","title":"HtmlReader","text":" Bases: BaseReader
Read HTML using html2text
Reader behavior: HTML is read with html2text; all of the text is split by page_break_pattern; each page is extracted as a Document; the output is a list of Documents.
Parameters:
page_break_pattern (str): Pattern to split the HTML into pages. Default: None.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class HtmlReader(BaseReader):\n \"\"\"Reader HTML usimg html2text\n\n Reader behavior:\n - HTML is read with html2text.\n - All of the texts will be split by `page_break_pattern`\n - Each page is extracted as a Document\n - The output is a list of Documents\n\n Args:\n page_break_pattern (str): Pattern to split the HTML into pages\n \"\"\"\n\n def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n try:\n import html2text # noqa\n except ImportError:\n raise ImportError(\n \"html2text is not installed. \"\n \"Please install it using `pip install html2text`\"\n )\n\n self._page_break_pattern: Optional[str] = page_break_pattern\n super().__init__()\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
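A usage sketch (hypothetical file and marker; requires `pip install html2text`). Since the split happens after conversion, the marker must survive html2text to take effect:
```python\nfrom kotaemon.loaders import HtmlReader\n\nreader = HtmlReader(page_break_pattern=\"PAGE_BREAK\")  # hypothetical marker\ndocs = reader.load_data(\"manual.html\")\nprint(len(docs), docs[0].metadata[\"page_label\"])\n```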
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using Html reader
Parameters:
file_path (Path | str): path to the HTML file. Required.
extra_info (Optional[dict]): extra information passed to this reader during extracting data. Default: None.
Returns:
list[Document]: list of documents extracted from the HTML file
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load data using Html reader\n\n Args:\n file_path: path to HTML file\n extra_info: extra information passed to this reader during extracting data\n\n Returns:\n list[Document]: list of documents extracted from the HTML file\n \"\"\"\n import html2text\n\n file_path = Path(file_path).resolve()\n\n with file_path.open(\"r\") as f:\n html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n # read HTML\n all_text = html2text.html2text(html_text)\n pages = (\n all_text.split(self._page_break_pattern)\n if self._page_break_pattern\n else [all_text]\n )\n\n extra_info = extra_info or {}\n\n # create Document from non-table text\n documents = [\n Document(\n text=page.strip(),\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, page in enumerate(pages)\n ]\n\n return documents\n
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader","title":"MhtmlReader","text":" Bases: BaseReader
Parse MHTML files with BeautifulSoup.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class MhtmlReader(BaseReader):\n \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n def __init__(\n self,\n cache_dir: Optional[str] = getattr(\n flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n ),\n open_encoding: Optional[str] = None,\n bs_kwargs: Optional[dict] = None,\n get_text_separator: str = \"\",\n ) -> None:\n \"\"\"initialize with path, and optionally, file encoding to use, and any kwargs\n to pass to the BeautifulSoup object.\n\n Args:\n cache_dir: Path for markdwon format.\n file_path: Path to file to load.\n open_encoding: The encoding to use when opening the file.\n bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n get_text_separator: The separator to use when getting the text\n from the soup.\n \"\"\"\n try:\n import bs4 # noqa:F401\n except ImportError:\n raise ImportError(\n \"beautifulsoup4 package not found, please install it with \"\n \"`pip install beautifulsoup4`\"\n )\n\n self.cache_dir = cache_dir\n self.open_encoding = open_encoding\n if bs_kwargs is None:\n bs_kwargs = {\"features\": \"lxml\"}\n self.bs_kwargs = bs_kwargs\n self.get_text_separator = get_text_separator\n\n def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n ) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
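A usage sketch (hypothetical paths; requires beautifulsoup4 and, with the default bs_kwargs, the lxml parser):
```python\nfrom kotaemon.loaders import MhtmlReader\n\nreader = MhtmlReader(cache_dir=\"/tmp/markdown_cache\")  # hypothetical cache dir\ndocs = reader.load_data(\"saved_page.mhtml\")\nprint(docs[0].metadata[\"title\"], docs[0].metadata[\"source\"])\n```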
"},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load MHTML document into document objects.
Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
def load_data(\n self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n \"\"\"Load MHTML document into document objects.\"\"\"\n\n from bs4 import BeautifulSoup\n\n extra_info = extra_info or {}\n metadata: dict = extra_info\n page = []\n file_name = Path(file_path)\n with open(file_path, \"r\", encoding=self.open_encoding) as f:\n message = email.message_from_string(f.read())\n parts = message.get_payload()\n\n if not isinstance(parts, list):\n parts = [message]\n\n for part in parts:\n if part.get_content_type() == \"text/html\":\n html = part.get_payload(decode=True).decode()\n\n soup = BeautifulSoup(html, **self.bs_kwargs)\n text = soup.get_text(self.get_text_separator)\n\n if soup.title:\n title = str(soup.title.string)\n else:\n title = \"\"\n\n metadata = {\n \"source\": str(file_path),\n \"title\": title,\n **extra_info,\n }\n lines = [line for line in text.split(\"\\n\") if line.strip()]\n text = \"\\n\\n\".join(lines)\n if text:\n page.append(text)\n # save the page into markdown format\n print(self.cache_dir)\n if self.cache_dir is not None:\n print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n f.write(page[0])\n\n return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
"},{"location":"reference/loaders/mathpix_loader/","title":"Mathpix Loader","text":""},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader","title":"MathpixPDFReader","text":" Bases: BaseReader
Load PDF files using the Mathpix service.
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n def __init__(\n self,\n processed_file_format: str = \"md\",\n max_wait_time_seconds: int = 500,\n should_clean_pdf: bool = True,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize with a file path.\n\n Args:\n processed_file_format: a format of the processed file. Default is \"mmd\".\n max_wait_time_seconds: a maximum time to wait for the response from\n the server. Default is 500.\n should_clean_pdf: a flag to clean the PDF file. Default is False.\n **kwargs: additional keyword arguments.\n \"\"\"\n self.mathpix_api_key = get_from_dict_or_env(\n kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n )\n self.mathpix_api_id = get_from_dict_or_env(\n kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n )\n self.processed_file_format = processed_file_format\n self.max_wait_time_seconds = max_wait_time_seconds\n self.should_clean_pdf = should_clean_pdf\n super().__init__()\n\n @property\n def _mathpix_headers(self) -> Dict[str, str]:\n return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n @property\n def url(self) -> str:\n return \"https://api.mathpix.com/v3/pdf\"\n\n @property\n def data(self) -> dict:\n options = {\n \"conversion_formats\": {self.processed_file_format: True},\n \"enable_tables_fallback\": True,\n }\n return {\"options_json\": json.dumps(options)}\n\n def send_pdf(self, file_path) -> str:\n with open(file_path, \"rb\") as f:\n files = {\"file\": f}\n response = requests.post(\n self.url, headers=self._mathpix_headers, files=files, data=self.data\n )\n response_data = response.json()\n if \"pdf_id\" in response_data:\n pdf_id = response_data[\"pdf_id\"]\n return pdf_id\n else:\n raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n\n def get_processed_pdf(self, pdf_id: str) -> str:\n self.wait_for_processing(pdf_id)\n url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n response = requests.get(url, headers=self._mathpix_headers)\n return response.content.decode(\"utf-8\")\n\n def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n if \"response_content\" in kwargs:\n # overriding response content if specified\n content = kwargs[\"response_content\"]\n else:\n # call original API\n pdf_id = self.send_pdf(file_path)\n content = self.get_processed_pdf(pdf_id)\n\n if self.should_clean_pdf:\n content = self.clean_pdf(content)\n tables, texts = parse_markdown_text_to_tables(content)\n documents = []\n for table in tables:\n text = strip_special_chars_markdown(table)\n metadata = {\n \"table_origin\": table,\n \"type\": \"table\",\n }\n if extra_info:\n metadata.update(extra_info)\n documents.append(\n Document(\n text=text,\n metadata=metadata,\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n )\n\n for text in texts:\n metadata = {\"source\": file_path.name, \"type\": \"text\"}\n documents.append(Document(text=text, metadata=metadata))\n\n return documents\n
"},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"wait_for_processing(pdf_id)\n
Wait for processing to complete.
Parameters:
pdf_id (str): a PDF id. Required.
Returns: None
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def wait_for_processing(self, pdf_id: str) -> None:\n \"\"\"Wait for processing to complete.\n\n Args:\n pdf_id: a PDF id.\n\n Returns: None\n \"\"\"\n url = self.url + \"/\" + pdf_id\n for _ in range(0, self.max_wait_time_seconds, 5):\n response = requests.get(url, headers=self._mathpix_headers)\n response_data = response.json()\n status = response_data.get(\"status\", None)\n\n if status == \"completed\":\n return\n elif status == \"error\":\n raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n else:\n print(response_data)\n print(url)\n time.sleep(5)\n raise TimeoutError\n
"},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"clean_pdf(contents)\n
Clean the text contents extracted from the PDF.
Parameters:
contents (str): the PDF file contents. Required.
Returns: the cleaned contents.
Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def clean_pdf(self, contents: str) -> str:\n \"\"\"Clean the PDF file.\n\n Args:\n contents: a PDF file contents.\n\n Returns:\n\n \"\"\"\n contents = \"\\n\".join(\n [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n )\n # replace \\section{Title} with # Title\n contents = contents.replace(\"\\\\section{\", \"# \")\n # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n # http:// or https:// followed by anything but a closing paren\n url_regex = \"http[s]?://[^)]+\"\n markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n contents = (\n contents.replace(r\"\\$\", \"$\")\n .replace(r\"\\%\", \"%\")\n .replace(r\"\\(\", \"(\")\n .replace(r\"\\)\", \")\")\n .replace(\"$\\\\begin{array}\", \"\")\n .replace(\"\\\\end{array}$\", \"\")\n .replace(\"\\\\\\\\\", \"\")\n .replace(\"\\\\text\", \"\")\n .replace(\"}\", \"\")\n .replace(\"{\", \"\")\n .replace(\"\\\\mathrm\", \"\")\n )\n contents = re.sub(markup_regex, \"\", contents)\n return contents\n
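A small worked example of the cleaning rules above: image-only lines are dropped, \section{...} becomes a # heading, and Mathpix escape characters are removed:
```python\nreader = MathpixPDFReader()\nraw = \"\\n\".join(\n    [\n        \"![](images/fig1.png)\",  # image-only line: dropped\n        \"\\\\section{Results}\",  # becomes \"# Results\"\n        \"Accuracy was 95\\\\%.\",  # becomes \"Accuracy was 95%.\"\n    ]\n)\nprint(reader.clean_pdf(raw))\n```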
"},{"location":"reference/loaders/ocr_loader/","title":"Ocr Loader","text":""},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader","title":"OCRReader","text":" Bases: BaseReader
Read PDF using OCR, with a high focus on table extraction
Example>> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n
Parameters:
endpoint (Optional[str]): URL to the FullOCR endpoint. If not provided, will look for the environment variable OCR_READER_ENDPOINT or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/). Default: None.
use_ocr: whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. Default: True.
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class OCRReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from kotaemon.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n self.use_ocr = use_ocr\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
file_path (Path): Path to the PDF file. Required.
debug_path (Path): Path to store debug image output.
artifact_path (Path): Path to the OCR endpoint's artifacts directory.
Returns:
List[Document]: list of documents extracted from the PDF file
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n )\n ocr_results = resp.json()[\"result\"]\n\n debug_path = kwargs.pop(\"debug_path\", None)\n artifact_path = kwargs.pop(\"artifact_path\", None)\n\n # read PDF through normal reader (unstructured)\n pdf_page_items = read_pdf_unstructured(file_path)\n # merge PDF text output with OCR output\n tables, texts = parse_ocr_output(\n ocr_results,\n pdf_page_items,\n debug_path=debug_path,\n artifact_path=artifact_path,\n )\n extra_info = extra_info or {}\n\n # create output Document with metadata from table\n documents = [\n Document(\n text=strip_special_chars_markdown(table_text),\n metadata={\n \"table_origin\": table_text,\n \"type\": \"table\",\n \"page_label\": page_id + 1,\n **extra_info,\n },\n metadata_template=\"\",\n metadata_seperator=\"\",\n )\n for page_id, table_text in tables\n ]\n # create Document from non-table text\n documents.extend(\n [\n Document(\n text=non_table_text,\n metadata={\"page_label\": page_id + 1, **extra_info},\n )\n for page_id, non_table_text in texts\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader","title":"ImageReader","text":" Bases: BaseReader
Read images using OCR, with a high focus on table extraction
Example:
>> from kotaemon.loaders import ImageReader\n>> reader = ImageReader()\n>> documents = reader.load_data(\"path/to/image\")\n
Parameters:
endpoint (Optional[str]): URL to the FullOCR endpoint. If not provided, will look for the environment variable OCR_READER_ENDPOINT or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/). Default: None.
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
class ImageReader(BaseReader):\n \"\"\"Read PDF using OCR, with high focus on table extraction\n\n Example:\n ```python\n >> from knowledgehub.loaders import OCRReader\n >> reader = OCRReader()\n >> documents = reader.load_data(\"path/to/pdf\")\n ```\n\n Args:\n endpoint: URL to FullOCR endpoint. If not provided, will look for\n environment variable `OCR_READER_ENDPOINT` or use the default\n `knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n (http://127.0.0.1:8000/v2/ai/infer/)\n use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n If False, only the table and text within table cells will be extracted.\n \"\"\"\n\n def __init__(self, endpoint: Optional[str] = None):\n \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n super().__init__()\n self.ocr_endpoint = endpoint or os.getenv(\n \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n )\n\n def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n ) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader.load_data","title":"load_data","text":"load_data(file_path, extra_info=None, **kwargs)\n
Load data using OCR reader
Parameters:
file_path (Path): Path to the PDF file. Required.
debug_path (Path): Path to store debug image output.
artifact_path (Path): Path to the OCR endpoint's artifacts directory.
Returns:
List[Document]: list of documents extracted from the PDF file
Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
def load_data(\n self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n \"\"\"Load data using OCR reader\n\n Args:\n file_path (Path): Path to PDF file\n debug_path (Path): Path to store debug image output\n artifact_path (Path): Path to OCR endpoints artifacts directory\n\n Returns:\n List[Document]: list of documents extracted from the PDF file\n \"\"\"\n file_path = Path(file_path).resolve()\n\n # call the API from FullOCR endpoint\n if \"response_content\" in kwargs:\n # overriding response content if specified\n ocr_results = kwargs[\"response_content\"]\n else:\n # call original API\n resp = tenacious_api_post(\n url=self.ocr_endpoint, file_path=file_path, table_only=False\n )\n ocr_results = resp.json()[\"result\"]\n\n extra_info = extra_info or {}\n result = []\n for ocr_result in ocr_results:\n result.append(\n Document(\n content=ocr_result[\"csv_string\"],\n metadata=extra_info,\n )\n )\n\n return result\n
"},{"location":"reference/loaders/pdf_loader/","title":"Pdf Loader","text":""},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader","title":"PDFThumbnailReader","text":" Bases: PDFReader
PDF parser with thumbnail for each page.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
class PDFThumbnailReader(PDFReader):\n \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n def __init__(self) -> None:\n \"\"\"\n Initialize PDFReader.\n \"\"\"\n super().__init__(return_full_document=False)\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n ) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader.load_data","title":"load_data","text":"load_data(file, extra_info=None, fs=None)\n
Parse file.
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n \"\"\"Parse file.\"\"\"\n documents = super().load_data(file, extra_info, fs)\n\n page_numbers_str = []\n filtered_docs = []\n is_int_page_number: dict[str, bool] = {}\n\n for doc in documents:\n if \"page_label\" in doc.metadata:\n page_num_str = doc.metadata[\"page_label\"]\n page_numbers_str.append(page_num_str)\n try:\n _ = int(page_num_str)\n is_int_page_number[page_num_str] = True\n filtered_docs.append(doc)\n except ValueError:\n is_int_page_number[page_num_str] = False\n continue\n\n documents = filtered_docs\n page_numbers = list(range(len(page_numbers_str)))\n\n print(\"Page numbers:\", len(page_numbers))\n page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n documents.extend(\n [\n Document(\n text=\"Page thumbnail\",\n metadata={\n \"image_origin\": page_thumbnail,\n \"type\": \"thumbnail\",\n \"page_label\": page_number,\n **(extra_info if extra_info is not None else {}),\n },\n )\n for (page_thumbnail, page_number) in zip(\n page_thumbnails, page_numbers_str\n )\n if is_int_page_number[page_number]\n ]\n )\n\n return documents\n
"},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.get_page_thumbnails","title":"get_page_thumbnails","text":"get_page_thumbnails(file_path, pages, dpi=80)\n
Get image thumbnails of the pages in the PDF file.
Parameters:
file_path (Path): path to the PDF file. Required.
pages (list[int]): list of page numbers to extract. Required.
Returns:
list: list of page thumbnails, returned base64-encoded (see the listing below)
Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
def get_page_thumbnails(\n file_path: Path, pages: list[int], dpi: int = 80\n) -> List[Image.Image]:\n \"\"\"Get image thumbnails of the pages in the PDF file.\n\n Args:\n file_path (Path): path to the image file\n page_number (list[int]): list of page numbers to extract\n\n Returns:\n list[Image.Image]: list of page thumbnails\n \"\"\"\n\n img: Image.Image\n suffix = file_path.suffix.lower()\n assert suffix == \".pdf\", \"This function only supports PDF files.\"\n try:\n import fitz\n except ImportError:\n raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n doc = fitz.open(file_path)\n\n output_imgs = []\n for page_number in pages:\n page = doc.load_page(page_number)\n pm = page.get_pixmap(dpi=dpi)\n img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n output_imgs.append(convert_image_to_base64(img))\n\n return output_imgs\n
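A usage sketch (hypothetical file). Note from the listing that each thumbnail is returned base64-encoded via convert_image_to_base64, despite the annotated Image.Image return type:
```python\nfrom pathlib import Path\n\nfrom kotaemon.loaders.pdf_loader import get_page_thumbnails\n\n# render the first two pages of a hypothetical PDF at the default 80 dpi\nthumbnails = get_page_thumbnails(Path(\"report.pdf\"), pages=[0, 1])\nprint(len(thumbnails))\n```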
"},{"location":"reference/loaders/txt_loader/","title":"Txt Loader","text":""},{"location":"reference/loaders/unstructured_loader/","title":"Unstructured Loader","text":"Unstructured file reader.
A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.
To use the .doc and .xls parsers, install the extra dependencies:
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
pip install xlrd
"},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader","title":"UnstructuredReader","text":" Bases: BaseReader
General unstructured text reader for a variety of files.
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n def __init__(self, *args: Any, **kwargs: Any) -> None:\n \"\"\"Init params.\"\"\"\n super().__init__(*args) # not passing kwargs to parent bc it cannot accept it\n\n self.api = False # we default to local\n if \"url\" in kwargs:\n self.server_url = str(kwargs[\"url\"])\n self.api = True # is url was set, switch to api\n else:\n self.server_url = \"http://localhost:8000\"\n\n if \"api\" in kwargs:\n self.api = kwargs[\"api\"]\n\n self.api_key = \"\"\n if \"api_key\" in kwargs:\n self.api_key = kwargs[\"api_key\"]\n\n \"\"\" Loads data using Unstructured.io\n\n Depending on the construction if url is set or api = True\n it'll parse file using API call, else parse it locally\n additional_metadata is extended by the returned metadata if\n split_documents is True\n\n Returns list of documents\n \"\"\"\n\n def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n ) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
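A usage sketch (hypothetical file). Parsing is local by default; passing url or api=True routes the file through an Unstructured API server instead:
```python\nfrom kotaemon.loaders import UnstructuredReader\n\nreader = UnstructuredReader()\n# one Document per element when split_documents=True\ndocs = reader.load_data(\"slides.pptx\", split_documents=True)\nprint(len(docs), docs[0].metadata[\"file_name\"])\n```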
"},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader.load_data","title":"load_data","text":"load_data(\n file, extra_info=None, split_documents=False, **kwargs\n)\n
If api is set, parse through the API; otherwise parse the file locally
Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
def load_data(\n self,\n file: Path,\n extra_info: Optional[Dict] = None,\n split_documents: Optional[bool] = False,\n **kwargs,\n) -> List[Document]:\n \"\"\"If api is set, parse through api\"\"\"\n file_path_str = str(file)\n if self.api:\n from unstructured.partition.api import partition_via_api\n\n elements = partition_via_api(\n filename=file_path_str,\n api_key=self.api_key,\n api_url=self.server_url + \"/general/v0/general\",\n )\n else:\n \"\"\"Parse file locally\"\"\"\n from unstructured.partition.auto import partition\n\n elements = partition(filename=file_path_str)\n\n \"\"\" Process elements \"\"\"\n docs = []\n file_name = Path(file).name\n file_path = str(Path(file).resolve())\n if split_documents:\n for node in elements:\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n if hasattr(node, \"metadata\"):\n \"\"\"Load metadata fields\"\"\"\n for field, val in vars(node.metadata).items():\n if field == \"_known_field_names\":\n continue\n # removing coordinates because it does not serialize\n # and dont want to bother with it\n if field == \"coordinates\":\n continue\n # removing bc it might cause interference\n if field == \"parent_id\":\n continue\n metadata[field] = val\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n metadata[\"file_name\"] = file_name\n docs.append(Document(text=node.text, metadata=metadata))\n\n else:\n text_chunks = [\" \".join(str(el).split()) for el in elements]\n metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n if extra_info is not None:\n metadata.update(extra_info)\n\n # Create a single document by joining all the texts\n docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n return docs\n
"},{"location":"reference/loaders/utils/","title":"Utils","text":""},{"location":"reference/loaders/utils/adobe/","title":"Adobe","text":""},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.request_adobe_service","title":"request_adobe_service","text":"request_adobe_service(file_path, output_path='')\n
Main function to call the Adobe service and unzip the results.
Parameters:
file_path (str): path to the PDF file
output_path (str): path to store the results
Returns:
output_path (str): path to the results
Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
def request_adobe_service(file_path: str, output_path: str = \"\") -> str:\n \"\"\"Main function to call the adobe service, and unzip the results.\n Args:\n file_path (str): path to the pdf file\n output_path (str): path to store the results\n\n Returns:\n output_path (str): path to the results\n\n \"\"\"\n try:\n from adobe.pdfservices.operation.auth.credentials import Credentials\n from adobe.pdfservices.operation.exception.exceptions import (\n SdkException,\n ServiceApiException,\n ServiceUsageException,\n )\n from adobe.pdfservices.operation.execution_context import ExecutionContext\n from adobe.pdfservices.operation.io.file_ref import FileRef\n from adobe.pdfservices.operation.pdfops.extract_pdf_operation import (\n ExtractPDFOperation,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ( # noqa: E501\n ExtractElementType,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ( # noqa: E501\n ExtractPDFOptions,\n )\n from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ( # noqa: E501\n ExtractRenditionsElementType,\n )\n except ImportError:\n raise ImportError(\n \"pdfservices-sdk is not installed. \"\n \"Please install it by running `pip install pdfservices-sdk\"\n \"@git+https://github.com/niallcm/pdfservices-python-sdk.git\"\n \"@bump-and-unfreeze-requirements`\"\n )\n\n if not output_path:\n output_path = tempfile.mkdtemp()\n\n try:\n # Initial setup, create credentials instance.\n credentials = (\n Credentials.service_principal_credentials_builder()\n .with_client_id(config(\"PDF_SERVICES_CLIENT_ID\", default=\"\"))\n .with_client_secret(config(\"PDF_SERVICES_CLIENT_SECRET\", default=\"\"))\n .build()\n )\n\n # Create an ExecutionContext using credentials\n # and create a new operation instance.\n execution_context = ExecutionContext.create(credentials)\n extract_pdf_operation = ExtractPDFOperation.create_new()\n\n # Set operation input from a source file.\n source = FileRef.create_from_local_file(file_path)\n extract_pdf_operation.set_input(source)\n\n # Build ExtractPDF options and set them into the operation\n extract_pdf_options: ExtractPDFOptions = (\n ExtractPDFOptions.builder()\n .with_elements_to_extract(\n [ExtractElementType.TEXT, ExtractElementType.TABLES]\n )\n .with_elements_to_extract_renditions(\n [\n ExtractRenditionsElementType.TABLES,\n ExtractRenditionsElementType.FIGURES,\n ]\n )\n .build()\n )\n extract_pdf_operation.set_options(extract_pdf_options)\n\n # Execute the operation.\n result: FileRef = extract_pdf_operation.execute(execution_context)\n\n # Save the result to the specified location.\n zip_file_path = os.path.join(\n output_path, \"ExtractTextTableWithFigureTableRendition.zip\"\n )\n result.save_as(zip_file_path)\n # Open the ZIP file\n with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n # Extract all contents to the destination folder\n zip_ref.extractall(output_path)\n except (ServiceApiException, ServiceUsageException, SdkException):\n logging.exception(\"Exception encountered while executing operation\")\n\n return output_path\n
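A usage sketch with placeholder credentials; PDF_SERVICES_CLIENT_ID and PDF_SERVICES_CLIENT_SECRET are read via config(), e.g. from the environment or a .env file:
```python\nimport os\n\nfrom kotaemon.loaders.utils.adobe import request_adobe_service\n\nos.environ[\"PDF_SERVICES_CLIENT_ID\"] = \"...\"  # placeholder\nos.environ[\"PDF_SERVICES_CLIENT_SECRET\"] = \"...\"  # placeholder\n\nout_dir = request_adobe_service(\"contract.pdf\")  # hypothetical file\nprint(\"results extracted to\", out_dir)\n```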
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.make_markdown_table","title":"make_markdown_table","text":"make_markdown_table(table_as_list)\n
Convert a table from Python list representation to markdown format. The input list consists of the table's rows; the first row is the header.
Parameters:
Name Type Description Defaulttable_as_list
List[str]
list of table rows Example: [[\"Name\", \"Age\", \"Height\"], [\"Jake\", 20, 5'10], [\"Mary\", 21, 5'7]]
requiredReturns: markdown representation of the table
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def make_markdown_table(table_as_list: List[str]) -> str:\n \"\"\"\n Convert table from python list representation to markdown format.\n The input list consists of rows of tables, the first row is the header.\n\n Args:\n table_as_list: list of table rows\n Example: [[\"Name\", \"Age\", \"Height\"],\n [\"Jake\", 20, 5'10],\n [\"Mary\", 21, 5'7]]\n Returns:\n markdown representation of the table\n \"\"\"\n markdown = \"\\n\" + str(\"| \")\n\n for e in table_as_list[0]:\n to_add = \" \" + str(e) + str(\" |\")\n markdown += to_add\n markdown += \"\\n\"\n\n markdown += \"| \"\n for i in range(len(table_as_list[0])):\n markdown += str(\"--- | \")\n markdown += \"\\n\"\n\n for entry in table_as_list[1:]:\n markdown += str(\"| \")\n for e in entry:\n to_add = str(e) + str(\" | \")\n markdown += to_add\n markdown += \"\\n\"\n\n return markdown + \"\\n\"\n
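A quick sketch of the input/output shape (header row first, body rows after):
from kotaemon.loaders.utils.adobe import make_markdown_table\n\nrows = [[\"Name\", \"Age\"], [\"Jake\", 20], [\"Mary\", 21]]\nprint(make_markdown_table(rows))\n# | Name | Age |\n# | --- | --- |\n# | Jake | 20 |\n# | Mary | 21 |\n# (modulo the exact whitespace around cell separators)\n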
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_json","title":"load_json","text":"load_json(input_path)\n
Load json file
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def load_json(input_path: Union[str | Path]) -> dict:\n \"\"\"Load json file\"\"\"\n with open(input_path, \"r\") as fi:\n data = json.load(fi)\n\n return data\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_excel","title":"load_excel","text":"load_excel(input_path)\n
Load excel file and convert to markdown
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def load_excel(input_path: Union[str | Path]) -> str:\n \"\"\"Load excel file and convert to markdown\"\"\"\n\n df = pd.read_excel(input_path).fillna(\"\")\n # Convert dataframe to a list of rows\n row_list = [df.columns.values.tolist()] + df.values.tolist()\n\n for item_id, item in enumerate(row_list[0]):\n if \"Unnamed\" in item:\n row_list[0][item_id] = \"\"\n\n for row in row_list:\n for item_id, item in enumerate(row):\n row[item_id] = str(item).replace(\"_x000D_\", \" \").replace(\"\\n\", \" \").strip()\n\n markdown_str = make_markdown_table(row_list)\n return markdown_str\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.encode_image_base64","title":"encode_image_base64","text":"encode_image_base64(image_path)\n
Convert image to base64
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def encode_image_base64(image_path: Union[str | Path]) -> Union[bytes, str]:\n \"\"\"Convert image to base64\"\"\"\n\n with open(image_path, \"rb\") as image_file:\n return base64.b64encode(image_file.read()).decode(\"utf-8\")\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_table_paths","title":"parse_table_paths","text":"parse_table_paths(file_paths)\n
Read the table stored in an excel file given the file path
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def parse_table_paths(file_paths: List[Path]) -> str:\n \"\"\"Read the table stored in an excel file given the file path\"\"\"\n\n content = \"\"\n for path in file_paths:\n if path.suffix == \".xlsx\":\n content = load_excel(path)\n break\n return content\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_figure_paths","title":"parse_figure_paths","text":"parse_figure_paths(file_paths)\n
Read and convert an image to base64 given the image path
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:\n \"\"\"Read and convert an image to base64 given the image path\"\"\"\n\n content = \"\"\n for path in file_paths:\n if path.suffix == \".png\":\n base64_image = encode_image_base64(path)\n content = f\"data:image/png;base64,{base64_image}\" # type: ignore\n break\n return content\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_single_figure_caption","title":"generate_single_figure_caption","text":"generate_single_figure_caption(vlm_endpoint, figure)\n
Summarize a single figure using GPT-4V
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:\n \"\"\"Summarize a single figure using GPT-4V\"\"\"\n if figure:\n output = generate_gpt4v(\n endpoint=vlm_endpoint,\n prompt=\"Provide a short 2 sentence summary of this image?\",\n images=figure,\n )\n if \"sorry\" in output.lower():\n output = \"\"\n else:\n output = \"\"\n return output\n
"},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_figure_captions","title":"generate_figure_captions","text":"generate_figure_captions(\n vlm_endpoint, figures, max_figures_to_process\n)\n
Summarize several figures using GPT-4V. Args: vlm_endpoint (str): endpoint of the vision language model service. figures (List): list of base64 images. max_figures_to_process (int): the maximum number of figures that will be summarized; the rest are ignored.
Returns:
Name Type Descriptionresults
List[str]
list of all figure captions, with empty strings for the ignored figures.
Source code inlibs/kotaemon/kotaemon/loaders/utils/adobe.py
def generate_figure_captions(\n vlm_endpoint: str, figures: List, max_figures_to_process: int\n) -> List:\n \"\"\"Summarize several figures using GPT-4V.\n Args:\n vlm_endpoint (str): endpoint to the vision language model service\n figures (List): list of base64 images\n max_figures_to_process (int): the maximum number of figures will be summarized,\n the rest are ignored.\n\n Returns:\n results (List[str]): list of all figure captions and empty strings for\n ignored figures.\n \"\"\"\n to_gen_figures = figures[:max_figures_to_process]\n other_figures = figures[max_figures_to_process:]\n\n with ThreadPoolExecutor() as executor:\n futures = [\n executor.submit(\n lambda: generate_single_figure_caption(vlm_endpoint, figure)\n )\n for figure in to_gen_figures\n ]\n\n results = [future.result() for future in futures]\n return results + [\"\"] * len(other_figures)\n
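A hedged sketch of how the batching behaves (the endpoint URL and figure variables are placeholders):
captions = generate_figure_captions(\n    vlm_endpoint=\"https://<your-vlm-endpoint>\",  # placeholder URL\n    figures=[fig_b64_1, fig_b64_2, fig_b64_3],  # base64 data URIs, e.g. from parse_figure_paths()\n    max_figures_to_process=2,\n)\n# len(captions) == 3; the last entry is \"\" because only the first two figures are summarized\n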
"},{"location":"reference/loaders/utils/box/","title":"Box","text":""},{"location":"reference/loaders/utils/box/#loaders.utils.box.bbox_to_points","title":"bbox_to_points","text":"bbox_to_points(box)\n
Convert bounding box to list of points
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def bbox_to_points(box: List[int]):\n \"\"\"Convert bounding box to list of points\"\"\"\n x1, y1, x2, y2 = box\n return [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.points_to_bbox","title":"points_to_bbox","text":"points_to_bbox(points)\n
Convert list of points to bounding box
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def points_to_bbox(points: List[Tuple[int, int]]):\n \"\"\"Convert list of points to bounding box\"\"\"\n all_x = [p[0] for p in points]\n all_y = [p[1] for p in points]\n return [min(all_x), min(all_y), max(all_x), max(all_y)]\n
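The two helpers are inverses of each other, e.g.:
from kotaemon.loaders.utils.box import bbox_to_points, points_to_bbox\n\nbox = [10, 20, 110, 220]  # [x1, y1, x2, y2]\npts = bbox_to_points(box)  # [(10, 20), (110, 20), (110, 220), (10, 220)]\nassert points_to_bbox(pts) == box\n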
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_points","title":"scale_points","text":"scale_points(points, scale_factor=1.0)\n
Scale points by a scale factor
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def scale_points(points: List[Tuple[int, int]], scale_factor: float = 1.0):\n \"\"\"Scale points by a scale factor\"\"\"\n return [(int(pos[0] * scale_factor), int(pos[1] * scale_factor)) for pos in points]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.union_points","title":"union_points","text":"union_points(points)\n
Return union bounding box of list of points
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def union_points(points: List[Tuple[int, int]]):\n \"\"\"Return union bounding box of list of points\"\"\"\n all_x = [p[0] for p in points]\n all_y = [p[1] for p in points]\n bbox = (min(all_x), min(all_y), max(all_x), max(all_y))\n return bbox\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_box","title":"scale_box","text":"scale_box(box, scale_factor=1.0)\n
Scale box by a scale factor
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def scale_box(box: List[int], scale_factor: float = 1.0):\n \"\"\"Scale box by a scale factor\"\"\"\n return [int(pos * scale_factor) for pos in box]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_h","title":"box_h","text":"box_h(box)\n
Return box height
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_h(box: List[int]):\n \"Return box height\"\n return box[3] - box[1]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_w","title":"box_w","text":"box_w(box)\n
Return box width
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_w(box: List[int]):\n \"Return box width\"\n return box[2] - box[0]\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_area","title":"box_area","text":"box_area(box)\n
Return box area
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def box_area(box: List[int]):\n \"Return box area\"\n x1, y1, x2, y2 = box\n return (x2 - x1) * (y2 - y1)\n
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.get_rect_iou","title":"get_rect_iou","text":"get_rect_iou(gt_box, pd_box, iou_type=0)\n
Intersection over union for layout rectangles
Parameters:
Name Type Description Defaultgt_box
List[tuple]
A list containing the bounding box coordinates of the ground truth
requiredpd_box
List[tuple]
A list containing the bounding box coordinates of the prediction
requirediou_type
int
0: intersection / union, normal IOU; 1: intersection / min(areas), useful when boxes are under/over-segmented
0
Input format for each box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
 Returns:
Type Descriptionfloat
Intersection over union value
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:\n \"\"\"Intersection over union on layout rectangle\n\n Args:\n gt_box: List[tuple]\n A list contains bounding box coordinates of ground truth\n pd_box: List[tuple]\n A list contains bounding box coordinates of prediction\n iou_type: int\n 0: intersection / union, normal IOU\n 1: intersection / min(areas), useful when boxes are under/over-segmented\n\n Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n Annotation for each element in bbox:\n (x1, y1) (x2, y1)\n +-------+\n | |\n | |\n +-------+\n (x1, y2) (x2, y2)\n\n Returns:\n Intersection over union value\n \"\"\"\n\n assert iou_type in [0, 1], \"Only support 0: origin iou, 1: intersection / min(area)\"\n\n # determine the (x, y)-coordinates of the intersection rectangle\n # gt_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n # pd_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n x_left = max(gt_box[0][0], pd_box[0][0])\n y_top = max(gt_box[0][1], pd_box[0][1])\n x_right = min(gt_box[2][0], pd_box[2][0])\n y_bottom = min(gt_box[2][1], pd_box[2][1])\n\n # compute the area of intersection rectangle\n interArea = max(0, x_right - x_left) * max(0, y_bottom - y_top)\n\n # compute the area of both the prediction and ground-truth\n # rectangles\n gt_area = (gt_box[2][0] - gt_box[0][0]) * (gt_box[2][1] - gt_box[0][1])\n pd_area = (pd_box[2][0] - pd_box[0][0]) * (pd_box[2][1] - pd_box[0][1])\n\n # compute the intersection over union by taking the intersection\n # area and dividing it by the sum of prediction + ground-truth\n # areas - the intersection area\n if iou_type == 0:\n iou = interArea / float(gt_area + pd_area - interArea)\n elif iou_type == 1:\n iou = interArea / max(min(gt_area, pd_area), 1)\n\n # return the intersection over union value\n return iou\n
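A small worked example of both IoU variants:
from kotaemon.loaders.utils.box import bbox_to_points, get_rect_iou\n\na = bbox_to_points([0, 0, 10, 10])\nb = bbox_to_points([5, 5, 15, 15])\n# intersection area = 5 * 5 = 25; each box area = 100\nget_rect_iou(a, b, iou_type=0)  # 25 / (100 + 100 - 25) ~= 0.143\nget_rect_iou(a, b, iou_type=1)  # 25 / min(100, 100) = 0.25\n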
"},{"location":"reference/loaders/utils/box/#loaders.utils.box.sort_funsd_reading_order","title":"sort_funsd_reading_order","text":"sort_funsd_reading_order(lines, box_key_name='box')\n
Sort cell list to create the right reading order using their locations
Parameters:
Name Type Description Defaultlines
List[dict]
list of cells to sort
requiredReturns:
Type Descriptiona list of cell lists in the right reading order that contain
no key or start with a key and contain no other key
Source code inlibs/kotaemon/kotaemon/loaders/utils/box.py
def sort_funsd_reading_order(lines: List[dict], box_key_name: str = \"box\"):\n \"\"\"Sort cell list to create the right reading order using their locations\n\n Args:\n lines: list of cells to sort\n\n Returns:\n a list of cell lists in the right reading order that contain\n no key or start with a key and contain no other key\n \"\"\"\n sorted_list = []\n\n if len(lines) == 0:\n return lines\n\n while len(lines) > 1:\n topleft_line = lines[0]\n for line in lines[1:]:\n topleft_line_pos = topleft_line[box_key_name]\n topleft_line_center_y = (topleft_line_pos[1] + topleft_line_pos[3]) / 2\n x1, y1, x2, y2 = line[box_key_name]\n box_center_x = (x1 + x2) / 2\n box_center_y = (y1 + y2) / 2\n cell_h = y2 - y1\n if box_center_y <= topleft_line_center_y - cell_h / 2:\n topleft_line = line\n continue\n if (\n box_center_x < topleft_line_pos[2]\n and box_center_y < topleft_line_pos[3]\n ):\n topleft_line = line\n continue\n sorted_list.append(topleft_line)\n lines.remove(topleft_line)\n\n sorted_list.append(lines[0])\n\n return sorted_list\n
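A minimal sketch of the reading-order sort (note that the input list is mutated in place):
cells = [\n    {\"box\": [0, 100, 50, 120], \"text\": \"second line\"},\n    {\"box\": [0, 10, 50, 30], \"text\": \"first line\"},\n]\nordered = sort_funsd_reading_order(cells)\n[c[\"text\"] for c in ordered]  # [\"first line\", \"second line\"]\n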
"},{"location":"reference/loaders/utils/gpt4v/","title":"Gpt4V","text":""},{"location":"reference/loaders/utils/pdf_ocr/","title":"Pdf Ocr","text":""},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.read_pdf_unstructured","title":"read_pdf_unstructured","text":"read_pdf_unstructured(input_path)\n
Convert PDF from specified path to list of text items with location information
Parameters:
Name Type Description Defaultinput_path
Union[Path, str]
path to input file
requiredReturns:
Type DescriptionDict page_number: list of text boxes
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def read_pdf_unstructured(input_path: Union[Path, str]):\n \"\"\"Convert PDF from specified path to list of text items with\n location information\n\n Args:\n input_path: path to input file\n\n Returns:\n Dict page_number: list of text boxes\n \"\"\"\n try:\n from unstructured.partition.auto import partition\n except ImportError as e:\n raise ImportError(\n \"Please install unstructured PDF reader `pip install unstructured[pdf]`: \"\n f\"{e}\"\n )\n\n page_items = defaultdict(list)\n items = partition(input_path)\n for item in items:\n page_number = item.metadata.page_number\n bbox = points_to_bbox(item.metadata.coordinates.points)\n coord_system = item.metadata.coordinates.system\n max_w, max_h = coord_system.width, coord_system.height\n page_items[page_number - 1].append(\n {\n \"text\": item.text,\n \"box\": bbox,\n \"location\": bbox_to_points(bbox),\n \"page_shape\": (max_w, max_h),\n }\n )\n\n return page_items\n
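A hedged usage sketch (hypothetical file name; requires the unstructured[pdf] extra to be installed):
pages = read_pdf_unstructured(\"paper.pdf\")  # hypothetical file\nfor page_number, boxes in pages.items():  # 0-based page index -> list of text boxes\n    for box in boxes:\n        print(page_number, box[\"box\"], box[\"text\"][:40])\n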
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_ocr_and_pdf_texts","title":"merge_ocr_and_pdf_texts","text":"merge_ocr_and_pdf_texts(\n ocr_list, pdf_text_list, debug_info=None\n)\n
Merge PDF and OCR text using IoU-based overlap of their locations. Args: ocr_list: List of OCR items {\"text\", \"box\", \"location\"}. pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}.
Returns:
Type DescriptionCombined list of PDF text and non-overlap OCR text
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def merge_ocr_and_pdf_texts(\n ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None\n):\n \"\"\"Merge PDF and OCR text using IOU overlapping location\n Args:\n ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n Returns:\n Combined list of PDF text and non-overlap OCR text\n \"\"\"\n not_matched_ocr = []\n\n # check for debug info\n if debug_info is not None:\n cv2, debug_im = debug_info\n\n for ocr_item in ocr_list:\n matched = False\n for pdf_item in pdf_text_list:\n if (\n get_rect_iou(ocr_item[\"location\"], pdf_item[\"location\"], iou_type=1)\n > IOU_THRES\n ):\n matched = True\n break\n\n color = (255, 0, 0)\n if not matched:\n ocr_item[\"matched\"] = False\n not_matched_ocr.append(ocr_item)\n color = (0, 255, 255)\n\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n ocr_item[\"location\"][0],\n ocr_item[\"location\"][2],\n color=color,\n thickness=1,\n )\n\n if debug_info is not None:\n for pdf_item in pdf_text_list:\n cv2.rectangle(\n debug_im,\n pdf_item[\"location\"][0],\n pdf_item[\"location\"][2],\n color=(0, 255, 0),\n thickness=2,\n )\n\n return pdf_text_list + not_matched_ocr\n
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_table_cell_and_ocr","title":"merge_table_cell_and_ocr","text":"merge_table_cell_and_ocr(\n table_list, ocr_list, pdf_list, debug_info=None\n)\n
Merge table items with OCR text using IoU-based overlap of their locations. Args: table_list: List of table items {\"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}. ocr_list: List of OCR items {\"text\", \"box\", \"location\"}. pdf_list: List of PDF items {\"text\", \"box\", \"location\"}.
Returns:
Name Type Descriptionall_table_cells
List of tables; each table is represented by a list of cells with combined text from OCR
not_matched_items
List of PDF text which is not overlapped by table region
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def merge_table_cell_and_ocr(\n table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None\n):\n \"\"\"Merge table items with OCR text using IOU overlapping location\n Args:\n table_list: List of table items\n \"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}\n ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n pdf_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n Returns:\n all_table_cells: List of tables, each of table is represented\n by list of cells with combined text from OCR\n not_matched_items: List of PDF text which is not overlapped by table region\n \"\"\"\n # check for debug info\n if debug_info is not None:\n cv2, debug_im = debug_info\n\n cell_list = [item for item in table_list if item[\"type\"] == \"cell\"]\n table_list = [item for item in table_list if item[\"type\"] == \"table\"]\n\n # sort table by area\n table_list = sorted(table_list, key=lambda item: box_area(item[\"bbox\"]))\n\n all_tables = []\n matched_pdf_ids = []\n matched_cell_ids = []\n\n for table in table_list:\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n table[\"location\"][0],\n table[\"location\"][2],\n color=[0, 0, 255],\n thickness=5,\n )\n\n cur_table_cells = []\n for cell_id, cell in enumerate(cell_list):\n if cell_id in matched_cell_ids:\n continue\n\n if get_rect_iou(\n table[\"location\"], cell[\"location\"], iou_type=1\n ) > IOU_THRES and box_area(table[\"bbox\"]) > box_area(cell[\"bbox\"]):\n color = [128, 0, 128]\n # cell matched to table\n for item_list, item_type in [(pdf_list, \"pdf\"), (ocr_list, \"ocr\")]:\n cell[\"ocr\"] = []\n for item_id, item in enumerate(item_list):\n if item_type == \"pdf\" and item_id in matched_pdf_ids:\n continue\n if (\n get_rect_iou(item[\"location\"], cell[\"location\"], iou_type=1)\n > IOU_THRES\n ):\n cell[\"ocr\"].append(item)\n if item_type == \"pdf\":\n matched_pdf_ids.append(item_id)\n\n if len(cell[\"ocr\"]) > 0:\n # check if union of matched ocr does\n # not extend over cell boundary,\n # if True, continue to use OCR_list to match\n all_box_points_in_cell = []\n for item in cell[\"ocr\"]:\n all_box_points_in_cell.extend(item[\"location\"])\n union_box = union_points(all_box_points_in_cell)\n cell_okay = (\n box_h(union_box) <= box_h(cell[\"bbox\"]) * PADDING_THRES\n and box_w(union_box) <= box_w(cell[\"bbox\"]) * PADDING_THRES\n )\n else:\n cell_okay = False\n\n if cell_okay:\n if item_type == \"pdf\":\n color = [255, 0, 255]\n break\n\n if debug_info is not None:\n cv2.rectangle(\n debug_im,\n cell[\"location\"][0],\n cell[\"location\"][2],\n color=color,\n thickness=3,\n )\n\n matched_cell_ids.append(cell_id)\n cur_table_cells.append(cell)\n\n all_tables.append(cur_table_cells)\n\n not_matched_items = [\n item for _id, item in enumerate(pdf_list) if _id not in matched_pdf_ids\n ]\n if debug_info is not None:\n for item in not_matched_items:\n cv2.rectangle(\n debug_im,\n item[\"location\"][0],\n item[\"location\"][2],\n color=[128, 128, 128],\n thickness=3,\n )\n\n return all_tables, not_matched_items\n
"},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.parse_ocr_output","title":"parse_ocr_output","text":"parse_ocr_output(\n ocr_page_items,\n pdf_page_items,\n artifact_path=None,\n debug_path=None,\n)\n
Main function to combine OCR output and PDF text to form a list of table / non-table regions. Args: ocr_page_items: List of OCR items by page. pdf_page_items: Dict of PDF texts (page number as key). artifact_path: Path to the folder containing the page images (used to load the page image in debug mode). debug_path: If specified, use OpenCV to plot a debug image and save it to debug_path.
Source code inlibs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def parse_ocr_output(\n ocr_page_items: List[dict],\n pdf_page_items: Dict[int, List[dict]],\n artifact_path: Optional[str] = None,\n debug_path: Optional[str] = None,\n):\n \"\"\"Main function to combine OCR output and PDF text to\n form list of table / non-table regions\n Args:\n ocr_page_items: List of OCR items by page\n pdf_page_items: Dict of PDF texts (page number as key)\n debug_path: If specified, use OpenCV to plot debug image and save to debug_path\n \"\"\"\n all_tables = []\n all_texts = []\n\n for page_id, page in enumerate(ocr_page_items):\n ocr_list = page[\"json\"][\"ocr\"]\n table_list = page[\"json\"][\"table\"]\n page_shape = page[\"image_shape\"]\n pdf_item_list = pdf_page_items[page_id]\n\n # create bbox additional information\n for item in ocr_list:\n item[\"box\"] = points_to_bbox(item[\"location\"])\n\n # re-scale pdf items according to new image size\n for item in pdf_item_list:\n scale_factor = page_shape[0] / item[\"page_shape\"][0]\n item[\"box\"] = scale_box(item[\"box\"], scale_factor=scale_factor)\n item[\"location\"] = scale_points(item[\"location\"], scale_factor=scale_factor)\n\n # if using debug mode, openCV must be installed\n if debug_path and artifact_path is not None:\n try:\n import cv2\n except ImportError:\n raise ImportError(\n \"Please install openCV first to use OCRReader debug mode\"\n )\n image_path = Path(artifact_path) / page[\"image\"]\n image = cv2.imread(str(image_path))\n debug_info = (cv2, image)\n else:\n debug_info = None\n\n new_pdf_list = merge_ocr_and_pdf_texts(\n ocr_list, pdf_item_list, debug_info=debug_info\n )\n\n # sort by reading order\n ocr_list = sort_funsd_reading_order(ocr_list)\n new_pdf_list = sort_funsd_reading_order(new_pdf_list)\n\n all_table_cells, non_table_text_list = merge_table_cell_and_ocr(\n table_list, ocr_list, new_pdf_list, debug_info=debug_info\n )\n\n table_texts = [table_cells_to_markdown(cells) for cells in all_table_cells]\n all_tables.extend([(page_id, text) for text in table_texts])\n all_texts.append(\n (page_id, \" \".join(item[\"text\"] for item in non_table_text_list))\n )\n\n # export debug image to debug_path\n if debug_path:\n cv2.imwrite(str(Path(debug_path) / \"page_{}.png\".format(page_id)), image)\n\n return all_tables, all_texts\n
"},{"location":"reference/loaders/utils/table/","title":"Table","text":""},{"location":"reference/loaders/utils/table/#loaders.utils.table.check_col_conflicts","title":"check_col_conflicts","text":"check_col_conflicts(col_a, col_b, thres=0.15)\n
Check if two columns A and B have non-empty content in the same rows (to be used with merge_cols)
Parameters:
Name Type Description Defaultcol_a
List[str]
column A (list of str)
requiredcol_b
List[str]
column B (list of str)
requiredthres
float
percentage of overlapping allowed
0.15
Returns: whether the number of overlapping rows is greater than the threshold
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def check_col_conflicts(\n col_a: List[str], col_b: List[str], thres: float = 0.15\n) -> bool:\n \"\"\"Check if 2 columns A and B has non-empty content in the same row\n (to be used with merge_cols)\n\n Args:\n col_a: column A (list of str)\n col_b: column B (list of str)\n thres: percentage of overlapping allowed\n Returns:\n if number of overlapping greater than threshold\n \"\"\"\n num_rows = len([cell for cell in col_a if cell])\n assert len(col_a) == len(col_b)\n conflict_count = 0\n for cell_a, cell_b in zip(col_a, col_b):\n if cell_a and cell_b:\n conflict_count += 1\n return conflict_count > num_rows * thres\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.merge_cols","title":"merge_cols","text":"merge_cols(col_a, col_b)\n
Merge columns A and B if they do not have conflicting rows
Parameters:
Name Type Description Defaultcol_a
List[str]
column A (list of str)
requiredcol_b
List[str]
column B (list of str)
requiredReturns: merged column
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def merge_cols(col_a: List[str], col_b: List[str]) -> List[str]:\n \"\"\"Merge column A and B if they do not have conflict rows\n\n Args:\n col_a: column A (list of str)\n col_b: column B (list of str)\n Returns:\n merged column\n \"\"\"\n for r_id in range(len(col_a)):\n if col_b[r_id]:\n col_a[r_id] = col_a[r_id] + \" \" + col_b[r_id]\n return col_a\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.add_index_col","title":"add_index_col","text":"add_index_col(csv_rows)\n
Add index column as the first column of the table csv_rows
Parameters:
Name Type Description Defaultcsv_rows
List[List[str]]
input table
requiredReturns: output table with index column
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def add_index_col(csv_rows: List[List[str]]) -> List[List[str]]:\n \"\"\"Add index column as the first column of the table csv_rows\n\n Args:\n csv_rows: input table\n Returns:\n output table with index column\n \"\"\"\n new_csv_rows = [[\"row id\"] + [\"\"] * len(csv_rows[0])]\n for r_id, row in enumerate(csv_rows):\n new_csv_rows.append([str(r_id + 1)] + row)\n return new_csv_rows\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.compress_csv","title":"compress_csv","text":"compress_csv(csv_rows)\n
Compress table csv_rows by merging sparse columns (merge_cols)
Parameters:
Name Type Description Defaultcsv_rows
List[List[str]]
input table
requiredReturns: output: compressed table
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def compress_csv(csv_rows: List[List[str]]) -> List[List[str]]:\n \"\"\"Compress table csv_rows by merging sparse columns (merge_cols)\n\n Args:\n csv_rows: input table\n Returns:\n output: compressed table\n \"\"\"\n csv_cols = [[r[c_id] for r in csv_rows] for c_id in range(len(csv_rows[0]))]\n to_remove_col_ids = []\n last_c_id = 0\n for c_id in range(1, len(csv_cols)):\n if not check_col_conflicts(csv_cols[last_c_id], csv_cols[c_id]):\n to_remove_col_ids.append(c_id)\n csv_cols[last_c_id] = merge_cols(csv_cols[last_c_id], csv_cols[c_id])\n else:\n last_c_id = c_id\n\n csv_cols = [r for c_id, r in enumerate(csv_cols) if c_id not in to_remove_col_ids]\n csv_rows = [[c[r_id] for c in csv_cols] for r_id in range(len(csv_cols[0]))]\n return csv_rows\n
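A worked sketch: the sparse middle column below never overlaps the first one, so the two are merged:
rows = [\n    [\"Name\", \"\", \"Age\"],\n    [\"Jake\", \"\", \"20\"],\n    [\"\", \"Mary\", \"21\"],\n]\ncompress_csv(rows)\n# [[\"Name\", \"Age\"], [\"Jake\", \"20\"], [\" Mary\", \"21\"]]\n# (merge_cols joins with a space, hence the leading space before Mary)\n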
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.get_table_from_ocr","title":"get_table_from_ocr","text":"get_table_from_ocr(ocr_list, table_list)\n
Get the list of text lines belonging to the table regions specified by table_list
Parameters:
Name Type Description Defaultocr_list
List[dict]
list of OCR output in Casia format (Flax)
requiredtable_list
List[dict]
list of table output in Casia format (Flax)
requiredReturns:
Name Type Descriptiontable_texts
List[List[str]]
list of text lines for each table region
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def get_table_from_ocr(ocr_list: List[dict], table_list: List[dict]):\n \"\"\"Get list of text lines belong to table regions specified by table_list\n\n Args:\n ocr_list: list of OCR output in Casia format (Flax)\n table_list: list of table output in Casia format (Flax)\n\n Returns:\n _type_: _description_\n \"\"\"\n table_texts = []\n for table in table_list:\n if table[\"type\"] != \"table\":\n continue\n cur_table_texts = []\n for ocr in ocr_list:\n _iou = get_rect_iou(table[\"location\"], ocr[\"location\"], iou_type=1)\n if _iou > 0.8:\n cur_table_texts.append(ocr[\"text\"])\n table_texts.append(cur_table_texts)\n\n return table_texts\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.make_markdown_table","title":"make_markdown_table","text":"make_markdown_table(array)\n
Convert table rows in list format to markdown string
Parameters:
Name Type Description Defaultarray
List[List[str]]
Python list with rows of the table as lists; the first element is the header. Example input:
[[\"Name\", \"Age\", \"Height\"],\n[\"Jake\", 20, 5'10],\n[\"Mary\", 21, 5'7]]\n
required Returns: String to put into a .md file
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def make_markdown_table(array: List[List[str]]) -> str:\n \"\"\"Convert table rows in list format to markdown string\n\n Args:\n Python list with rows of table as lists\n First element as header.\n Example Input:\n [[\"Name\", \"Age\", \"Height\"],\n [\"Jake\", 20, 5'10],\n [\"Mary\", 21, 5'7]]\n Returns:\n String to put into a .md file\n \"\"\"\n array = compress_csv(array)\n array = add_index_col(array)\n markdown = \"\\n\" + str(\"| \")\n\n for e in array[0]:\n to_add = \" \" + str(e) + str(\" |\")\n markdown += to_add\n markdown += \"\\n\"\n\n markdown += \"| \"\n for i in range(len(array[0])):\n markdown += str(\"--- | \")\n markdown += \"\\n\"\n\n for entry in array[1:]:\n markdown += str(\"| \")\n for e in entry:\n to_add = str(e) + str(\" | \")\n markdown += to_add\n markdown += \"\\n\"\n\n return markdown + \"\\n\"\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_csv_string_to_list","title":"parse_csv_string_to_list","text":"parse_csv_string_to_list(csv_str)\n
Convert CSV string to list of rows
Parameters:
Name Type Description Defaultcsv_str
str
input CSV string
requiredReturns:
Type DescriptionList[List[str]]
Output table in list format
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def parse_csv_string_to_list(csv_str: str) -> List[List[str]]:\n \"\"\"Convert CSV string to list of rows\n\n Args:\n csv_str: input CSV string\n\n Returns:\n Output table in list format\n \"\"\"\n io = StringIO(csv_str)\n csv_reader = csv.reader(io, delimiter=\",\")\n rows = [row for row in csv_reader]\n return rows\n
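For example:
parse_csv_string_to_list(\"a,b\\nc,d\")\n# [[\"a\", \"b\"], [\"c\", \"d\"]]\n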
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.format_cell","title":"format_cell","text":"format_cell(cell, length_limit=None)\n
Format cell content by removing redundant characters and enforcing a length limit
Parameters:
Name Type Description Defaultcell
str
input cell text
requiredlength_limit
Optional[int]
limit of text length.
None
Returns:
Type Descriptionstr
new cell text
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def format_cell(cell: str, length_limit: Optional[int] = None) -> str:\n \"\"\"Format cell content by remove redundant character and enforce length limit\n\n Args:\n cell: input cell text\n length_limit: limit of text length.\n\n Returns:\n new cell text\n \"\"\"\n cell = cell.replace(\"\\n\", \" \")\n if length_limit:\n cell = cell[:length_limit]\n return cell\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.extract_tables_from_csv_string","title":"extract_tables_from_csv_string","text":"extract_tables_from_csv_string(csv_content, table_texts)\n
Extract a list of tables from the FullOCR output (csv_content) using the specified table_texts
Parameters:
Name Type Description Defaultcsv_content
str
CSV output from FullOCR pipeline
requiredtable_texts
List[List[str]]
list of table texts extracted from get_table_from_ocr()
requiredReturns:
Type DescriptionTuple[List[str], str]
List of tables and the remaining non-table text content
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def extract_tables_from_csv_string(\n csv_content: str, table_texts: List[List[str]]\n) -> Tuple[List[str], str]:\n \"\"\"Extract list of table from FullOCR output\n (csv_content) with the specified table_texts\n\n Args:\n csv_content: CSV output from FullOCR pipeline\n table_texts: list of table texts extracted\n from get_table_from_ocr()\n\n Returns:\n List of tables and non-text content\n \"\"\"\n rows = parse_csv_string_to_list(csv_content)\n used_row_ids = []\n table_csv_list = []\n for table in table_texts:\n cur_rows = []\n for row_id, row in enumerate(rows):\n scores = [\n any(cell in cell_reference for cell in table)\n for cell_reference in row\n if cell_reference\n ]\n score = sum(scores) / len(scores)\n if score > 0.5 and row_id not in used_row_ids:\n used_row_ids.append(row_id)\n cur_rows.append([format_cell(cell) for cell in row])\n if cur_rows:\n table_csv_list.append(make_markdown_table(cur_rows))\n else:\n print(\"table not matched\", table)\n\n non_table_rows = [\n row for row_id, row in enumerate(rows) if row_id not in used_row_ids\n ]\n non_table_text = \"\\n\".join(\n \" \".join(format_cell(cell) for cell in row) for row in non_table_rows\n )\n return table_csv_list, non_table_text\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.strip_special_chars_markdown","title":"strip_special_chars_markdown","text":"strip_special_chars_markdown(text)\n
Strip special characters from input text in markdown table format
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def strip_special_chars_markdown(text: str) -> str:\n \"\"\"Strip special characters from input text in markdown table format\"\"\"\n return text.replace(\"|\", \"\").replace(\":---:\", \"\").replace(\"---\", \"\")\n
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_markdown_text_to_tables","title":"parse_markdown_text_to_tables","text":"parse_markdown_text_to_tables(text)\n
Convert markdown text into a list of table spans and a list of non-table spans
Parameters:
Name Type Description Defaulttext
str
input markdown text
requiredReturns:
Type DescriptionTuple[List[str], List[str]]
list of table spans and non-table spans
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def parse_markdown_text_to_tables(text: str) -> Tuple[List[str], List[str]]:\n \"\"\"Convert markdown text to list of non-table spans and table spans\n\n Args:\n text: input markdown text\n\n Returns:\n list of table spans and non-table spans\n \"\"\"\n # init empty tables and texts list\n tables = []\n texts = []\n\n # split input by line break\n lines = text.split(\"\\n\")\n cur_table = []\n cur_text: List[str] = []\n for line in lines:\n line = line.strip()\n if line.startswith(\"|\"):\n if len(cur_text) > 0:\n texts.append(cur_text)\n cur_text = []\n cur_table.append(line)\n else:\n # add new table to the list\n if len(cur_table) > 0:\n tables.append(cur_table)\n cur_table = []\n cur_text.append(line)\n\n table_texts = [\"\\n\".join(table) for table in tables]\n non_table_texts = [\"\\n\".join(text) for text in texts]\n return table_texts, non_table_texts\n
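A small sketch; note that, as written, any trailing segment after the last table/text boundary is not flushed into the output:
md = \"intro text\\n| a | b |\\n| --- | --- |\\n| 1 | 2 |\\noutro text\"\ntables, texts = parse_markdown_text_to_tables(md)\n# tables == [\"| a | b |\\n| --- | --- |\\n| 1 | 2 |\"]\n# texts == [\"intro text\"]; the trailing outro text is never appended\n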
"},{"location":"reference/loaders/utils/table/#loaders.utils.table.table_cells_to_markdown","title":"table_cells_to_markdown","text":"table_cells_to_markdown(cells)\n
Convert list of cells with attached text to Markdown table
Source code inlibs/kotaemon/kotaemon/loaders/utils/table.py
def table_cells_to_markdown(cells: List[dict]):\n \"\"\"Convert list of cells with attached text to Markdown table\"\"\"\n\n if len(cells) == 0:\n return \"\"\n\n all_row_ids = []\n all_col_ids = []\n for cell in cells:\n all_row_ids.extend(cell[\"rows\"])\n all_col_ids.extend(cell[\"columns\"])\n\n num_rows, num_cols = max(all_row_ids) + 1, max(all_col_ids) + 1\n table_rows = [[\"\" for c in range(num_cols)] for r in range(num_rows)]\n\n # start filling in the grid\n for cell in cells:\n cell_text = \" \".join(item[\"text\"] for item in cell[\"ocr\"])\n start_row_id, end_row_id = cell[\"rows\"]\n start_col_id, end_col_id = cell[\"columns\"]\n span_cell = end_row_id != start_row_id or end_col_id != start_col_id\n\n # do not repeat long text in span cell to prevent context length issue\n if span_cell and len(cell_text.replace(\" \", \"\")) < 20 and start_row_id > 0:\n for row in range(start_row_id, end_row_id + 1):\n for col in range(start_col_id, end_col_id + 1):\n table_rows[row][col] += cell_text + \" \"\n else:\n table_rows[start_row_id][start_col_id] += cell_text + \" \"\n\n return make_markdown_table(table_rows)\n
"},{"location":"reference/parsers/","title":"Parsers","text":""},{"location":"reference/parsers/#parsers.RegexExtractor","title":"RegexExtractor","text":" Bases: BaseComponent
Simple class for extracting text from a document using a regex pattern.
Parameters:
Name Type Description Defaultpattern
List[str]
The regex pattern(s) to use.
requiredoutput_map
dict
A mapping from extracted text to the desired output. Defaults to None.
required Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n \"\"\"\n Simple class for extracting text from a document using a regex pattern.\n\n Args:\n pattern (List[str]): The regex pattern(s) to use.\n output_map (dict, optional): A mapping from extracted text to the\n desired output. Defaults to None.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n pattern: list[str]\n output_map: dict[str, str] | Callable[[str], str] = Param(\n default_callback=lambda *_: {}\n )\n\n def __init__(self, pattern: str | list[str], **kwargs):\n if isinstance(pattern, str):\n pattern = [pattern]\n super().__init__(pattern=pattern, **kwargs)\n\n @staticmethod\n def run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n\n @staticmethod\n def map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n\n def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
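A usage sketch (the top-level import path is assumed; output_map may also be a plain dict):
from kotaemon.parsers import RegexExtractor  # assumed top-level re-export\n\nextractor = RegexExtractor(\n    pattern=r\"\\d{4}-\\d{2}-\\d{2}\",  # ISO-style dates\n    output_map=lambda s: s.replace(\"-\", \"/\"),  # post-process every match\n)\nresult = extractor(\"released 2024-01-15, patched 2024-02-01\")[0]\nresult.matches  # [\"2024/01/15\", \"2024/02/01\"]\nresult.text  # \"2024/01/15\" (first match)\n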
"},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod
","text":"run_raw_static(pattern, text)\n
Finds all non-overlapping occurrences of a pattern in a string.
Parameters:
Name Type Description Defaultpattern
str
The regular expression pattern to search for.
requiredtext
str
The input string to search in.
requiredReturns:
Type Descriptionlist[str]
List[str]: A list of all non-overlapping occurrences of the pattern in the string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.map_output","title":"map_output staticmethod
","text":"map_output(text, output_map)\n
Maps the given text
to its corresponding value in the output_map
dictionary.
Parameters:
Name Type Description Defaulttext
str
The input text to be mapped.
requiredoutput_map
dict
A dictionary containing mapping of input text to output values.
requiredReturns:
Name Type Descriptionstr
str
The corresponding value from the output_map
if text
is found in the dictionary, otherwise returns the original text
.
libs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw","title":"run_raw","text":"run_raw(text)\n
Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.
Parameters:
Name Type Description Defaulttext
str
The raw text to be processed.
requiredReturns:
Name Type DescriptionExtractorOutput
ExtractorOutput
The processed output as a list of ExtractorOutput.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n
"},{"location":"reference/parsers/#parsers.RegexExtractor.run","title":"run","text":"run(text)\n
Match the input against a pattern and return the output for each input
Parameters:
Name Type Description Defaulttext
str | list[str] | Document | list[Document]
contains the input string to be processed
requiredReturns:
Type Descriptionlist[ExtractorOutput]
A list contains the output ExtractorOutput for each input
Example>>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/parsers/regex_extractor/","title":"Regex Extractor","text":""},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor","title":"RegexExtractor","text":" Bases: BaseComponent
Simple class for extracting text from a document using a regex pattern.
Parameters:
Name Type Description Defaultpattern
List[str]
The regex pattern(s) to use.
requiredoutput_map
dict
A mapping from extracted text to the desired output. Defaults to None.
required Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n \"\"\"\n Simple class for extracting text from a document using a regex pattern.\n\n Args:\n pattern (List[str]): The regex pattern(s) to use.\n output_map (dict, optional): A mapping from extracted text to the\n desired output. Defaults to None.\n \"\"\"\n\n class Config:\n middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n pattern: list[str]\n output_map: dict[str, str] | Callable[[str], str] = Param(\n default_callback=lambda *_: {}\n )\n\n def __init__(self, pattern: str | list[str], **kwargs):\n if isinstance(pattern, str):\n pattern = [pattern]\n super().__init__(pattern=pattern, **kwargs)\n\n @staticmethod\n def run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n\n @staticmethod\n def map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n\n def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n\n def run(\n self, text: str | list[str] | Document | list[Document]\n ) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod
","text":"run_raw_static(pattern, text)\n
Finds all non-overlapping occurrences of a pattern in a string.
Parameters:
Name Type Description Defaultpattern
str
The regular expression pattern to search for.
requiredtext
str
The input string to search in.
requiredReturns:
Type Descriptionlist[str]
List[str]: A list of all non-overlapping occurrences of the pattern in the string.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n \"\"\"\n Finds all non-overlapping occurrences of a pattern in a string.\n\n Parameters:\n pattern (str): The regular expression pattern to search for.\n text (str): The input string to search in.\n\n Returns:\n List[str]: A list of all non-overlapping occurrences of the pattern in the\n string.\n \"\"\"\n return re.findall(pattern, text)\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.map_output","title":"map_output staticmethod
","text":"map_output(text, output_map)\n
Maps the given text
to its corresponding value in the output_map
dictionary.
Parameters:
Name Type Description Defaulttext
str
The input text to be mapped.
requiredoutput_map
dict
A dictionary containing mapping of input text to output values.
requiredReturns:
Name Type Descriptionstr
str
The corresponding value from the output_map
if text
is found in the dictionary, otherwise returns the original text
.
libs/kotaemon/kotaemon/parsers/regex_extractor.py
@staticmethod\ndef map_output(text, output_map) -> str:\n \"\"\"\n Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n Parameters:\n text (str): The input text to be mapped.\n output_map (dict): A dictionary containing mapping of input text to output\n values.\n\n Returns:\n str: The corresponding value from the `output_map` if `text` is found in the\n dictionary, otherwise returns the original `text`.\n \"\"\"\n if not output_map:\n return text\n\n if isinstance(output_map, dict):\n return output_map.get(text, text)\n\n return output_map(text)\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw","title":"run_raw","text":"run_raw(text)\n
Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.
Parameters:
Name Type Description Defaulttext
str
The raw text to be processed.
requiredReturns:
Name Type DescriptionExtractorOutput
ExtractorOutput
The processed output as a list of ExtractorOutput.
Source code inlibs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n \"\"\"\n Matches the raw text against the pattern and rans the output mapping, returning\n an instance of ExtractorOutput.\n\n Args:\n text (str): The raw text to be processed.\n\n Returns:\n ExtractorOutput: The processed output as a list of ExtractorOutput.\n \"\"\"\n output: list[str] = sum(\n [self.run_raw_static(p, text) for p in self.pattern], []\n )\n output = [self.map_output(text, self.output_map) for text in output]\n\n return ExtractorOutput(\n text=output[0] if output else \"\",\n matches=output,\n metadata={\"origin\": \"RegexExtractor\"},\n )\n
"},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run","title":"run","text":"run(text)\n
Match the input against a pattern and return the output for each input
Parameters:
Name Type Description Defaulttext
str | list[str] | Document | list[Document]
contains the input string to be processed
requiredReturns:
Type Descriptionlist[ExtractorOutput]
A list contains the output ExtractorOutput for each input
Example>>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run(\n self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n \"\"\"Match the input against a pattern and return the output for each input\n\n Parameters:\n text: contains the input string to be processed\n\n Returns:\n A list contains the output ExtractorOutput for each input\n\n Example:\n ```pycon\n >>> document1 = Document(...)\n >>> document2 = Document(...)\n >>> document_batch = [document1, document2]\n >>> batch_output = self(document_batch)\n >>> print(batch_output)\n [output1_document1, output1_document2]\n ```\n \"\"\"\n # TODO: this conversion seems common\n input_: list[str] = []\n if not isinstance(text, list):\n text = [text]\n\n for item in text:\n if isinstance(item, str):\n input_.append(item)\n elif isinstance(item, Document):\n input_.append(item.text)\n else:\n raise ValueError(\n f\"Invalid input type {type(item)}, should be str or Document\"\n )\n\n output = []\n for each_input in input_:\n output.append(self.run_raw(each_input))\n\n return output\n
"},{"location":"reference/storages/","title":"Storages","text":""},{"location":"reference/storages/#storages.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
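To illustrate the contract, a toy in-memory subclass (a hedged sketch; the import paths for BaseDocumentStore and Document are assumptions):
from kotaemon.base import Document  # assumed location of Document\nfrom kotaemon.storages import BaseDocumentStore  # assumed re-export\n\nclass InMemoryDocstore(BaseDocumentStore):\n    def __init__(self):\n        self._docs: dict[str, Document] = {}\n\n    def add(self, docs, ids=None, **kwargs):\n        docs = docs if isinstance(docs, list) else [docs]\n        ids = ids or [d.doc_id for d in docs]\n        ids = ids if isinstance(ids, list) else [ids]\n        for doc_id, doc in zip(ids, docs):\n            self._docs[doc_id] = doc\n\n    def get(self, ids):\n        ids = ids if isinstance(ids, list) else [ids]\n        return [self._docs[i] for i in ids]\n\n    def get_all(self):\n        return list(self._docs.values())\n\n    def count(self):\n        return len(self._docs)\n\n    def query(self, query, top_k=10, doc_ids=None):\n        # naive substring match instead of a real search index\n        docs = self.get(doc_ids) if doc_ids else self.get_all()\n        return [d for d in docs if query in d.text][:top_k]\n\n    def delete(self, ids):\n        ids = ids if isinstance(ids, list) else [ids]\n        for i in ids:\n            self._docs.pop(i, None)\n\n    def drop(self):\n        self._docs.clear()\n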
"},{"location":"reference/storages/#storages.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
Document or list of documents
requiredids
Optional[Union[List[str], str]]
List of ids of the documents. Optional, if not set will use doc.doc_id
None
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Elasticsearch-backed document store that supports full-text (BM25) search
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(\n self,\n collection_name: str = \"docstore\",\n elasticsearch_url: str = \"http://localhost:9200\",\n k1: float = 2.0,\n b: float = 0.75,\n **kwargs,\n ):\n try:\n from elasticsearch import Elasticsearch\n from elasticsearch.helpers import bulk\n except ImportError:\n raise ImportError(\n \"To use ElaticsearchDocstore please install `pip install elasticsearch`\"\n )\n\n self.elasticsearch_url = elasticsearch_url\n self.index_name = collection_name\n self.k1 = k1\n self.b = b\n\n # Create an Elasticsearch client instance\n self.client = Elasticsearch(elasticsearch_url, **kwargs)\n self.es_bulk = bulk\n # Define the index settings and mappings\n settings = {\n \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n \"similarity\": {\n \"custom_bm25\": {\n \"type\": \"BM25\",\n \"k1\": k1,\n \"b\": b,\n }\n },\n }\n mappings = {\n \"properties\": {\n \"content\": {\n \"type\": \"text\",\n \"similarity\": \"custom_bm25\", # Use the custom BM25 similarity\n }\n }\n }\n\n # Create the index with the specified settings and mappings\n if not self.client.indices.exists(index=self.index_name):\n self.client.indices.create(\n index=self.index_name, mappings=mappings, settings=settings\n )\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n\n def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. 
Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n\n def __persist_flow__(self):\n return {\n \"index_name\": self.index_name,\n \"elasticsearch_url\": self.elasticsearch_url,\n \"k1\": self.k1,\n \"b\": self.b,\n }\n
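A hedged connection sketch, assuming an Elasticsearch server reachable at http://localhost:9200 and the elasticsearch package installed (import paths assumed as above):
from kotaemon.base import Document\nfrom kotaemon.storages import ElasticsearchDocumentStore\n\nstore = ElasticsearchDocumentStore(collection_name=\"demo\", elasticsearch_url=\"http://localhost:9200\")\nstore.add([Document(text=\"BM25 ranks documents by term frequency\")])\nhits = store.query(\"term frequency\", top_k=5)  # BM25-scored list of Document\n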
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
refresh_indices
bool
request Elasticsearch to update its index (default to True)
True
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:
Name Type Description Defaultquery
dict
Elasticsearch query format
requiredReturns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
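For queries the plain query() wrapper cannot express, a raw Elasticsearch body can be passed straight through; a short hedged sketch:
raw_query = {\"query\": {\"match\": {\"content\": \"bm25\"}}, \"size\": 3}\ndocs = store.query_raw(raw_query)  # store: an ElasticsearchDocumentStore instance\n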
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:
Name Type Description Defaultquery
str
query text
requiredtop_k
int
number of top documents to return. Defaults to 10.
10
Returns:
Type DescriptionList[Document]
List[Document]: List of result documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple in-memory document store that stores documents in a dictionary
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
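A brief sketch of the dictionary-backed store, showing the exist_ok flag and JSON persistence (import paths assumed as above):
from kotaemon.base import Document\nfrom kotaemon.storages import InMemoryDocumentStore\n\nstore = InMemoryDocumentStore()\ndoc = Document(text=\"kept in a plain dict\")\nstore.add(doc)\nstore.add(doc, exist_ok=True)  # without exist_ok=True a duplicate id raises ValueError\nstore.save(\"docstore.json\")  # plain JSON on disk\nrestored = InMemoryDocumentStore()\nrestored.load(\"docstore.json\")\nassert restored.count() == 1\n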
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
raise error when duplicate doc-id found in the docstore (default to False)
required Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/#storages.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/in_memory.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LanceDB document store which supports full-text search queries
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n \"\"\"LancdDB document store which support full-text search query\"\"\"\n\n def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n )\n\n self.db_uri = path\n self.collection_name = collection_name\n self.db_connection = lancedb.connect(self.db_uri) # type: ignore\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n if doc_ids:\n id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n query_filter = f\"id in ({id_filter})\"\n else:\n query_filter = None\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n if query_filter:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .where(query_filter, prefilter=True)\n .limit(top_k)\n .to_list()\n )\n else:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .limit(top_k)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n 
def get_all(self) -> List[Document]:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"db_uri\": self.db_uri,\n \"collection_name\": self.collection_name,\n }\n
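A hedged usage sketch (assumes lancedb and tantivy are installed; note that count() and get_all() are not implemented for this store):
from kotaemon.base import Document\nfrom kotaemon.storages import LanceDBDocumentStore\n\nstore = LanceDBDocumentStore(path=\"./lancedb\", collection_name=\"docstore\")\nstore.add([Document(text=\"full-text search backed by tantivy\")], ids=[\"doc-1\"])\nhits = store.query(\"tantivy\", top_k=5)\nstore.delete([\"doc-1\"])  # deletion rebuilds the FTS index\n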
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/#storages.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Extends InMemoryDocumentStore by auto-saving whenever the corpus is changed
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):\n \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n def __init__(self, path: str | Path, collection_name: str = \"default\"):\n super().__init__()\n self._path = path\n self._collection_name = collection_name\n\n Path(path).mkdir(parents=True, exist_ok=True)\n self._save_path = Path(path) / f\"{collection_name}.json\"\n if self._save_path.is_file():\n self.load(self._save_path)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n from theflow.utils.modules import serialize\n\n return {\n \"path\": serialize(self._path),\n \"collection_name\": self._collection_name,\n }\n
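A short sketch: every add or delete is flushed to a JSON file named after the collection inside the given path, so a second instance opened on the same path sees the saved corpus (import paths assumed as above):
from kotaemon.base import Document\nfrom kotaemon.storages import SimpleFileDocumentStore\n\nstore = SimpleFileDocumentStore(path=\"./docstore\", collection_name=\"default\")\nstore.add([Document(text=\"auto-saved on every change\")])\n\nreopened = SimpleFileDocumentStore(path=\"./docstore\", collection_name=\"default\")\nassert reopened.count() == 1  # the corpus was persisted automatically\n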
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
if False (the default), raise an error when a duplicate doc-id is found in the docstore; if True, silently overwrite it
required Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/#storages.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
"},{"location":"reference/storages/#storages.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
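The contract sketched against the in-memory implementation: embeddings, metadata and ids go in, ids come out, and query() returns the matched embeddings, similarity scores and ids (import path assumed as above):
# Hedged sketch of the BaseVectorStore contract.\nfrom kotaemon.storages import InMemoryVectorStore\n\nvs = InMemoryVectorStore()\nids = vs.add(embeddings=[[0.1, 0.2], [0.9, 0.8]], metadatas=[{\"file_id\": \"a\"}, {\"file_id\": \"b\"}], ids=[\"v1\", \"v2\"])\nemb, scores, hit_ids = vs.query(embedding=[0.1, 0.2], top_k=1)\nvs.delete([\"v2\"])\nvs.drop()\n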
"},{"location":"reference/storages/#storages.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:
Name Type Description Defaultembeddings
list[list[float]] | list[DocumentWithEmbedding]
List of embeddings
requiredmetadatas
Optional[list[dict]]
List of metadata of the embeddings
None
ids
Optional[list[str]]
List of ids of the embeddings
None
kwargs
meant for vectorstore-specific parameters
requiredReturns:
Type Descriptionlist[str]
List of ids of the embeddings
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
list[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/#storages.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/#storages.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
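A hedged sketch (requires chromadb; the collection is persisted on disk under the given path):
from kotaemon.storages import ChromaVectorStore\n\nvs = ChromaVectorStore(path=\"./chroma\", collection_name=\"default\")\nvs.add(embeddings=[[0.1, 0.2, 0.3]], metadatas=[{\"file_id\": \"a\"}], ids=[\"v1\"])\nvs.delete([\"v1\"])\n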
"},{"location":"reference/storages/#storages.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/#storages.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n def save(\n self,\n save_path: str,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs,\n ):\n\n \"\"\"Save a SimpleVectorStore to disk.\n\n Args:\n save_path: Path of saving vector to disk.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client.persist(persist_path=save_path, fs=fs)\n\n def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n \"\"\"Load a SimpleVectorStore from a persisted path.\n\n Args:\n load_path: Path of loading vector.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n # \"fs\": self._fs,\n }\n
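A persistence round-trip sketch: save() writes the wrapped SimpleVectorStore to disk and load() reads it back (import path assumed as above):
from kotaemon.storages import InMemoryVectorStore\n\nvs = InMemoryVectorStore()\nvs.add(embeddings=[[0.5, 0.5]], metadatas=[{}], ids=[\"v1\"])\nvs.save(\"vectors.json\")\n\nrestored = InMemoryVectorStore()\nrestored.load(\"vectors.json\")\n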
"},{"location":"reference/storages/#storages.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save a SimpleVectorStore to disk.
Parameters:
Name Type Description Defaultsave_path
str
Path of saving vector to disk.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(\n self,\n save_path: str,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs,\n):\n\n \"\"\"Save a SimpleVectorStore to disk.\n\n Args:\n save_path: Path of saving vector to disk.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load a SimpleVectorStore from a persisted path.
Parameters:
Name Type Description Defaultload_path
str
Path of loading vector.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n \"\"\"Load a SimpleVectorStore from a persisted path.\n\n Args:\n load_path: Path of loading vector.\n fs: An abstract super-class for pythonic file-systems\n \"\"\"\n self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/#storages.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
"},{"location":"reference/storages/#storages.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n def __init__(\n self,\n path: str = \"./lancedb\",\n collection_name: str = \"default\",\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tantivy'\"\n )\n\n db_connection = lancedb.connect(path) # type: ignore\n try:\n table = db_connection.open_table(collection_name)\n except FileNotFoundError:\n table = None\n\n self._kwargs = kwargs\n\n # pass through for nice IDE support\n super().__init__(\n uri=path,\n table_name=collection_name,\n table=table,\n **kwargs,\n )\n self._client = cast(LILanceDBVectorStore, self._client)\n self._client._metadata_keys = [\"file_id\"]\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n }\n
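A hedged sketch (requires lancedb; count() is not implemented for this store):
from kotaemon.storages import LanceDBVectorStore\n\nvs = LanceDBVectorStore(path=\"./lancedb\", collection_name=\"default\")\nvs.add(embeddings=[[0.1, 0.9]], metadatas=[{\"file_id\": \"a\"}], ids=[\"v1\"])\nvs.delete([\"v1\"])\n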
"},{"location":"reference/storages/#storages.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/#storages.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
"},{"location":"reference/storages/#storages.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
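Initialization is lazy: the collection is created on first use and the vector dimension is inferred from the first add() call. A hedged, untested sketch of the local file mode (assumes pymilvus with milvus-lite; the path argument mirrors how the store resolves a local uri):
from kotaemon.storages import MilvusVectorStore\n\nvs = MilvusVectorStore(uri=\"milvus.db\", collection_name=\"default\", path=\"./data\")\nvs.add(embeddings=[[0.0, 0.1, 0.2]], metadatas=[{}], ids=[\"v1\"])  # dim=3 is inferred here\n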
"},{"location":"reference/storages/#storages.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
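A hedged connection sketch (assumes a Qdrant server at http://localhost:6333; Qdrant requires point ids to be UUIDs or unsigned integers, so a UUID is generated below):
import uuid\n\nfrom kotaemon.storages import QdrantVectorStore\n\nvs = QdrantVectorStore(collection_name=\"default\", url=\"http://localhost:6333\", api_key=\"\")\npoint_id = str(uuid.uuid4())\nvs.add(embeddings=[[0.1, 0.2]], metadatas=[{}], ids=[point_id])\nprint(vs.count())  # exact point count reported by the server\nvs.delete([point_id])\n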
"},{"location":"reference/storages/#storages.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/#storages.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
"},{"location":"reference/storages/#storages.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore but backed by a file on disk by default
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
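A sketch of the file-backed variant: add() and delete() persist automatically under the given path, and re-opening the same path restores the vectors:
from kotaemon.storages import SimpleFileVectorStore\n\nvs = SimpleFileVectorStore(path=\"./vectors\", collection_name=\"default\")\nvs.add(embeddings=[[0.3, 0.7]], metadatas=[{}], ids=[\"v1\"])\n\nreopened = SimpleFileVectorStore(path=\"./vectors\", collection_name=\"default\")  # loads the persisted data\n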
"},{"location":"reference/storages/docstores/","title":"Docstores","text":""},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charge of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
Document or list of documents
requiredids
Optional[Union[List[str], str]]
List of ids of the documents. Optional, if not set will use doc.doc_id
None
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code inlibs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Elasticsearch document store using BM25 full-text search
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n \"\"\"Elasticsearch document store using BM25 full-text search\"\"\"\n\n def __init__(\n self,\n collection_name: str = \"docstore\",\n elasticsearch_url: str = \"http://localhost:9200\",\n k1: float = 2.0,\n b: float = 0.75,\n **kwargs,\n ):\n try:\n from elasticsearch import Elasticsearch\n from elasticsearch.helpers import bulk\n except ImportError:\n raise ImportError(\n \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n )\n\n self.elasticsearch_url = elasticsearch_url\n self.index_name = collection_name\n self.k1 = k1\n self.b = b\n\n # Create an Elasticsearch client instance\n self.client = Elasticsearch(elasticsearch_url, **kwargs)\n self.es_bulk = bulk\n # Define the index settings and mappings\n settings = {\n \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n \"similarity\": {\n \"custom_bm25\": {\n \"type\": \"BM25\",\n \"k1\": k1,\n \"b\": b,\n }\n },\n }\n mappings = {\n \"properties\": {\n \"content\": {\n \"type\": \"text\",\n \"similarity\": \"custom_bm25\", # Use the custom BM25 similarity\n }\n }\n }\n\n # Create the index with the specified settings and mappings\n if not self.client.indices.exists(index=self.index_name):\n self.client.indices.create(\n index=self.index_name, mappings=mappings, settings=settings\n )\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n\n def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n\n def __persist_flow__(self):\n return {\n \"index_name\": self.index_name,\n \"elasticsearch_url\": self.elasticsearch_url,\n \"k1\": self.k1,\n \"b\": self.b,\n }\n
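A short usage sketch, assuming an Elasticsearch instance is reachable at http://localhost:9200 and that the store and Document are importable as shown (adjust imports to your setup):

from kotaemon.base import Document
from kotaemon.storages import ElasticsearchDocumentStore

# The "docstore" index is created on first use if it does not exist yet.
store = ElasticsearchDocumentStore(
    collection_name="docstore",
    elasticsearch_url="http://localhost:9200",
)

docs = [
    Document(text="kotaemon chats with your documents", metadata={"source": "readme"}),
    Document(text="BM25 weighs term frequency against rarity", metadata={"source": "notes"}),
]
store.add(docs)  # ids default to each doc.doc_id; the index is refreshed after the bulk write

for doc in store.query("BM25 weighting", top_k=5):
    print(doc.doc_id, doc.metadata.get("source"))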
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
list of documents to add
required
ids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
refresh_indices
bool
request Elasticsearch to update its index (default to True)
True
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:
Name Type Description Default
query
dict
Elasticsearch query format
required
Returns:
Type Description
List[Document]
List[Document]: List of result documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
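For anything the BM25 helper does not express, a raw Elasticsearch body can be passed straight through. A hypothetical example, reusing the store from the sketch above: a full-text match combined with a metadata filter (the metadata.source.keyword field assumes Elasticsearch's default dynamic mapping, which adds a .keyword subfield for string metadata):

raw_query = {
    "query": {
        "bool": {
            "must": [{"match": {"content": "chat"}}],
            "filter": [{"term": {"metadata.source.keyword": "readme"}}],
        }
    },
    "size": 5,
}
docs = store.query_raw(raw_query)  # returns List[Document], like query()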
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:
Name Type Description Default
query
str
query text
required
top_k
int
number of top documents to return. Defaults to 10.
10
Returns:
Type Description
List[Document]
List[Document]: List of result documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple memory document store that stores documents in a dictionary
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
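A quick round trip; note that re-adding an existing doc_id raises ValueError unless exist_ok=True is passed (a sketch, assuming the imports shown):

from kotaemon.base import Document
from kotaemon.storages import InMemoryDocumentStore

store = InMemoryDocumentStore()
doc = Document(text="hello world", metadata={"lang": "en"})

store.add(doc)                 # stored under doc.doc_id
store.add(doc, exist_ok=True)  # overwrites instead of raising on the duplicate id
assert store.count() == 1

(fetched,) = store.get(doc.doc_id)
assert fetched.text == "hello world"

store.delete(doc.doc_id)
assert store.count() == 0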
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
list of documents to add
required
ids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
bool
whether to tolerate duplicate doc-ids; if False (the default), adding a duplicate doc-id raises an error
required
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
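Persistence is plain JSON via Document.to_dict() / Document.from_dict(), so a save/load round trip is short; per the TODO above, the cycle is not lossless for Document subclasses:

store.save("docstore.json")  # one JSON object keyed by doc_id

restored = InMemoryDocumentStore()
restored.load("docstore.json")
assert restored.count() == store.count()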
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LanceDB document store which supports full-text search queries
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tantivy'\"\n )\n\n self.db_uri = path\n self.collection_name = collection_name\n self.db_connection = lancedb.connect(self.db_uri) # type: ignore\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n if doc_ids:\n id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n query_filter = f\"id in ({id_filter})\"\n else:\n query_filter = None\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n if query_filter:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .where(query_filter, prefilter=True)\n .limit(top_k)\n .to_list()\n )\n else:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .limit(top_k)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n def get_all(self) -> List[Document]:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"db_uri\": self.db_uri,\n \"collection_name\": self.collection_name,\n }\n
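A usage sketch, assuming lancedb and its tantivy full-text dependency are installed; the table is created lazily on the first add, and note that add expects a list of documents here:

from kotaemon.base import Document
from kotaemon.storages import LanceDBDocumentStore

store = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")

store.add(
    [
        Document(text="full-text search backed by tantivy", metadata={"tag": "fts"}),
        Document(text="metadata travels as a JSON string column", metadata={"tag": "note"}),
    ]
)  # refresh_indices=True (the default) rebuilds the FTS index after the write

hits = store.query("tantivy", top_k=3)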
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Improves InMemoryDocumentStore by auto-saving whenever the corpus is changed
Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):\n \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n def __init__(self, path: str | Path, collection_name: str = \"default\"):\n super().__init__()\n self._path = path\n self._collection_name = collection_name\n\n Path(path).mkdir(parents=True, exist_ok=True)\n self._save_path = Path(path) / f\"{collection_name}.json\"\n if self._save_path.is_file():\n self.load(self._save_path)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n from theflow.utils.modules import serialize\n\n return {\n \"path\": serialize(self._path),\n \"collection_name\": self._collection_name,\n }\n
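Since add() and delete() call save() right away, the JSON file at path/collection_name.json always mirrors the corpus, and a fresh instance pointed at the same path picks it up on construction (a sketch):

from kotaemon.base import Document
from kotaemon.storages import SimpleFileDocumentStore

store = SimpleFileDocumentStore(path="./docstore", collection_name="default")
store.add(Document(text="persisted as soon as it is added"), exist_ok=True)

# A separate instance reloads the saved corpus from ./docstore/default.json.
again = SimpleFileDocumentStore(path="./docstore", collection_name="default")
assert again.count() >= 1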
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
list of documents to add
required
ids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
bool
whether to tolerate duplicate doc-ids; if False (the default), adding a duplicate doc-id raises an error
required
Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n super().add(docs=docs, ids=ids, **kwargs)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
"},{"location":"reference/storages/docstores/base/","title":"Base","text":""},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore","title":"BaseDocumentStore","text":" Bases: ABC
A document store is in charge of storing and managing documents
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
class BaseDocumentStore(ABC):\n \"\"\"A document store is in charge of storing and managing documents\"\"\"\n\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n\n @abstractmethod\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n\n @abstractmethod\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n\n @abstractmethod\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n\n @abstractmethod\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.add","title":"add abstractmethod
","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
Document or list of documents
required
ids
Optional[Union[List[str], str]]
List of ids of the documents. Optional, if not set will use doc.doc_id
None
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: Document or list of documents\n ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n \"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get","title":"get abstractmethod
","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get_all","title":"get_all abstractmethod
","text":"get_all()\n
Get all documents
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.count","title":"count abstractmethod
","text":"count()\n
Count number of documents
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.query","title":"query abstractmethod
","text":"query(query, top_k=10, doc_ids=None)\n
Search document store using search query
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search document store using search query\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.delete","title":"delete abstractmethod
","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the document store\"\"\"\n ...\n
"},{"location":"reference/storages/docstores/elasticsearch/","title":"Elasticsearch","text":""},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":" Bases: BaseDocumentStore
Elasticsearch document store using BM25 full-text search
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n \"\"\"Elasticsearch document store using BM25 full-text search\"\"\"\n\n def __init__(\n self,\n collection_name: str = \"docstore\",\n elasticsearch_url: str = \"http://localhost:9200\",\n k1: float = 2.0,\n b: float = 0.75,\n **kwargs,\n ):\n try:\n from elasticsearch import Elasticsearch\n from elasticsearch.helpers import bulk\n except ImportError:\n raise ImportError(\n \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n )\n\n self.elasticsearch_url = elasticsearch_url\n self.index_name = collection_name\n self.k1 = k1\n self.b = b\n\n # Create an Elasticsearch client instance\n self.client = Elasticsearch(elasticsearch_url, **kwargs)\n self.es_bulk = bulk\n # Define the index settings and mappings\n settings = {\n \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n \"similarity\": {\n \"custom_bm25\": {\n \"type\": \"BM25\",\n \"k1\": k1,\n \"b\": b,\n }\n },\n }\n mappings = {\n \"properties\": {\n \"content\": {\n \"type\": \"text\",\n \"similarity\": \"custom_bm25\", # Use the custom BM25 similarity\n }\n }\n }\n\n # Create the index with the specified settings and mappings\n if not self.client.indices.exists(index=self.index_name):\n self.client.indices.create(\n index=self.index_name, mappings=mappings, settings=settings\n )\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n\n def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n\n def __persist_flow__(self):\n return {\n \"index_name\": self.index_name,\n \"elasticsearch_url\": self.elasticsearch_url,\n \"k1\": self.k1,\n \"b\": self.b,\n }\n
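When a candidate set is already known, e.g. ids coming out of a vector search, doc_ids restricts BM25 scoring to those documents (a sketch with hypothetical ids, using a store constructed as above):

candidate_ids = ["doc-1", "doc-2", "doc-3"]  # e.g. ids returned by a vector index
hits = store.query("installation steps", top_k=10, doc_ids=candidate_ids)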
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
list of documents to add
required
ids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
refresh_indices
bool
request Elasticsearch to update its index (default to True)
True
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or use existing doc.doc_id\n refresh_indices: request Elasticsearch to update its index (default to True)\n \"\"\"\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n requests = []\n for doc_id, doc in zip(doc_ids, docs):\n text = doc.text\n metadata = doc.metadata\n request = {\n \"_op_type\": \"index\",\n \"_index\": self.index_name,\n \"content\": text,\n \"metadata\": metadata,\n \"_id\": doc_id,\n }\n requests.append(request)\n\n success, failed = self.es_bulk(self.client, requests)\n print(\"Added/Updated documents to index\", success)\n print(\"Failed documents to index\", failed)\n\n if refresh_indices:\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"query_raw(query)\n
Query Elasticsearch store using query format of ES client
Parameters:
Name Type Description Default
query
dict
Elasticsearch query format
required
Returns:
Type Description
List[Document]
List[Document]: List of result documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query_raw(self, query: dict) -> List[Document]:\n \"\"\"Query Elasticsearch store using query format of ES client\n\n Args:\n query (dict): Elasticsearch query format\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n res = self.client.search(index=self.index_name, body=query)\n docs = []\n for r in res[\"hits\"][\"hits\"]:\n docs.append(\n Document(\n id_=r[\"_id\"],\n text=r[\"_source\"][\"content\"],\n metadata=r[\"_source\"][\"metadata\"],\n )\n )\n return docs\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Search Elasticsearch docstore using search query (BM25)
Parameters:
Name Type Description Default
query
str
query text
required
top_k
int
number of top documents to return. Defaults to 10.
10
Returns:
Type Description
List[Document]
List[Document]: List of result documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n Args:\n query (str): query text\n top_k (int, optional): number of\n top documents to return. Defaults to 10.\n\n Returns:\n List[Document]: List of result documents\n \"\"\"\n query_dict: dict = {\"match\": {\"content\": query}}\n if doc_ids is not None:\n query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n query_dict = {\"query\": query_dict, \"size\": top_k}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n count = int(\n self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n )\n return count\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n return self.query_raw(query_dict)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n query = {\"query\": {\"terms\": {\"_id\": ids}}}\n self.client.delete_by_query(index=self.index_name, body=query)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.client.indices.delete(index=self.index_name)\n self.client.indices.refresh(index=self.index_name)\n
"},{"location":"reference/storages/docstores/in_memory/","title":"In Memory","text":""},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":" Bases: BaseDocumentStore
Simple memory document store that stores documents in a dictionary
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
class InMemoryDocumentStore(BaseDocumentStore):\n \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n def __init__(self):\n self._store = {}\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n ):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n\n def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n\n def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n\n def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n\n def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n\n def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n\n def __persist_flow__(self):\n return {}\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Default
docs
Union[Document, List[Document]]
list of documents to add
required
ids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
bool
whether to tolerate duplicate doc-ids; if False (the default), adding a duplicate doc-id raises an error
required
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n **kwargs,\n):\n \"\"\"Add document into document store\n\n Args:\n docs: list of documents to add\n ids: specify the ids of documents to add or\n use existing doc.doc_id\n exist_ok: raise error when duplicate doc-id\n found in the docstore (default to False)\n \"\"\"\n exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n if ids and not isinstance(ids, list):\n ids = [ids]\n if not isinstance(docs, list):\n docs = [docs]\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n for doc_id, doc in zip(doc_ids, docs):\n if doc_id in self._store and not exist_ok:\n raise ValueError(f\"Document with id {doc_id} already exist\")\n self._store[doc_id] = doc\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get_all","title":"get_all","text":"get_all()\n
Get all documents
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def get_all(self) -> List[Document]:\n \"\"\"Get all documents\"\"\"\n return list(self._store.values())\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.count","title":"count","text":"count()\n
Count number of documents
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def count(self) -> int:\n \"\"\"Count number of documents\"\"\"\n return len(self._store)\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n del self._store[doc_id]\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.save","title":"save","text":"save(path)\n
Save document to path
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def save(self, path: Union[str, Path]):\n \"\"\"Save document to path\"\"\"\n store = {key: value.to_dict() for key, value in self._store.items()}\n with open(path, \"w\") as f:\n json.dump(store, f)\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.load","title":"load","text":"load(path)\n
Load document store from path
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def load(self, path: Union[str, Path]):\n \"\"\"Load document store from path\"\"\"\n with open(path) as f:\n store = json.load(f)\n # TODO: save and load aren't lossless. A Document-subclass will lose\n # information. Need to edit the `to_dict` and `from_dict` methods in\n # the Document class.\n # For better query support, utilize SQLite as the default document store.\n # Also, for portability, use SQLAlchemy for document store.\n self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.query","title":"query","text":"query(query, top_k=10, doc_ids=None)\n
Perform full-text search on document store
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n \"\"\"Perform full-text search on document store\"\"\"\n return []\n
"},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self._store = {}\n
"},{"location":"reference/storages/docstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":" Bases: BaseDocumentStore
LanceDB document store which supports full-text search queries
Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n \"\"\"LancdDB document store which support full-text search query\"\"\"\n\n def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n try:\n import lancedb\n except ImportError:\n raise ImportError(\n \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n )\n\n self.db_uri = path\n self.collection_name = collection_name\n self.db_connection = lancedb.connect(self.db_uri) # type: ignore\n\n def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n ):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def query(\n self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n ) -> List[Document]:\n if doc_ids:\n id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n query_filter = f\"id in ({id_filter})\"\n else:\n query_filter = None\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n if query_filter:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .where(query_filter, prefilter=True)\n .limit(top_k)\n .to_list()\n )\n else:\n docs = (\n document_collection.search(query, query_type=\"fts\")\n .limit(top_k)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n\n def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n\n def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n\n def count(self) -> int:\n raise NotImplementedError\n\n 
def get_all(self) -> List[Document]:\n raise NotImplementedError\n\n def __persist_flow__(self):\n return {\n \"db_uri\": self.db_uri,\n \"collection_name\": self.collection_name,\n }\n
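Example — a minimal usage sketch for LanceDBDocumentStore. It assumes `Document` and the store class are importable from `kotaemon.base` and `kotaemon.storages` (inferred from the source paths above) and that `lancedb` and `tantivy` are installed; it is not part of the generated listing.

```python
# Hedged usage sketch: index a few documents, run a full-text query,
# then fetch and delete by id.
from kotaemon.base import Document                  # assumed import path
from kotaemon.storages import LanceDBDocumentStore  # assumed import path

store = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")

docs = [
    Document(text="LanceDB supports full-text search.", metadata={"source": "a.md"}),
    Document(text="Vectors live in a separate vector store.", metadata={"source": "b.md"}),
]
store.add(docs)  # creates the table and builds the FTS index by default

hits = store.query("full-text search", top_k=5)  # tantivy-backed FTS query
print([d.metadata["source"] for d in hits])

fetched = store.get([docs[0].doc_id])  # look up by id
store.delete([docs[0].doc_id])         # delete and rebuild the index
```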
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.add","title":"add","text":"add(docs, ids=None, refresh_indices=True, **kwargs)\n
Load documents into lancedb storage.
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def add(\n self,\n docs: Union[Document, List[Document]],\n ids: Optional[Union[List[str], str]] = None,\n refresh_indices: bool = True,\n **kwargs,\n):\n \"\"\"Load documents into lancedb storage.\"\"\"\n doc_ids = ids if ids else [doc.doc_id for doc in docs]\n data: list[dict[str, str]] | None = [\n {\n \"id\": doc_id,\n \"text\": doc.text,\n \"attributes\": json.dumps(doc.metadata),\n }\n for doc_id, doc in zip(doc_ids, docs)\n ]\n\n if self.collection_name not in self.db_connection.table_names():\n if data:\n document_collection = self.db_connection.create_table(\n self.collection_name, data=data, mode=\"overwrite\"\n )\n else:\n # add data to existing table\n document_collection = self.db_connection.open_table(self.collection_name)\n if data:\n document_collection.add(data)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
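Because `refresh_indices=True` rebuilds the full-text index on every call, a bulk load can defer the rebuild to the last batch. A hedged sketch, where `store` is a LanceDBDocumentStore and `batches` is an illustrative list of `Document` batches:

```python
# Rebuild the FTS index once instead of once per batch (illustrative names).
for batch in batches[:-1]:
    store.add(batch, refresh_indices=False)
store.add(batches[-1], refresh_indices=True)  # single rebuild at the end
```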
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n try:\n document_collection = self.db_connection.open_table(self.collection_name)\n query_filter = f\"id in ({id_filter})\"\n docs = (\n document_collection.search()\n .where(query_filter)\n .limit(MAX_DOCS_TO_GET)\n .to_list()\n )\n except (ValueError, FileNotFoundError):\n docs = []\n return [\n Document(\n id_=doc[\"id\"],\n text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n metadata=json.loads(doc[\"attributes\"]),\n )\n for doc in docs\n ]\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.delete","title":"delete","text":"delete(ids, refresh_indices=True)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n \"\"\"Delete document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n document_collection = self.db_connection.open_table(self.collection_name)\n id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n query_filter = f\"id in ({id_filter})\"\n document_collection.delete(query_filter)\n\n if refresh_indices:\n document_collection.create_fts_index(\n \"text\",\n tokenizer_name=\"en_stem\",\n replace=True,\n )\n
"},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/lancedb.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n self.db_connection.drop_table(self.collection_name)\n
"},{"location":"reference/storages/docstores/simple_file/","title":"Simple File","text":""},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":" Bases: InMemoryDocumentStore
Improves InMemoryDocumentStore by auto-saving whenever the corpus changes
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improves InMemoryDocumentStore by auto-saving whenever the corpus changes\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: if False (the default), raise an error\n                when a duplicate doc-id is found in the docstore\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
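Example — a persistence round trip, assuming `kotaemon.storages` re-exports the class:

```python
# Auto-save on every mutation; a fresh instance over the same path
# reloads the saved corpus from <path>/<collection_name>.json.
from kotaemon.base import Document                     # assumed import path
from kotaemon.storages import SimpleFileDocumentStore  # assumed import path

store = SimpleFileDocumentStore(path="./docstore")
doc = Document(text="hello")
store.add([doc])  # persisted to ./docstore/default.json immediately

store2 = SimpleFileDocumentStore(path="./docstore")  # reloads from disk
assert store2.get([doc.doc_id])[0].text == "hello"
```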
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.get","title":"get","text":"get(ids)\n
Get document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def get(self, ids: Union[List[str], str]) -> List[Document]:\n \"\"\"Get document by id\"\"\"\n if not isinstance(ids, list):\n ids = [ids]\n\n for doc_id in ids:\n if doc_id not in self._store:\n self.load(self._save_path)\n break\n\n return [self._store[doc_id] for doc_id in ids]\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.add","title":"add","text":"add(docs, ids=None, **kwargs)\n
Add document into document store
Parameters:
Name Type Description Defaultdocs
Union[Document, List[Document]]
list of documents to add
requiredids
Optional[Union[List[str], str]]
specify the ids of documents to add or use existing doc.doc_id
None
exist_ok
if False (the default), raise an error when a duplicate doc-id is found in the docstore
required Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: if False (the default), raise an error\n            when a duplicate doc-id is found in the docstore\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.delete","title":"delete","text":"delete(ids)\n
Delete document by id
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def delete(self, ids: Union[List[str], str]):\n \"\"\"Delete document by id\"\"\"\n super().delete(ids=ids)\n self.save(self._save_path)\n
"},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.drop","title":"drop","text":"drop()\n
Drop the document store
Source code inlibs/kotaemon/kotaemon/storages/docstores/simple_file.py
def drop(self):\n \"\"\"Drop the document store\"\"\"\n super().drop()\n self._save_path.unlink(missing_ok=True)\n
"},{"location":"reference/storages/vectorstores/","title":"Vectorstores","text":""},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
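To make the contract concrete, here is a toy, brute-force implementation of the interface (cosine similarity over an in-memory dict). It is only an illustration of the expected semantics, not one of kotaemon's shipped backends; the import path is an assumption.

```python
import math
import uuid

from kotaemon.storages.vectorstores import BaseVectorStore  # assumed import path


class ToyVectorStore(BaseVectorStore):
    """Brute-force in-memory store illustrating the BaseVectorStore contract."""

    def __init__(self):
        self._vectors: dict[str, list[float]] = {}

    def add(self, embeddings, metadatas=None, ids=None) -> list[str]:
        # accept raw vectors or DocumentWithEmbedding-like objects
        vectors = [getattr(e, "embedding", e) for e in embeddings]
        ids = ids or [str(uuid.uuid4()) for _ in vectors]
        self._vectors.update(zip(ids, vectors))
        return ids

    def delete(self, ids, **kwargs):
        for id_ in ids:
            self._vectors.pop(id_, None)

    def query(self, embedding, top_k=1, ids=None, **kwargs):
        def cosine(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
            return dot / norm if norm else 0.0

        # restrict the candidate pool when ids are given, as the ABC allows
        pool = ids if ids is not None else list(self._vectors)
        scored = sorted(
            ((cosine(embedding, self._vectors[i]), i) for i in pool if i in self._vectors),
            reverse=True,
        )[:top_k]
        return (
            [self._vectors[i] for _, i in scored],  # matched embeddings
            [score for score, _ in scored],         # similarity scores
            [i for _, i in scored],                 # ids
        )

    def drop(self):
        self._vectors.clear()
```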
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:
Name Type Description Defaultembeddings
list[list[float]] | list[DocumentWithEmbedding]
List of embeddings
requiredmetadatas
Optional[list[dict]]
List of metadata of the embeddings
None
ids
Optional[list[str]]
List of ids of the embeddings
None
kwargs
meant for vectorstore-specific parameters
requiredReturns:
Type Descriptionlist[str]
List of ids of the embeddings
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
list[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
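Example — a usage sketch with toy 3-d vectors (assumes `chromadb` is installed and that `kotaemon.storages` re-exports the class):

```python
from kotaemon.storages import ChromaVectorStore  # assumed import path

vs = ChromaVectorStore(path="./chroma", collection_name="default")
vs.add(
    embeddings=[[0.1, 0.2, 0.3], [0.9, 0.1, 0.0]],
    metadatas=[{"file_id": "a"}, {"file_id": "b"}],
    ids=["vec-a", "vec-b"],
)
emb, scores, ids = vs.query(embedding=[0.1, 0.2, 0.25], top_k=1)
print(ids, scores)   # expect ["vec-a"] as the nearest neighbour
vs.delete(["vec-b"])
print(vs.count())    # 1
```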
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n        \"\"\"Save the SimpleVectorStore to a file on disk.\n\n        Args:\n            save_path: Path of the file to save the vectors to.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n        \"\"\"Load a SimpleVectorStore from a persist path.\n\n        Args:\n            load_path: Path of the file to load the vectors from.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
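Example — a save/load round trip (a sketch; `kotaemon.storages` is the assumed import path):

```python
from kotaemon.storages import InMemoryVectorStore  # assumed import path

vs = InMemoryVectorStore()
vs.add(embeddings=[[1.0, 0.0], [0.0, 1.0]], ids=["x", "y"])
vs.save("./vectors.json")   # delegates to SimpleVectorStore.persist

vs2 = InMemoryVectorStore()
vs2.load("./vectors.json")  # swaps in a client loaded from disk
_, scores, ids = vs2.query(embedding=[1.0, 0.0], top_k=1)
print(ids)                  # ["x"]
```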
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save the SimpleVectorStore to a file on disk.
Parameters:
Name Type Description Defaultsave_path
str
Path of the file to save the vectors to.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save the SimpleVectorStore to a file on disk.\n\n    Args:\n        save_path: Path of the file to save the vectors to.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load a SimpleVectorStore from a persist path.
Parameters:
Name Type Description Defaultload_path
str
Path of the file to load the vectors from.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load a SimpleVectorStore from a persist path.\n\n    Args:\n        load_path: Path of the file to load the vectors from.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
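Example — a usage sketch; note that only the metadata keys listed in `_metadata_keys` (currently just `file_id`) are kept:

```python
from kotaemon.storages import LanceDBVectorStore  # assumed import path

vs = LanceDBVectorStore(path="./lancedb", collection_name="vectors")
vs.add(
    embeddings=[[0.0, 1.0], [1.0, 0.0]],
    metadatas=[{"file_id": "f1"}, {"file_id": "f2"}],
    ids=["n1", "n2"],
)
_, scores, ids = vs.query(embedding=[0.9, 0.1], top_k=1)
print(ids)  # ["n2"]
```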
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
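The client is created lazily because LlamaIndex's Milvus wrapper needs the vector dimension up front, so the first `add()` or `query()` infers it. A hedged sketch (note that `_lazy_init` joins `path` with a file-based `uri`, so `path` is passed explicitly here; import path assumed):

```python
from kotaemon.storages import MilvusVectorStore  # assumed import path

vs = MilvusVectorStore(uri="./milvus.db", collection_name="default", path="./storage")
# No Milvus connection exists yet; the first add() infers dim=3 and connects.
vs.add(embeddings=[[0.1, 0.2, 0.3]], ids=["m1"])
print(vs.count())  # 1
```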
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
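Example — a usage sketch against a local server (assumes `llama-index-vector-stores-qdrant` is installed and a Qdrant instance is running at the given URL; Qdrant requires UUID or integer point ids):

```python
import uuid

from kotaemon.storages import QdrantVectorStore  # assumed import path

vs = QdrantVectorStore(collection_name="default", url="http://localhost:6333")
ids = [str(uuid.uuid4()) for _ in range(2)]
vs.add(embeddings=[[0.1, 0.2], [0.3, 0.4]], ids=ids)
print(vs.count())    # exact count: 2
vs.delete([ids[0]])  # removed via models.PointIdsList
```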
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
"},{"location":"reference/storages/vectorstores/#storages.vectorstores.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore but is backed by file by default
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
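Example — every `add()`/`delete()` persists to `<path>/<collection_name>`, so a fresh instance picks up existing data (a sketch; import path assumed):

```python
from kotaemon.storages import SimpleFileVectorStore  # assumed import path

vs = SimpleFileVectorStore(path="./vectorstore", collection_name="default")
vs.add(embeddings=[[1.0, 0.0]], ids=["a"])  # persisted immediately

vs2 = SimpleFileVectorStore(path="./vectorstore", collection_name="default")
_, _, ids = vs2.query(embedding=[1.0, 0.0], top_k=1)
print(ids)  # ["a"]
```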
"},{"location":"reference/storages/vectorstores/base/","title":"Base","text":""},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore","title":"BaseVectorStore","text":" Bases: ABC
libs/kotaemon/kotaemon/storages/vectorstores/base.py
class BaseVectorStore(ABC):\n @abstractmethod\n def __init__(self, *args, **kwargs):\n ...\n\n @abstractmethod\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n\n @abstractmethod\n def delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n\n @abstractmethod\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n\n @abstractmethod\n def drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.add","title":"add abstractmethod
","text":"add(embeddings, metadatas=None, ids=None)\n
Add vector embeddings to vector stores
Parameters:
Name Type Description Defaultembeddings
list[list[float]] | list[DocumentWithEmbedding]
List of embeddings
requiredmetadatas
Optional[list[dict]]
List of metadata of the embeddings
None
ids
Optional[list[str]]
List of ids of the embeddings
None
kwargs
meant for vectorstore-specific parameters
requiredReturns:
Type Descriptionlist[str]
List of ids of the embeddings
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n) -> list[str]:\n \"\"\"Add vector embeddings to vector stores\n\n Args:\n embeddings: List of embeddings\n metadatas: List of metadata of the embeddings\n ids: List of ids of the embeddings\n kwargs: meant for vectorstore-specific parameters\n\n Returns:\n List of ids of the embeddings\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.delete","title":"delete abstractmethod
","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
list[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.query","title":"query abstractmethod
","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.drop","title":"drop abstractmethod
","text":"drop()\n
Drop the vector store
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
@abstractmethod\ndef drop(self):\n \"\"\"Drop the vector store\"\"\"\n ...\n
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore","title":"LlamaIndexVectorStore","text":" Bases: BaseVectorStore
Mixin for LlamaIndex based vectorstores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
class LlamaIndexVectorStore(BaseVectorStore):\n \"\"\"Mixin for LlamaIndex based vectorstores\"\"\"\n\n _li_class: type[LIVectorStore | BasePydanticVectorStore] | None\n\n def _get_li_class(self):\n raise NotImplementedError(\n \"Please return the relevant LlamaIndex class in in _get_li_class\"\n )\n\n def __init__(self, *args, **kwargs):\n # get li_class from the method if not set\n if not self._li_class:\n LIClass = self._get_li_class()\n else:\n LIClass = self._li_class\n\n from dataclasses import fields\n\n self._client = LIClass(*args, **kwargs)\n\n self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}\n for key in [\"query_embedding\", \"similarity_top_k\", \"node_ids\"]:\n if key in self._vsq_kwargs:\n self._vsq_kwargs.remove(key)\n\n def __setattr__(self, name: str, value: Any) -> None:\n if name.startswith(\"_\"):\n return super().__setattr__(name, value)\n\n return setattr(self._client, name, value)\n\n def __getattr__(self, name: str) -> Any:\n if name == \"_li_class\":\n return super().__getattribute__(name)\n\n return getattr(self._client, name)\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if isinstance(embeddings[0], list):\n nodes: list[DocumentWithEmbedding] = [\n DocumentWithEmbedding(embedding=embedding) for embedding in embeddings\n ]\n else:\n nodes = embeddings # type: ignore\n if metadatas is not None:\n for node, metadata in zip(nodes, metadatas):\n node.metadata = metadata\n if ids is not None:\n for node, id in zip(nodes, ids):\n node.id_ = id\n node.relationships = {\n NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)\n }\n\n return self._client.add(nodes=nodes)\n\n def delete(self, ids: list[str], **kwargs):\n for id_ in ids:\n self._client.delete(ref_doc_id=id_, **kwargs)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n kwargs: extra query parameters. Depending on the name, these parameters\n will be used when constructing the VectorStoreQuery object or when\n performing querying of the underlying vector store.\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n vsq_kwargs = {}\n vs_kwargs = {}\n for kwkey, kwvalue in kwargs.items():\n if kwkey in self._vsq_kwargs:\n vsq_kwargs[kwkey] = kwvalue\n else:\n vs_kwargs[kwkey] = kwvalue\n\n output = self._client.query(\n query=VectorStoreQuery(\n query_embedding=embedding,\n similarity_top_k=top_k,\n node_ids=ids,\n **vsq_kwargs,\n ),\n **vs_kwargs,\n )\n\n embeddings = []\n if output.nodes:\n for node in output.nodes:\n embeddings.append(node.embedding)\n similarities = output.similarities if output.similarities else []\n out_ids = output.ids if output.ids else []\n\n return embeddings, similarities, out_ids\n
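`query()` splits its extra kwargs by name: anything matching a field of LlamaIndex's `VectorStoreQuery` (for example `filters`) is used to build the query object, and the rest is forwarded to the wrapped store's own `query()` call. A hedged sketch using the Chroma backend (the filter import path follows recent llama-index-core releases and may differ across versions):

```python
from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters

from kotaemon.storages import ChromaVectorStore  # assumed import path

vs = ChromaVectorStore(path="./chroma", collection_name="default")
vs.add(
    embeddings=[[0.1, 0.2], [0.2, 0.1]],
    metadatas=[{"file_id": "f1"}, {"file_id": "f2"}],
    ids=["n1", "n2"],
)
emb, scores, ids = vs.query(
    embedding=[0.1, 0.2],
    top_k=2,
    # `filters` is a VectorStoreQuery field, so it is routed to the query object
    filters=MetadataFilters(filters=[ExactMatchFilter(key="file_id", value="f1")]),
)
print(ids)  # only nodes whose file_id == "f1"
```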
"},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore.query","title":"query","text":"query(embedding, top_k=1, ids=None, **kwargs)\n
Return the top k most similar vector embeddings
Parameters:
Name Type Description Defaultembedding
list[float]
List of embeddings
requiredtop_k
int
Number of most similar embeddings to return
1
ids
Optional[list[str]]
List of ids of the embeddings to be queried
None
kwargs
extra query parameters. Depending on the name, these parameters will be used when constructing the VectorStoreQuery object or when performing querying of the underlying vector store.
{}
Returns:
Type Descriptiontuple[list[list[float]], list[float], list[str]]
the matched embeddings, the similarity scores, and the ids
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/base.py
def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n \"\"\"Return the top k most similar vector embeddings\n\n Args:\n embedding: List of embeddings\n top_k: Number of most similar embeddings to return\n ids: List of ids of the embeddings to be queried\n kwargs: extra query parameters. Depending on the name, these parameters\n will be used when constructing the VectorStoreQuery object or when\n performing querying of the underlying vector store.\n\n Returns:\n the matched embeddings, the similarity scores, and the ids\n \"\"\"\n vsq_kwargs = {}\n vs_kwargs = {}\n for kwkey, kwvalue in kwargs.items():\n if kwkey in self._vsq_kwargs:\n vsq_kwargs[kwkey] = kwvalue\n else:\n vs_kwargs[kwkey] = kwvalue\n\n output = self._client.query(\n query=VectorStoreQuery(\n query_embedding=embedding,\n similarity_top_k=top_k,\n node_ids=ids,\n **vsq_kwargs,\n ),\n **vs_kwargs,\n )\n\n embeddings = []\n if output.nodes:\n for node in output.nodes:\n embeddings.append(node.embedding)\n similarities = output.similarities if output.similarities else []\n out_ids = output.ids if output.ids else []\n\n return embeddings, similarities, out_ids\n
"},{"location":"reference/storages/vectorstores/chroma/","title":"Chroma","text":""},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore","title":"ChromaVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
class ChromaVectorStore(LlamaIndexVectorStore):\n _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n def __init__(\n self,\n path: str = \"./chroma\",\n collection_name: str = \"default\",\n host: str = \"localhost\",\n port: str = \"8000\",\n ssl: bool = False,\n headers: Optional[Dict[str, str]] = None,\n collection_kwargs: Optional[dict] = None,\n stores_text: bool = True,\n flat_metadata: bool = True,\n **kwargs: Any,\n ):\n self._path = path\n self._collection_name = collection_name\n self._host = host\n self._port = port\n self._ssl = ssl\n self._headers = headers\n self._collection_kwargs = collection_kwargs\n self._stores_text = stores_text\n self._flat_metadata = flat_metadata\n self._kwargs = kwargs\n\n try:\n import chromadb\n except ImportError:\n raise ImportError(\n \"ChromaVectorStore requires chromadb. \"\n \"Please install chromadb first `pip install chromadb`\"\n )\n\n client = chromadb.PersistentClient(path=path)\n collection = client.get_or_create_collection(collection_name)\n\n # pass through for nice IDE support\n super().__init__(\n chroma_collection=collection,\n host=host,\n port=port,\n ssl=ssl,\n headers=headers or {},\n collection_kwargs=collection_kwargs or {},\n stores_text=stores_text,\n flat_metadata=flat_metadata,\n **kwargs,\n )\n self._client = cast(LIChromaVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n\n def count(self) -> int:\n return self._collection.count()\n\n def __persist_flow__(self):\n return {\n \"path\": self._path,\n \"collection_name\": self._collection_name,\n \"host\": self._host,\n \"port\": self._port,\n \"ssl\": self._ssl,\n \"headers\": self._headers,\n \"collection_kwargs\": self._collection_kwargs,\n \"stores_text\": self._stores_text,\n \"flat_metadata\": self._flat_metadata,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.client.delete(ids=ids)\n
"},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/chroma.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client._client.delete_collection(self._client.client.name)\n
"},{"location":"reference/storages/vectorstores/in_memory/","title":"In Memory","text":"Simple vector store index.
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore","title":"InMemoryVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n        \"\"\"Save the SimpleVectorStore to a file on disk.\n\n        Args:\n            save_path: Path of the file to save the vectors to.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n        \"\"\"Load a SimpleVectorStore from a persist path.\n\n        Args:\n            load_path: Path of the file to load the vectors from.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.save","title":"save","text":"save(save_path, fs=None, **kwargs)\n
Save the SimpleVectorStore to a file on disk.
Parameters:
Name Type Description Defaultsave_path
str
Path of the file to save the vectors to.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save the SimpleVectorStore to a file on disk.\n\n    Args:\n        save_path: Path of the file to save the vectors to.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.load","title":"load","text":"load(load_path, fs=None)\n
Load a SimpleVectorStore from a persist path.
Parameters:
Name Type Description Defaultload_path
str
Path of the file to load the vectors from.
requiredfs
Optional[AbstractFileSystem]
An abstract super-class for pythonic file-systems
None
Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load a SimpleVectorStore from a persist path.\n\n    Args:\n        load_path: Path of the file to load the vectors from.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
"},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.drop","title":"drop","text":"drop()\n
Clear the old data
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
def drop(self):\n \"\"\"Clear the old data\"\"\"\n self._data = SimpleVectorStoreData()\n
"},{"location":"reference/storages/vectorstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore","title":"LanceDBVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
"},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n self._client.delete_nodes(ids)\n
"},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.drop_table(self.collection_name)\n
"},{"location":"reference/storages/vectorstores/milvus/","title":"Milvus","text":""},{"location":"reference/storages/vectorstores/milvus/#storages.vectorstores.milvus.MilvusVectorStore","title":"MilvusVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
class MilvusVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-milvus'\"\n )\n\n return LIMilvusVectorStore\n\n def __init__(\n self,\n uri: str = \"./milvus.db\", # or \"http://localhost:19530\"\n collection_name: str = \"default\",\n token: Optional[str] = None,\n **kwargs: Any,\n ):\n self._uri = uri\n self._collection_name = collection_name\n self._token = token\n self._kwargs = kwargs\n self._path = kwargs.get(\"path\", None)\n self._inited = False\n\n def _lazy_init(self, dim: Optional[int] = None):\n \"\"\"\n Lazy init the client.\n Because the LlamaIndex init method requires the dim parameter,\n we need to try to get the dim from the first embedding.\n\n Args:\n dim: Dimension of the vectors.\n \"\"\"\n if not self._inited:\n if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n uri = os.path.join(self._path, self._uri)\n else:\n uri = self._uri\n super().__init__(\n uri=uri,\n token=self._token,\n collection_name=self._collection_name,\n dim=dim,\n **self._kwargs,\n )\n from llama_index.vector_stores.milvus import (\n MilvusVectorStore as LIMilvusVectorStore,\n )\n\n self._client = cast(LIMilvusVectorStore, self._client)\n self._inited = True\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n if not self._inited:\n if isinstance(embeddings[0], list):\n dim = len(embeddings[0])\n else:\n dim = len(embeddings[0].embedding)\n self._lazy_init(dim)\n\n return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n def query(\n self,\n embedding: list[float],\n top_k: int = 1,\n ids: Optional[list[str]] = None,\n **kwargs,\n ) -> tuple[list[list[float]], list[float], list[str]]:\n self._lazy_init(len(embedding))\n\n return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n def delete(self, ids: list[str], **kwargs):\n self._lazy_init()\n super().delete(ids=ids, **kwargs)\n\n def drop(self):\n self._client.client.drop_collection(self._collection_name)\n\n def count(self) -> int:\n try:\n self._lazy_init()\n except: # noqa: E722\n return 0\n return self._client.client.query(\n collection_name=self._collection_name, output_fields=[\"count(*)\"]\n )[0][\"count(*)\"]\n\n def __persist_flow__(self):\n return {\n \"uri\": self._uri,\n \"collection_name\": self._collection_name,\n \"token\": self._token,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/qdrant/","title":"Qdrant","text":""},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore","title":"QdrantVectorStore","text":" Bases: LlamaIndexVectorStore
libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
class QdrantVectorStore(LlamaIndexVectorStore):\n _li_class = None\n\n def _get_li_class(self):\n try:\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n except ImportError:\n raise ImportError(\n \"Please install missing package: \"\n \"'pip install llama-index-vector-stores-qdrant'\"\n )\n\n return LIQdrantVectorStore\n\n def __init__(\n self,\n collection_name,\n url: Optional[str] = None,\n api_key: Optional[str] = None,\n client_kwargs: Optional[dict] = None,\n **kwargs: Any,\n ):\n self._collection_name = collection_name\n self._url = url\n self._api_key = api_key\n self._client_kwargs = client_kwargs\n self._kwargs = kwargs\n\n super().__init__(\n collection_name=collection_name,\n url=url,\n api_key=api_key,\n client_kwargs=client_kwargs,\n **kwargs,\n )\n from llama_index.vector_stores.qdrant import (\n QdrantVectorStore as LIQdrantVectorStore,\n )\n\n self._client = cast(LIQdrantVectorStore, self._client)\n\n def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n\n def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n\n def count(self) -> int:\n return self._client.client.count(\n collection_name=self._collection_name, exact=True\n ).count\n\n def __persist_flow__(self):\n return {\n \"collection_name\": self._collection_name,\n \"url\": self._url,\n \"api_key\": self._api_key,\n \"client_kwargs\": self._client_kwargs,\n **self._kwargs,\n }\n
"},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.delete","title":"delete","text":"delete(ids, **kwargs)\n
Delete vector embeddings from vector stores
Parameters:
Name Type Description Defaultids
List[str]
List of ids of the embeddings to be deleted
requiredkwargs
meant for vectorstore-specific parameters
{}
Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def delete(self, ids: List[str], **kwargs):\n \"\"\"Delete vector embeddings from vector stores\n\n Args:\n ids: List of ids of the embeddings to be deleted\n kwargs: meant for vectorstore-specific parameters\n \"\"\"\n from qdrant_client import models\n\n self._client.client.delete(\n collection_name=self._collection_name,\n points_selector=models.PointIdsList(\n points=ids,\n ),\n **kwargs,\n )\n
"},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.drop","title":"drop","text":"drop()\n
Delete entire collection from vector stores
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
def drop(self):\n \"\"\"Delete entire collection from vector stores\"\"\"\n self._client.client.delete_collection(self._collection_name)\n
"},{"location":"reference/storages/vectorstores/simple_file/","title":"Simple File","text":"Simple file vector store index.
"},{"location":"reference/storages/vectorstores/simple_file/#storages.vectorstores.simple_file.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":" Bases: LlamaIndexVectorStore
Similar to InMemoryVectorStore but is backed by file by default
Source code inlibs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
class SimpleFileVectorStore(LlamaIndexVectorStore):\n \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n store_text: bool = False\n\n def __init__(\n self,\n path: str | Path,\n collection_name: str = \"default\",\n data: Optional[SimpleVectorStoreData] = None,\n fs: Optional[fsspec.AbstractFileSystem] = None,\n **kwargs: Any,\n ) -> None:\n \"\"\"Initialize params.\"\"\"\n self._data = data or SimpleVectorStoreData()\n self._fs = fs or fsspec.filesystem(\"file\")\n self._collection_name = collection_name\n self._path = path\n self._save_path = Path(path) / collection_name\n\n super().__init__(\n data=data,\n fs=fs,\n **kwargs,\n )\n\n if self._save_path.is_file():\n self._client = self._li_class.from_persist_path(\n persist_path=str(self._save_path), fs=self._fs\n )\n\n def add(\n self,\n embeddings: list[list[float]] | list[DocumentWithEmbedding],\n metadatas: Optional[list[dict]] = None,\n ids: Optional[list[str]] = None,\n ):\n r = super().add(embeddings, metadatas, ids)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def delete(self, ids: list[str], **kwargs):\n r = super().delete(ids, **kwargs)\n self._client.persist(str(self._save_path), self._fs)\n return r\n\n def drop(self):\n self._data = SimpleVectorStoreData()\n self._save_path.unlink(missing_ok=True)\n\n def __persist_flow__(self):\n d = self._data.to_dict()\n d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n return {\n \"data\": d,\n \"collection_name\": self._collection_name,\n \"path\": str(self._path),\n # \"fs\": self._fs,\n }\n
"}]}
\ No newline at end of file