diff --git a/online_install/index.html b/online_install/index.html index ca8031eb3..a7bad3b5f 100644 --- a/online_install/index.html +++ b/online_install/index.html @@ -3630,28 +3630,18 @@

Online install

Installation (Online HuggingFace Space)

-   1. Go to kotaemon_template
-   2. Use Duplicate function to create your own space
-      Duplicate space
-      Change space params
-   3. Wait for the build to complete and start up (apprx 10 mins).
-      Wait space build
-      Close space build
-   4. Follow the first setup instructions (and register for Cohere API key if needed)
-      Cohere API
-   5. Complete the setup and use your own private space!
-      App Startup
+   • Go to kotaemon_template
+   • Use Duplicate function to create your own space
+     Duplicate space
+     Change space params
+   • Wait for the build to complete and start up (apprx 10 mins).
+     Wait space build
+     Close space build
+   • Follow the first setup instructions (and register for Cohere API key if needed)
+     Cohere API
+   • Complete the setup and use your own private space!
+     App Startup

diff --git a/search/search_index.json b/search/search_index.json index a34293309..5df330dc9 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Quick Start","text":""},{"location":"#getting-started-with-kotaemon","title":"Getting Started with Kotaemon","text":"

This page is intended for end users who want to use the kotaemon tool for Question Answering on local documents. If you are a developer who wants to contribute to the project, please visit the development page.

    "},{"location":"#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"

    Visit this guide.

    "},{"location":"#installation-offline","title":"Installation (Offline)","text":""},{"location":"#download","title":"Download","text":"

    Download the kotaemon-app.zip file from the latest release.

    "},{"location":"#run-setup-script","title":"Run setup script","text":"
    1. Unzip the downloaded file.
    2. Navigate to the scripts folder and start an installer that matches your OS:
      • Windows: run_windows.bat. Just double click the file.
      • macOS: run_macos.sh
        1. Right click on your file and select Open with and Other.
        2. Enable All Applications and choose Terminal.
        3. NOTE: If you always want to open that file with Terminal, then check Always Open With.
        4. From now on, double click on your file and it should work.
      • Linux: run_linux.sh. Please run the script using bash run_linux.sh in your terminal.
    3. After the installation, the installer will ask whether to launch the ktem UI; answer to continue.
    4. Once launched, the application will open automatically in your browser.
    "},{"location":"#launch","title":"Launch","text":"

    To launch the app after initial setup or any change, simply run the run_* script again.

    A browser window will open and greet you with this screen:

    "},{"location":"#usage","title":"Usage","text":"

    For how to use the application, see Usage. This page will also be available to you within the application.

    "},{"location":"#feedback","title":"Feedback","text":"

    Feel free to create a bug report or a feature request on our repo.

    "},{"location":"about/","title":"About Kotaemon","text":""},{"location":"about/#about-kotaemon","title":"About Kotaemon","text":"

    An open-source tool for chatting with your documents. Built with both end users and developers in mind.

    Source Code | Live Demo

    User Guide | Developer Guide | Feedback

    Dark Mode | Light Mode

    "},{"location":"local_model/","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#setup-local-llms-embedding-models","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#prepare-local-models","title":"Prepare local models","text":""},{"location":"local_model/#note","title":"NOTE","text":"

    If you are using the Docker image, please replace http://localhost with http://host.docker.internal to correctly communicate with services on the host machine. See more detail.

    "},{"location":"local_model/#ollama-openai-compatible-server-recommended","title":"Ollama OpenAI compatible server (recommended)","text":"

    Install ollama and start the application.

    Pull your models (e.g.):

    ollama pull llama3.1:8b
    ollama pull nomic-embed-text

    Set up the LLM and Embedding model on the Resources tab with type OpenAI. Set these model parameters to connect to Ollama:

    api_key: ollama
    base_url: http://localhost:11434/v1/
    model: llama3.1:8b (for llm) | nomic-embed-text (for embedding)
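    To sanity-check the connection outside kotaemon, you can hit the same endpoint with the official openai Python client (a minimal sketch, reusing the models pulled above; Ollama ignores the API key, but the client requires a non-empty value):

    from openai import OpenAI

    # Point the client at Ollama's OpenAI-compatible endpoint.
    client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

    # Chat completion with the LLM pulled above.
    chat = client.chat.completions.create(
        model="llama3.1:8b",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
    )
    print(chat.choices[0].message.content)

    # Embedding with the embedding model pulled above.
    emb = client.embeddings.create(model="nomic-embed-text", input="hello world")
    print(len(emb.data[0].embedding))

    If both calls succeed, the same parameters should work on the Resources tab.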

    "},{"location":"local_model/#oobaboogatext-generation-webui-openai-compatible-server","title":"oobabooga/text-generation-webui OpenAI compatible server","text":"

    Install oobabooga/text-generation-webui.

    Follow the setup guide to download your models (GGUF, HF). Also take a look at OpenAI compatible server for detailed instructions.

    Here is a short version:

    # install sentence-transformers for embeddings creation
    pip install sentence_transformers
    # change to the text-generation-webui src dir and start the OpenAI-compatible API
    cd text-generation-webui
    python server.py --api

    Use the Models tab to download a new model and press Load.

    Set up the LLM and Embedding model on the Resources tab with type OpenAI. Set these model parameters to connect to text-generation-webui:

    api_key: dummy
    base_url: http://localhost:5000/v1/
    model: any
    "},{"location":"local_model/#llama-cpp-python-server-llm-only","title":"llama-cpp-python server (LLM only)","text":"

    See llama-cpp-python OpenAI server.

    Download any GGUF model weights from HuggingFace or another source, and place them somewhere on your local machine.

    Run

    LOCAL_MODEL=<path/to/GGUF> python scripts/serve_local.py

    Set up the LLM model on the Resources tab with type OpenAI. Set these model parameters to connect to llama-cpp-python:

    api_key: dummy
    base_url: http://localhost:8000/v1/
    model: model_name
    "},{"location":"local_model/#use-local-models-for-rag","title":"Use local models for RAG","text":"

    You are set! Start a new conversation to test your local RAG pipeline.

    "},{"location":"online_install/","title":"Online install","text":""},{"location":"online_install/#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"
    1. Go to kotaemon_template

    2. Use Duplicate function to create your own space

    3. Wait for the build to complete and start up (approx. 10 mins).

    4. Follow the first setup instructions (and register for a Cohere API key if needed)

    5. Complete the setup and use your own private space!

    "},{"location":"usage/","title":"Basic Usage","text":""},{"location":"usage/#1-add-your-ai-models","title":"1. Add your AI models","text":"

    To add a model:

    1. Navigate to the Resources tab.
    2. Select the LLMs sub-tab.
    3. Select the Add sub-tab.
    4. Configure the model to add:
      • Give it a name.
      • Pick a vendor/provider (e.g. ChatOpenAI).
      • Provide the specifications.
      • (Optional) Set the model as default.
    5. Click Add to add the model.
    6. Select the Embedding Models sub-tab and repeat steps 3 to 5 to add an embedding model.
    (Optional) Configure model via the .env file

    Alternatively, you can configure the models via the .env file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.

    Currently, the following providers are supported:

    "},{"location":"usage/#openai","title":"OpenAI","text":"

    In the .env file, set the OPENAI_API_KEY variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; please feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.

    OPENAI_API_BASE=https://api.openai.com/v1
    OPENAI_API_KEY=<your OpenAI API key here>
    OPENAI_CHAT_MODEL=gpt-3.5-turbo
    OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
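    To confirm these values are valid before starting the app, here is a quick sketch (illustrative only, not part of kotaemon) that loads the same variables with python-dotenv and makes a single chat call:

    import os

    from dotenv import load_dotenv  # pip install python-dotenv
    from openai import OpenAI

    load_dotenv()  # reads the .env file from the current working directory

    client = OpenAI(
        base_url=os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1"),
        api_key=os.environ["OPENAI_API_KEY"],
    )
    resp = client.chat.completions.create(
        model=os.environ.get("OPENAI_CHAT_MODEL", "gpt-3.5-turbo"),
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.choices[0].message.content)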
    "},{"location":"usage/#azure-openai","title":"Azure OpenAI","text":"

    For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide your deployments' names for the chat model and the embedding model, depending on how you set up your Azure deployments.

    AZURE_OPENAI_ENDPOINT=
    AZURE_OPENAI_API_KEY=
    OPENAI_API_VERSION=2024-02-15-preview # could be different for you
    AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo # change to your deployment name
    AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 # change to your deployment name
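    The same kind of smoke test works for Azure with the openai package's AzureOpenAI client (a hedged sketch; note that with Azure the model argument takes your deployment name, not a model name):

    import os

    from openai import AzureOpenAI

    client = AzureOpenAI(
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        api_version=os.environ.get("OPENAI_API_VERSION", "2024-02-15-preview"),
    )

    # With Azure, pass the deployment name where a model name would normally go.
    resp = client.chat.completions.create(
        model=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.choices[0].message.content)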
    "},{"location":"usage/#local-models","title":"Local models","text":"

    Pros:

    Cons:

    "},{"location":"usage/#find-and-download-a-llm","title":"Find and download a LLM","text":"

    You can search for and download an LLM to run locally from the Hugging Face Hub. Currently, these model formats are supported:

    You should choose a model whose size is less than your device's memory and should leave about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.

    Here are some recommendations and their size in memory:

    "},{"location":"usage/#enable-local-models","title":"Enable local models","text":"

    To add a local model to the model pool, set the LOCAL_MODEL variable in the .env file to the path of the model file.

    LOCAL_MODEL=<full path to your model file>

    Here is how to get the full path of your model file:

    "},{"location":"usage/#2-upload-your-documents","title":"2. Upload your documents","text":"

    In order to do QA on your documents, you need to upload them to the application first. Navigate to the File Index tab and you will see 2 sections:

    1. File upload:
      • Drag and drop your file to the UI or select it from your file system. Then click Upload and Index.
      • The application will take some time to process the file and show a message once it is done.
    2. File list:
      • This section shows the list of files that have been uploaded to the application and allows users to delete them.
    "},{"location":"usage/#3-chat-with-your-documents","title":"3. Chat with your documents","text":"

    Now navigate back to the Chat tab. The chat tab is divided into 3 regions:

    1. Conversation Settings Panel
      • Here you can select, create, rename, and delete conversations.
        • By default, a new conversation is created automatically if no conversation is selected.
      • Below that you have the file index, where you can choose whether to disable, select all files, or select which files to retrieve references from.
        • If you choose "Disabled", no files will be considered as context during chat.
        • If you choose "Search All", all files will be considered during chat.
        • If you choose "Select", a dropdown will appear for you to select the files to be considered during chat. If no files are selected, then no files will be considered during chat.
    2. Chat Panel
      • This is where you can chat with the chatbot.
    3. Information Panel

    Generally, the score quality ranks as: LLM relevance score > reranking score > vector score. By default, the overall relevance score is taken directly from the LLM relevance score. Evidence is sorted by its overall relevance score and by whether it has a citation or not.
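    For illustration, here is a minimal sketch of that sorting rule (the field names are hypothetical, not ktem's actual schema):

    # Hypothetical evidence records; "relevance" stands in for the overall
    # relevance score and "has_citation" for citation presence.
    evidences = [
        {"source": "doc_a", "relevance": 0.87, "has_citation": False},
        {"source": "doc_b", "relevance": 0.87, "has_citation": True},
        {"source": "doc_c", "relevance": 0.42, "has_citation": True},
    ]

    # Sort by overall relevance first; among ties, prefer evidence with a citation.
    ranked = sorted(
        evidences,
        key=lambda e: (e["relevance"], e["has_citation"]),
        reverse=True,
    )
    print([e["source"] for e in ranked])  # doc_b, doc_a, doc_c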

    "},{"location":"development/","title":"Development","text":""},{"location":"development/#kotaemon","title":"kotaemon","text":"

    An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind.

    Live Demo | Source Code

    User Guide | Developer Guide | Feedback

    "},{"location":"development/#introduction","title":"Introduction","text":"

    This project serves as a functional RAG UI for both end users who want to do QA on their documents and developers who want to build their own RAG pipeline.

    +----------------------------------------------------------------------------+
    | End users: Those who use apps built with `kotaemon`.                       |
    | (You use an app like the one in the demo above)                            |
    |     +----------------------------------------------------------------+     |
    |     | Developers: Those who built with `kotaemon`.                   |     |
    |     | (You have `import kotaemon` somewhere in your project)         |     |
    |     |     +----------------------------------------------------+     |     |
    |     |     | Contributors: Those who make `kotaemon` better.    |     |     |
    |     |     | (You make PR to this repo)                         |     |     |
    |     |     +----------------------------------------------------+     |     |
    |     +----------------------------------------------------------------+     |
    +----------------------------------------------------------------------------+

    This repository is under active development. Feedback, issues, and PRs are highly appreciated.

    "},{"location":"development/#key-features","title":"Key Features","text":" "},{"location":"development/#installation","title":"Installation","text":""},{"location":"development/#for-end-users","title":"For end users","text":"

    This document is intended for developers. If you just want to install and use the app as it is, please follow the non-technical User Guide. Use the most recent release .zip to get the latest features and bug fixes.

    "},{"location":"development/#for-developers","title":"For developers","text":""},{"location":"development/#with-docker-recommended","title":"With Docker (recommended)","text":"

    We support lite & full versions of the Docker image. With full, the extra packages of unstructured will be installed as well; this adds support for additional file types (.doc, .docx, ...) at the cost of a larger Docker image. For most users, the lite image should work well.

    docker run \
    -e GRADIO_SERVER_NAME=0.0.0.0 \
    -e GRADIO_SERVER_PORT=7860 \
    -p 7860:7860 -it --rm \
    ghcr.io/cinnamon/kotaemon:main-lite
    docker run \
    -e GRADIO_SERVER_NAME=0.0.0.0 \
    -e GRADIO_SERVER_PORT=7860 \
    -p 7860:7860 -it --rm \
    ghcr.io/cinnamon/kotaemon:main-full

    Currently, two platforms are provided & tested: linux/amd64 and linux/arm64 (for newer Macs). Users can specify the platform by passing --platform in the docker run command. For example:

    # To run docker with platform linux/arm64
    docker run \
    -e GRADIO_SERVER_NAME=0.0.0.0 \
    -e GRADIO_SERVER_PORT=7860 \
    -p 7860:7860 -it --rm \
    --platform linux/arm64 \
    ghcr.io/cinnamon/kotaemon:main-lite

    If everything is set up fine, navigate to http://localhost:7860/ to access the web UI.

    We use GHCR to store Docker images; all images can be found here.

    "},{"location":"development/#without-docker","title":"Without Docker","text":"
    # optional (setup env)
    conda create -n kotaemon python=3.10
    conda activate kotaemon

    # clone this repo
    git clone https://github.com/Cinnamon/kotaemon
    cd kotaemon

    pip install -e "libs/kotaemon[all]"
    pip install -e "libs/ktem"
    python app.py

    The app will be automatically launched in your browser.

    The default username / password are admin / admin. You can set up additional users directly on the UI.

    "},{"location":"development/#setup-local-models-for-local-private-rag","title":"Setup local models (for local / private RAG)","text":"

    See Local model setup.

    "},{"location":"development/#customize-your-application","title":"Customize your application","text":"

    By default, all application data is stored in the ./ktem_app_data folder. You can back up or copy this folder to move your installation to a new machine.

    For advanced users or specific use cases, you can customize the following files:

    "},{"location":"development/#flowsettingspy","title":"flowsettings.py","text":"

    This file contains the configuration of your application. You can use the example here as the starting point.

    Notable settings
    # setup your preferred document store (with full-text search capabilities)
    KH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)

    # setup your preferred vectorstore (for vector-based search)
    KH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)

    # Enable / disable multimodal QA
    KH_REASONINGS_USE_MULTIMODAL=True

    # Setup your new reasoning pipeline or modify existing one.
    KH_REASONINGS = [
        "ktem.reasoning.simple.FullQAPipeline",
        "ktem.reasoning.simple.FullDecomposeQAPipeline",
        "ktem.reasoning.react.ReactAgentPipeline",
        "ktem.reasoning.rewoo.RewooAgentPipeline",
    ]
    "},{"location":"development/#env","title":".env","text":"

    This file provides another way to configure your models and credentials.

    Configure model via the .env file

    Alternatively, you can configure the models via the .env file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.

    Currently, the following providers are supported:

    "},{"location":"development/#openai","title":"OpenAI","text":"

    In the .env file, set the OPENAI_API_KEY variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; please feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.

    OPENAI_API_BASE=https://api.openai.com/v1
    OPENAI_API_KEY=<your OpenAI API key here>
    OPENAI_CHAT_MODEL=gpt-3.5-turbo
    OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
    "},{"location":"development/#azure-openai","title":"Azure OpenAI","text":"

    For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide your deployments' names for the chat model and the embedding model, depending on how you set up your Azure deployments.

    AZURE_OPENAI_ENDPOINT=
    AZURE_OPENAI_API_KEY=
    OPENAI_API_VERSION=2024-02-15-preview
    AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
    AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
    "},{"location":"development/#local-models","title":"Local models","text":""},{"location":"development/#using-ollama-openai-compatible-server","title":"Using ollama OpenAI compatible server","text":"

    Install ollama and start the application.

    Pull your models (e.g.):

    ollama pull llama3.1:8b
    ollama pull nomic-embed-text

    Set the model names on the web UI and make them the default.

    "},{"location":"development/#using-gguf-with-llama-cpp-python","title":"Using GGUF with llama-cpp-python","text":"

    You can search for and download an LLM to run locally from the Hugging Face Hub. Currently, these model formats are supported:

    You should choose a model whose size is less than your device's memory and should leave about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time.

    Here are some recommendations and their size in memory:

    Add a new LlamaCpp model with the provided model name on the web UI.

    "},{"location":"development/#adding-your-own-rag-pipeline","title":"Adding your own RAG pipeline","text":""},{"location":"development/#custom-reasoning-pipeline","title":"Custom reasoning pipeline","text":"

    First, check the default pipeline implementation here. You can make quick adjustments to how the default QA pipeline works.

    Next, if you feel comfortable adding a new pipeline, add a new .py implementation in libs/ktem/ktem/reasoning/ and later include it in flowsettings.py to enable it on the UI, as sketched below.
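    As a rough sketch only (the exact hooks are defined by the default implementation above, so treat the class body here as an assumption to be checked against it), a new reasoning pipeline and its registration might look like:

    # libs/ktem/ktem/reasoning/my_pipeline.py -- hypothetical skeleton
    from kotaemon.base import BaseComponent


    class MyQAPipeline(BaseComponent):
        """Custom reasoning pipeline; mirror FullQAPipeline for the real interface."""

        def run(self, question: str, **kwargs):
            # retrieve evidence, build the prompt, call the LLM, return the answer
            ...


    # flowsettings.py -- register it alongside the built-in pipelines
    KH_REASONINGS = [
        "ktem.reasoning.simple.FullQAPipeline",
        "ktem.reasoning.my_pipeline.MyQAPipeline",
    ]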

    "},{"location":"development/#custom-indexing-pipeline","title":"Custom indexing pipeline","text":"

    Check the sample implementation in libs/ktem/ktem/index/file/graph

    (more instruction WIP).

    "},{"location":"development/#developer-guide","title":"Developer guide","text":"

    Please refer to the Developer Guide for more details.

    "},{"location":"development/#star-history","title":"Star History","text":""},{"location":"development/contributing/","title":"Contributing","text":""},{"location":"development/contributing/#contributing","title":"Contributing","text":""},{"location":"development/contributing/#setting-up","title":"Setting up","text":" "},{"location":"development/contributing/#package-overview","title":"Package overview","text":"

    The kotaemon library focuses on the AI building blocks needed to implement a RAG-based QA application. It consists of base interfaces, core components, and a list of utilities:

    mindmap
      root((kotaemon))
        Base Interfaces
          Document
          LLMInterface
          RetrievedDocument
          BaseEmbeddings
          BaseChat
          BaseCompletion
          ...
        Core Components
          LLMs
            AzureOpenAI
            OpenAI
          Embeddings
            AzureOpenAI
            OpenAI
            HuggingFaceEmbedding
          VectorStore
            InMemoryVectorstore
            ChromaVectorstore
          Agent
          Tool
          DocumentStore
          ...
        Utilities
          Scaffold project
          PromptUI
          Documentation Support
    "},{"location":"development/contributing/#common-conventions","title":"Common conventions","text":""},{"location":"development/contributing/#environment-caching-on-pr","title":"Environment caching on PR","text":""},{"location":"development/contributing/#merge-pr-guideline","title":"Merge PR guideline","text":""},{"location":"development/create-a-component/","title":"Creating a Component","text":""},{"location":"development/create-a-component/#creating-a-component","title":"Creating a component","text":"

    A fundamental concept in kotaemon is "component".

    Anything that isn't data or a data structure is a "component". A component can be thought of as a step within a pipeline: it takes in some input, processes it, and returns an output, just like a Python function! The output then becomes the input for the next component in the pipeline. In fact, a pipeline is just a component itself; more precisely, a nested component: one that makes use of other components in its processing step. So in reality there is no difference between a pipeline and a component, and in kotaemon we consider them the same: "component".

    To define a component, you will:

    1. Create a class that subclasses from kotaemon.base.BaseComponent
    2. Declare init params with type annotation
    3. Declare nodes (nodes are just other components!) with type annotation
    4. Implement the processing logic in run.

    The syntax of a component is as follows:

    from kotaemon.base import BaseComponent\nfrom kotaemon.llms import LCAzureChatOpenAI\nfrom kotaemon.parsers import RegexExtractor\n\n\nclass FancyPipeline(BaseComponent):\n    param1: str = \"This is param1\"\n    param2: int = 10\n    param3: float\n\n    node1: BaseComponent    # this is a node because of BaseComponent type annotation\n    node2: LCAzureChatOpenAI  # this is also a node because LCAzureChatOpenAI subclasses BaseComponent\n    node3: RegexExtractor   # this is also a node because RegexExtractor subclasses BaseComponent\n\n    def run(self, some_text: str):\n        prompt = (self.param1 + some_text) * int(self.param2 + self.param3)\n        llm_pred = self.node2(prompt).text\n        matches = self.node3(llm_pred)\n        return matches\n

    Then this component can be used as follows:

    llm = LCAzureChatOpenAI(endpoint=\"some-endpoint\")\nextractor = RegexExtractor(pattern=[\"yes\", \"Yes\"])\n\ncomponent = FancyPipeline(\n    param1=\"Hello\",\n    param3=1.5,\n    node1=llm,\n    node2=llm,\n    node3=extractor\n)\ncomponent(\"goodbye\")\n

    This way, we can define each operation as a reusable component, and use them to compose larger reusable components!

    "},{"location":"development/create-a-component/#benefits-of-component","title":"Benefits of component","text":"

    By defining a component as above, we formally encapsulate all the necessary information inside a single class. This introduces several benefits:

    1. Allow tools like promptui to inspect the inner workings of a component in order to automatically generate the promptui.
    2. Allow visualizing a pipeline for debugging purposes.
    "},{"location":"development/data-components/","title":"Data & Data Structure Components","text":""},{"location":"development/data-components/#data-data-structure-components","title":"Data & Data Structure Components","text":"

    The data & data structure components include:

    "},{"location":"development/data-components/#data-loader","title":"Data Loader","text":""},{"location":"development/data-components/#document-store","title":"Document Store","text":""},{"location":"development/data-components/#vector-store","title":"Vector Store","text":""},{"location":"development/utilities/","title":"Utilities","text":""},{"location":"development/utilities/#utilities","title":"Utilities","text":""},{"location":"development/utilities/#prompt-engineering-ui","title":"Prompt engineering UI","text":"

    Important: despite the name prompt engineering UI, this tool allows testers to test any kind of parameter that is exposed by developers. Prompt is one kind of param; there can be other types of params that testers can tweak (e.g. top_k, temperature...).

    In the development process, developers typically build the pipeline. However, for use cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more effective. To facilitate this, kotaemon offers a user-friendly prompt engineering UI that developers integrate into their pipelines. This enables non-technical members to adjust prompts and parameters, run experiments, and export results for optimization.

    As of Sept 2023, there are 2 kinds of prompt engineering UI:

    "},{"location":"development/utilities/#simple-pipeline","title":"Simple pipeline","text":"

    For a simple pipeline, the supported client project workflow looks as follows:

    1. [tech] Build pipeline
    2. [tech] Export pipeline to config: $ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>
    3. [tech] Customize the config
    4. [tech] Spin up prompt engineering UI: $ kotaemon promptui run <path/to/config/file.yml>
    5. [non-tech] Change params, run inference
    6. [non-tech] Export to Excel
    7. [non-tech] Select the set of params that achieve the best output

    The prompt engineering UI is primarily involved in steps 2 through 7 (step 1 is normally done by the developers, while step 7 happens exclusively in the Excel file).

    "},{"location":"development/utilities/#step-2-export-pipeline-to-config","title":"Step 2 - Export pipeline to config","text":"

    Command:

    $ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>\n

    where:

    By default, all params in a pipeline (including nested params) will be exported to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those params with ignore_ui=True, and they will be ignored in the config generation process. Example:

    class Pipeline(BaseComponent):\n    param1: str = Param(default=\"hello\")\n    param2: str = Param(default=\"goodbye\", ignore_ui=True)\n

    Declared as above, param1 will show up in the config YAML file, while param2 will not.

    "},{"location":"development/utilities/#step-3-customize-the-config","title":"Step 3 - Customize the config","text":"

    Developers can further edit the config file in this step to get the UI (step 4) best suited to their tasks. The exported config will have this overall schema:

    <module.path.pipelineclass1>:\n  params: ... (Detailed param information to initialize a pipeline. This corresponds to the pipeline init parameters.)\n  inputs: ... (Details the input of the pipeline, e.g. a text prompt. This corresponds to the params of the `run(...)` method.)\n  outputs: ... (Details the output of the pipeline, e.g. prediction, accuracy... This is the output information we wish to see in the UI.)\n  logs: ... (Details what information should show up in the log.)\n
    "},{"location":"development/utilities/#input-and-params","title":"Input and params","text":"

    The inputs section has the following overall schema:

    inputs:\n  <input-variable-name-1>:\n    component: <supported-UI-component>\n    params: # this section is optional\n      value: <default-value>\n  <input-variable-name-2>: ... # similar to above\nparams:\n  <param-variable-name-1>: ... # similar to those in the inputs\n

    The list of supported promptui components and their corresponding Gradio UI components:

    COMPONENTS_CLASS = {\n    \"text\": gr.components.Textbox,\n    \"checkbox\": gr.components.CheckboxGroup,\n    \"dropdown\": gr.components.Dropdown,\n    \"file\": gr.components.File,\n    \"image\": gr.components.Image,\n    \"number\": gr.components.Number,\n    \"radio\": gr.components.Radio,\n    \"slider\": gr.components.Slider,\n}\n
    "},{"location":"development/utilities/#outputs","title":"Outputs","text":"

    The outputs are a list of variables that we wish to show in the UI. Since function outputs in Python don't have variable names, output declaration is slightly different from input and param declaration:

    outputs:\n  - component: <supported-UI-component>\n    step: <name-of-pipeline-step>\n    item: <jsonpath way to retrieve the info>\n  - ... # similar to above\n

    where:

    "},{"location":"development/utilities/#logs","title":"Logs","text":"

    The logs show a list of sheet names and how to retrieve the desired information.

    logs:\n  <logname>:\n    inputs:\n      - name: <column name>\n        step: <the pipeline step that we would wish to see the input>\n        variable: <the variable in the step>\n      - ...\n    outputs:\n      - name: <column name>\n        step: <the pipeline step that we would wish to see the output>\n        item: <how to retrieve the output of that step>\n
    "},{"location":"development/utilities/#step-4-5-spin-up-prompt-engineering-ui-perform-prompt-engineering","title":"Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering","text":"

    Command:

    $ kotaemon promptui run <path/to/config/file.yml>\n

    This will generate a UI as follows:

    where:

    "},{"location":"development/utilities/#step-6-export-to-excel","title":"Step 6 - Export to Excel","text":"

    Upon clicking Export, users can download an Excel file.

    "},{"location":"development/utilities/#chat-pipeline","title":"Chat pipeline","text":"

    The chat pipeline workflow is different from the simple pipeline workflow. In a simple pipeline, each Run creates a set of inputs, outputs, and params for users to compare. In a chat pipeline, each Run is not a one-off run but a long interactive session. Hence, the workflow is as follows:

    1. Set the desired parameters.
    2. Click "New chat" to start a chat session with the supplied parameters. This set of parameters will persist until the end of the chat session; during an ongoing chat session, changing the parameters will not take effect.
    3. Chat and interact with the chat bot on the right panel. You can add any additional input (if any), and they will be supplied to the chatbot.
    4. During chat, the log of the chat will show up in the \"Output\" tabs. This is empty by default, so if you want to show the log here, tell the AI developers to configure the UI settings.
    5. When finishing chat, select your preference in the radio box. Click \"End chat\". This will save the chat log and the preference to disk.
    6. To compare the results of different runs, click "Export" to get an Excel spreadsheet summarizing them.
    "},{"location":"pages/app/customize-flows/","title":"Customize flow logic","text":""},{"location":"pages/app/customize-flows/#add-new-indexing-and-reasoning-pipeline-to-the-application","title":"Add new indexing and reasoning pipeline to the application","text":"

    @trducng

    At a high level, to add a new indexing or reasoning pipeline:

    1. You define your indexing or reasoning pipeline as a class that subclasses BaseComponent.
    2. You declare that class in the settings file flowsettings.py.

    Then, when you run python app.py, the application will dynamically load those pipelines.

    The sections below discuss in more detail how the pipelines should be constructed.

    "},{"location":"pages/app/customize-flows/#define-a-pipeline-as-a-class","title":"Define a pipeline as a class","text":"

    In essence, a pipeline will subclass from kotaemon.base.BaseComponent. Each pipeline has 2 main parts:

    An example pipeline:

    from kotaemon.base import BaseComponent\n\n\nclass SoSimple(BaseComponent):\n    arg1: int\n    arg2: str\n\n    def run(self, arg3: str):\n        return self.arg1 * self.arg2 + arg3\n

    This pipeline is simple for demonstration purposes, but we can imagine pipelines with many more arguments, which can take other pipelines as arguments and have more complicated logic in the run method.
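    To make this concrete, here is a minimal usage sketch of the SoSimple pipeline above (recall that a component instance is callable like a function):

    pipeline = SoSimple(arg1=3, arg2="hi ")
    print(pipeline("there"))  # 3 * "hi " + "there" -> "hi hi hi there"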

    An indexing or reasoning pipeline is just a class that subclasses BaseComponent like the one above.

    For more detail on this topic, please refer to Creating a Component.

    "},{"location":"pages/app/customize-flows/#run-signatures","title":"Run signatures","text":"

    Note: this section is tentative at the moment. We will finalize the def run function signature by early April at the latest.

    The indexing pipeline:

        def run(\n        self,\n        file_paths: str | Path | list[str | Path],\n        reindex: bool = False,\n        **kwargs,\n    ):\n        \"\"\"Index files to intermediate representation (e.g. vector, database...)\n\n        Args:\n            file_paths: the list of paths to files\n            reindex: if True, files in `file_paths` that already exist in the database\n                should be reindexed.\n        \"\"\"\n

    The reasoning pipeline:

        def run(self, question: str, history: list, **kwargs) -> Document:\n        \"\"\"Answer the question\n\n        Args:\n            question: the user input\n            history: the chat history [(user_msg1, bot_msg1), (user_msg2, bot_msg2)...]\n\n        Returns:\n            kotaemon.base.Document: the final answer\n        \"\"\"\n
    "},{"location":"pages/app/customize-flows/#register-your-pipeline-to-ktem","title":"Register your pipeline to ktem","text":"

    To register your pipelines to ktem, you declare them in the flowsettings.py file. This file is located in the current working directory where you start ktem. In most use cases, it is this one.

    KH_REASONING = [\"<python.module.path.to.the.reasoning.class>\"]\n\nKH_INDEX = \"<python.module.path.to.the.indexing.class>\"\n

    You can register multiple reasoning pipelines to ktem by populating the KH_REASONING list. The user can select which reasoning pipeline to use in their Settings page.
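    For example, registering two reasoning pipelines might look like this (the module paths are hypothetical placeholders):

    # flowsettings.py
    KH_REASONING = [
        "my_app.reasoning.SimpleQAPipeline",   # hypothetical module path
        "my_app.reasoning.AgenticQAPipeline",  # hypothetical module path
    ]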

    For now, there's only one supported index option for KH_INDEX.

    Make sure that your class is discoverable by Python.

    "},{"location":"pages/app/customize-flows/#allow-users-to-customize-your-pipeline-in-the-app-settings","title":"Allow users to customize your pipeline in the app settings","text":"

    To allow the users to configure your pipeline, you need to declare what you allow the users to configure as a dictionary. ktem will include them into the application settings.

    In your pipeline class, add a classmethod get_user_settings that returns a settings dictionary and a classmethod get_info that returns an info dictionary. Example:

    class SoSimple(BaseComponent):\n\n    ... # as above\n\n    @classmethod\n    def get_user_settings(cls) -> dict:\n        \"\"\"The settings to the user\"\"\"\n        return {\n            \"setting_1\": {\n                \"name\": \"Human-friendly name\",\n                \"value\": \"Default value\",\n                \"choices\": [(\"Human-friendly Choice 1\", \"choice1-id\"), (\"HFC 2\", \"choice2-id\")], # optional\n                \"component\": \"Which Gradio UI component to render, can be: text, number, checkbox, dropdown, radio, checkboxgroup\"\n            },\n            \"setting_2\": {\n                # follow the same rule as above\n            }\n        }\n\n    @classmethod\n    def get_info(cls) -> dict:\n        \"\"\"Pipeline information for bookkeeping purpose\"\"\"\n        return {\n            \"id\": \"a unique id to differentiate this pipeline from other pipeline\",\n            \"name\": \"Human-friendly name of the pipeline\",\n            \"description\": \"Can be a short description of this pipeline\"\n        }\n

    Once you add these methods to your pipeline class, ktem will automatically extract and add them to the settings.

    "},{"location":"pages/app/customize-flows/#construct-to-pipeline-object","title":"Construct to pipeline object","text":"

    When ktem runs your pipeline, it will call your classmethod get_pipeline with the full user settings and expect to obtain the pipeline object. Within this get_pipeline method, you implement all the necessary logic to initialize the pipeline object. Example:

    class SoSimple(BaseComponent):\n    ... # as above\n\n    @classmethod\n    def get_pipeline(cls, setting):\n        obj = cls(arg1=setting[\"reasoning.id.setting1\"])\n        return obj\n
    "},{"location":"pages/app/customize-flows/#reasoning-stream-output-to-ui","title":"Reasoning: Stream output to UI","text":"

    For a responsive user experience, you can stream the output directly to the UI. This way, the user can start observing the output as soon as the LLM generates the first token, rather than waiting for the pipeline to finish before reading the whole message.

    To stream the output, you need to:

    1. Turn the run function into an async function.
    2. Pass the output to a special queue with self.report_output.
        async def run(self, question: str, history: list, **kwargs) -> Document:\n        for char in \"This is a long message\":\n            self.report_output({\"output\": char})\n

    The argument to self.report_output is a dictionary that contains either or both of these 2 keys: "output" and "evidence". The "output" string will be streamed to the chat message, and the "evidence" string will be streamed to the information panel.
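    Putting the pieces together, a minimal streaming pipeline might look like the sketch below (the token list stands in for real LLM output, and the evidence string stands in for a retriever's result):

    from kotaemon.base import BaseComponent, Document

    class StreamingQAPipeline(BaseComponent):
        async def run(self, question: str, history: list, **kwargs) -> Document:
            # stream the supporting evidence to the information panel
            self.report_output({"evidence": "A relevant passage ..."})  # placeholder
            answer = ""
            for token in ["This ", "is ", "streamed."]:  # stands in for LLM tokens
                answer += token
                self.report_output({"output": token})  # appended to the chat message
            return Document(text=answer)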

    "},{"location":"pages/app/customize-flows/#access-application-llms-embeddings","title":"Access application LLMs, Embeddings","text":"

    You can access users' collections of LLMs and embedding models with:

    from ktem.embeddings.manager import embeddings\nfrom ktem.llms.manager import llms\n\n\nllm = llms.get_default()\nembedding_model = embeddings.get_default()\n
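    As a quick sanity check, you can call the default LLM directly (a sketch; it assumes the user has configured a default model, and relies on the LLM returning an object with a .text attribute, as in the component example earlier):

    from ktem.llms.manager import llms

    llm = llms.get_default()
    response = llm("Summarize RAG in one sentence.")
    print(response.text)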

    You can also allow the users to select specifically which LLMs or embedding models they want to use through the settings.

        @classmethod\n    def get_user_settings(cls) -> dict:\n        from ktem.llms.manager import llms\n\n        return {\n            \"citation_llm\": {\n                \"name\": \"LLM for citation\",\n                \"value\": llms.get_default(),\n                \"component\": \"dropdown\",\n                \"choices\": list(llms.options().keys()),\n            },\n            ...\n        }\n
    "},{"location":"pages/app/customize-flows/#optional-access-application-data","title":"Optional: Access application data","text":"

    You can access the user's application database and vector store as follows:

    # get the database that contains the source files\nfrom ktem.db.models import Source, Index, Conversation, User\n\n# get the vector store\n
    "},{"location":"pages/app/features/","title":"Features","text":""},{"location":"pages/app/features/#chat","title":"Chat","text":"

    kotaemon focuses on question answering over a corpus of data. Below is a gentle introduction to the chat functionality.

    "},{"location":"pages/app/functional-description/","title":"Functional description","text":""},{"location":"pages/app/functional-description/#user-group-tenant-management","title":"User group / tenant management","text":""},{"location":"pages/app/functional-description/#create-new-user-group","title":"Create new user group","text":"

    (6 man-days)

    Description: each client has a dedicated user group. Each user group has an admin user who can perform administrative tasks (e.g. creating user accounts in that user group...). The workflow for creating a new user group is as follows:

    1. Cinnamon accesses the user group management UI.
    2. On the "Create user group" panel, we supply:
      • Client name: e.g. Apple.
      • Sub-domain name: e.g. apple.
      • Admin email, username & password.
    3. The system will:
      • Create an Aurora Platform deployment with the specified sub-domain.
      • Send an email to the admin with the username & password.

    Expectation:

    Condition:

    "},{"location":"pages/app/functional-description/#delete-user-group","title":"Delete user group","text":"

    (2 man-days)

    Description: in the tenant management page, we can delete the selected user group. The user flow is as follows:

    1. Cinnamon accesses the user group management UI.
    2. View the list of user groups.
    3. Next to the target user group, click Delete.
    4. Confirm whether to delete.
    5. If Yes, delete the user group. If No, cancel the operation.

    Expectation: when a user group is deleted, we expect to delete everything related to the user group: domain, files, databases, caches, deployments.

    "},{"location":"pages/app/functional-description/#user-management","title":"User management","text":""},{"location":"pages/app/functional-description/#create-user-account-for-admin-user","title":"Create user account (for admin user)","text":"

    (1 man-day)

    Description: the admin user in the client's account can create user accounts for that user group. To create a new user, the client admin does the following:

    1. Navigate to \"Admin\" > \"Users\"
    2. In the \"Create user\" panel, supply:
      • Username
      • Password
      • Confirm password
    3. Click \"Create\"

    Expectation:

    "},{"location":"pages/app/functional-description/#delete-user-account-for-admin-user","title":"Delete user account (for admin user)","text":"

    Description: the admin user in the client's account can delete user accounts. Once a user account is deleted, the user can no longer log in to the Aurora Platform.

    1. The admin user navigates to \"Admin\" > \"Users\".
    2. In the user list panel, next to the username, the admin clicks on the "Delete" button. The confirmation dialog appears.
    3. If "Delete", the user account is deleted. If "Cancel", nothing happens. The confirmation dialog disappears.

    Expectation:

    "},{"location":"pages/app/functional-description/#edit-user-account-for-admin-user","title":"Edit user account (for admin user)","text":"

    Description: the admin user can change any information about a user account, including the password. To change user information:

    1. The admin user navigates to \"Admin\" > \"Users\".
    2. In the user list panel, next to the username, the admin clicks on the "Edit" button.
    3. The user list disappears and the user detail appears, showing the following information:
      • Username: (prefilled the username)
      • Password: (blank)
      • Confirm password: (blank)
    4. The admin can edit any of the information, and click \"Save\" or \"Cancel\".
      • If "Save": the information will be updated in the database, or an error is shown per the Expectation below.
      • If "Cancel": skip.
    5. On a successful Save, or on Cancel, return to the user list UI, where the user information is updated accordingly.

    Expectation:

    "},{"location":"pages/app/functional-description/#sign-in","title":"Sign-in","text":"

    (3 man-days)

    Description: users can sign in to the Aurora Platform as follows:

    1. User navigates to the URL.
    2. If the user is not logged in, the UI just shows the login screen.
    3. User types username & password.
    4. If correct, the user proceeds to the normal working UI.
    5. If incorrect, the login screen shows a text error.
    "},{"location":"pages/app/functional-description/#sign-out","title":"Sign-out","text":"

    (1 man-day)

    Description: the user can sign out of the Aurora Platform as follows:

    1. User navigates to the Settings > User page.
    2. User clicks on Logout.
    3. The user is signed out and returned to the login screen.

    Expectation: the user is completely signed out. The next time he/she uses the Aurora Platform, he/she has to log in again.

    "},{"location":"pages/app/functional-description/#change-password","title":"Change password","text":"

    Description: the user can change their password as follows:

    1. User navigates to the Settings > User page.
    2. In the change password section, the user provides the following info and clicks Change:
      • Current password
      • New password
      • Confirm new password
    3. If the change succeeds, the password is updated. Otherwise, the error is shown on the UI.

    Expectation:

    "},{"location":"pages/app/functional-description/#chat","title":"Chat","text":""},{"location":"pages/app/functional-description/#chat-to-the-bot","title":"Chat to the bot","text":"

    Description: the Aurora Platform focuses on question answering over the uploaded data. Each chat has the following components:

    The chat workflow looks as follows:

    1. [Optional] User selects the files they want to scope as context for the bot. If the user doesn't select any files, then all files on the Aurora Platform will be the context for the bot.
      • The user can type multi-line messages, using \"Shift + Enter\" for line-break.
    2. User sends the message (either clicking the Send button or hitting the Enter key).
    3. The bot in the chat conversation will return \"Thinking...\" while it processes.
    4. The information panel on the right begins to show data related to the user message.
    5. The bot begins to generate the answer. The "Thinking..." placeholder disappears.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-switch","title":"Conversation - switch","text":"

    Description: users can jump between different conversations. They can see the list of all conversations, select an old conversation, and continue the chat in the context of that old conversation. The switching workflow is as follows:

    1. Users click on the conversation dropdown. It will show a list of conversations.
    2. Within that dropdown, the user selects one conversation.
    3. The chat messages, information panel, and selected data will show the content in that old chat.
    4. The user can continue chatting as normal under the context of this old chat.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-create","title":"Conversation - create","text":"

    Description: the user can explicitly start a new conversation with the chatbot:

    1. User clicks on the "New" button.
    2. The new conversation is automatically created.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-rename","title":"Conversation - rename","text":"

    Description: the user can rename the conversation by typing the new name and clicking the Rename button next to it.

    Condition:

    "},{"location":"pages/app/functional-description/#conversation-delete","title":"Conversation - delete","text":"

    Description: the user can delete an existing conversation as follows:

    1. Click on the Delete button.
    2. The UI shows a confirmation with 2 buttons:
      • Delete
      • Cancel.
    3. If Delete, delete the conversation, switch to the next oldest conversation, and close the confirmation panel.
    4. If Cancel, just close the confirmation panel.
    "},{"location":"pages/app/functional-description/#file-management","title":"File management","text":"

    File management allows users to upload, list, and delete the files that they upload to the Aurora Platform.

    "},{"location":"pages/app/functional-description/#upload-file","title":"Upload file","text":"

    Description: the user can upload files to the Aurora Platform. The uploaded files will serve as context for our chatbot to refer to when it converses with the user. To upload files, the user:

    1. Navigate to the File tab.
    2. Within the File tab, there is an Upload section.
    3. User can add files to the Upload section through drag & drop, or by clicking on the file browser.
    4. User can select some options related to uploading and indexing. Depending on the project, these options can differ; they are discussed below.
    5. User clicks on the "Upload and Index" button.
    6. The app shows notifications in the top right corner when indexing starts and finishes, and when errors occur.

    Options:

    Condition:

    "},{"location":"pages/app/functional-description/#list-all-files","title":"List all files","text":"

    Description: the user can see which files are on the system by:

    1. Navigate to the File tab.
    2. By default, it will show all the uploaded files, each with the following information: file name, file size, number of pages, and uploaded date.
    3. The UI also shows the total number of pages and the total size in MB.
    "},{"location":"pages/app/functional-description/#delete-file","title":"Delete file","text":"

    Description: users can delete files from this UI to free up space or to remove outdated information. To remove files:

    1. User navigates to the File tab.
    2. In the file list, next to each file, there is a Delete button.
    3. The user clicks on the Delete button. A confirmation dialog appears.
    4. If Delete, delete the file. If Cancel, close the confirmation dialog.

    Expectation: once the file is deleted:

    "},{"location":"pages/app/ext/user-management/","title":"User management","text":"

    ktem provides user management as an extension. To enable user management, set the following variables in your flowsettings.py:

    Once enabled, you have access to the following features:

    "},{"location":"pages/app/index/file/","title":"File index","text":"

    The file index stores files in a local folder and indexes them for retrieval. This file index provides the following infrastructure to support the indexing:

    The indexing and retrieval pipelines are encouraged to use the above software infrastructure.

    "},{"location":"pages/app/index/file/#indexing-pipeline","title":"Indexing pipeline","text":"

    ktem has a default indexing pipeline: ktem.index.file.pipelines.IndexDocumentPipeline.

    This default pipeline works as follows:

    You can customize this default pipeline if your indexing process is close to the default one, or create your own indexing pipeline if the logic differs too much.

    "},{"location":"pages/app/index/file/#customize-the-default-pipeline","title":"Customize the default pipeline","text":"

    The default pipeline provides the following contact points in flowsettings.py:

    1. FILE_INDEX_PIPELINE_FILE_EXTRACTORS. Overriding file extractors, keyed by file extension. Example: {".pdf": "path.to.PDFReader", ".xlsx": "path.to.ExcelReader"}
    2. FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE. The expected number of characters in each text segment. Example: 1024.
    3. FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP. The expected number of characters that consecutive text segments should overlap. Example: 256.
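    Put together, these contact points might look like this in flowsettings.py (the values are taken from the examples above; the reader paths are placeholders):

    # flowsettings.py
    FILE_INDEX_PIPELINE_FILE_EXTRACTORS = {
        ".pdf": "path.to.PDFReader",     # placeholder dotted path
        ".xlsx": "path.to.ExcelReader",  # placeholder dotted path
    }
    FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE = 1024    # characters per segment
    FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP = 256  # overlap between segments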
    "},{"location":"pages/app/index/file/#create-your-own-indexing-pipeline","title":"Create your own indexing pipeline","text":"

    Your indexing pipeline will subclass BaseFileIndexIndexing.

    You should define the following methods:

    By subclassing BaseFileIndexIndexing, you will have access to the following resources:

    Once you have prepared your pipeline, register it in flowsettings.py: FILE_INDEX_PIPELINE = \"<python.path.to.your.pipeline>\".
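    A minimal skeleton might look like this (the import path for BaseFileIndexIndexing is an assumption, and the run signature reuses the indexing signature shown earlier):

    from pathlib import Path

    from ktem.index.file import BaseFileIndexIndexing  # assumed import path

    class MyIndexingPipeline(BaseFileIndexIndexing):
        def run(
            self,
            file_paths: str | Path | list[str | Path],
            reindex: bool = False,
            **kwargs,
        ):
            """Read, split, embed, and store the given files."""
            ...  # your custom indexing logic

    # flowsettings.py
    FILE_INDEX_PIPELINE = "my_app.indexing.MyIndexingPipeline"  # hypothetical path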

    "},{"location":"pages/app/index/file/#retrieval-pipeline","title":"Retrieval pipeline","text":"

    ktem has a default retrieval pipeline: ktem.index.file.pipelines.DocumentRetrievalPipeline. This pipeline works as follows:

    "},{"location":"pages/app/index/file/#create-your-own-retrieval-pipeline","title":"Create your own retrieval pipeline","text":"

    Your retrieval pipeline will subclass BaseFileIndexRetriever. The retriever has the same database, vectorstore, and docstore access as the indexing pipeline.

    You should define the following methods:

    Once you build the retrieval pipeline class, you can register it in flowsettings.py: FILE_INDEXING_RETRIEVER_PIPELINES = ["path.to.retrieval.pipeline"]. Because there can be multiple parallel pipelines within an index, this variable takes a list of strings rather than a single string.
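    A corresponding skeleton might look like this (both the import path and the run signature are assumptions, since the required method list is not reproduced here):

    from ktem.index.file import BaseFileIndexRetriever  # assumed import path

    class MyRetriever(BaseFileIndexRetriever):
        def run(self, text: str, **kwargs):
            """Embed the query, search self._VS, then fetch segments from self._DS."""
            ...  # your custom retrieval logic

    # flowsettings.py
    FILE_INDEXING_RETRIEVER_PIPELINES = ["my_app.retrieval.MyRetriever"]  # hypothetical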

    "},{"location":"pages/app/index/file/#software-infrastructure","title":"Software infrastructure","text":"

    | Infra | Access | Schema | Ref |
    | --- | --- | --- | --- |
    | SQL table Source | self._Source | id (int): id of the source (auto); name (str): the name of the file; path (str): the path of the file; size (int): the file size in bytes; note (dict): extra optional information about the file; date_created (datetime): the time the file was created (auto) | This is a SQLAlchemy ORM class. Can consult |
    | SQL table Index | self._Index | id (int): id of the index entry (auto); source_id (int): the id of a file in the Source table; target_id: the id of the segment in the docstore or vector store; relation_type (str): whether the link is "document" or "vector" | This is a SQLAlchemy ORM class |
    | Vector store | self._VS | self._VS.add: add a list of embeddings to the vector store (optionally with associated metadata and ids); self._VS.delete: delete vector entries based on ids; self._VS.query: get embeddings based on embeddings | kotaemon > storages > vectorstores > BaseVectorStore |
    | Doc store | self._DS | self._DS.add: add segments to the document store; self._DS.get: get segments based on id; self._DS.get_all: get all segments; self._DS.delete: delete segments based on id | kotaemon > storages > docstores > base > BaseDocumentStore |

    "},{"location":"pages/app/settings/overview/","title":"Settings","text":""},{"location":"pages/app/settings/overview/#overview","title":"Overview","text":"

    There are 3 kinds of settings in ktem, geared towards different stakeholders for different use cases:

    "},{"location":"pages/app/settings/user-settings/","title":"User settings","text":""},{"location":"pages/app/settings/user-settings/#user-settings","title":"User settings","text":"

    ktem allows developers to extend the index and the reasoning pipeline. In many cases, these components can have settings that should be modified by users at run-time (e.g. topk, chunksize...). These are the user settings.

    ktem allows developers to declare such user settings in their code. Once declared, ktem will render them in a Settings page.

    There are 2 places that ktem looks for declared user settings. You can refer to the respective pages.

    "},{"location":"pages/app/settings/user-settings/#syntax-of-a-settings","title":"Syntax of a settings","text":"

    A collection of settings is a dictionary of type dict[str, dict], where the key is a setting id, and the value is the description of the setting.

    settings = {\n    \"topk\": {\n        \"name\": \"Top-k chunks\",\n        \"value\": 10,\n        \"component\": \"number\",\n    },\n    \"lang\": {\n        \"name\": \"Languages\",\n        \"value\": \"en\",\n        \"component\": \"dropdown\",\n        \"choices\": [(\"en\", \"English\"), (\"cn\", \"Chinese\")],\n    }\n}\n

    Each setting description must have:

    "},{"location":"pages/app/settings/user-settings/#settings-page-structure","title":"Settings page structure","text":""},{"location":"reference/Summary/","title":"Summary","text":""},{"location":"reference/cli/","title":"CLI","text":""},{"location":"reference/cli/#cli.export","title":"export","text":"
    export(export_path, output)\n

    Export a pipeline to a config file

    Source code in libs/kotaemon/kotaemon/cli.py
    @promptui.command()\n@click.argument(\"export_path\", nargs=1)\n@click.option(\"--output\", default=\"promptui.yml\", show_default=True, required=False)\ndef export(export_path, output):\n    \"\"\"Export a pipeline to a config file\"\"\"\n    import sys\n\n    from theflow.utils.modules import import_dotted_string\n\n    from kotaemon.contribs.promptui.config import export_pipeline_to_config\n\n    sys.path.append(os.getcwd())\n    cls = import_dotted_string(export_path, safe=False)\n    export_pipeline_to_config(cls, output)\n    check_config_format(output)\n
    "},{"location":"reference/cli/#cli.run","title":"run","text":"
    run(run_path, share, username, password, appname, port)\n

    Run the UI from a config file

    Examples:

    \n# Run with default config file\n$ kh promptui run\n\n\n# Run with username and password supplied\n$ kh promptui run --username admin --password password\n\n\n# Run with username and prompted password\n$ kh promptui run --username admin\n\n# Run and share to promptui\n# kh promptui run --username admin --password password --share --appname hey                 --port 7861\n
    Source code in libs/kotaemon/kotaemon/cli.py
    @promptui.command()\n@click.argument(\"run_path\", required=False, default=\"promptui.yml\")\n@click.option(\n    \"--share\",\n    is_flag=True,\n    show_default=True,\n    default=False,\n    help=\"Share the app through Gradio. Requires --username to enable authentication.\",\n)\n@click.option(\n    \"--username\",\n    required=False,\n    help=(\n        \"Username for the user. If not provided, the promptui will not have \"\n        \"authentication.\"\n    ),\n)\n@click.option(\n    \"--password\",\n    required=False,\n    help=\"Password for the user. If not provided, will be prompted.\",\n)\n@click.option(\n    \"--appname\",\n    required=False,\n    help=\"The share app subdomain. Requires --share and --username\",\n)\n@click.option(\n    \"--port\",\n    required=False,\n    help=\"Port to run the app. If not provided, will $GRADIO_SERVER_PORT (7860)\",\n)\ndef run(run_path, share, username, password, appname, port):\n    \"\"\"Run the UI from a config file\n\n    Examples:\n\n        \\b\n        # Run with default config file\n        $ kh promptui run\n\n        \\b\n        # Run with username and password supplied\n        $ kh promptui run --username admin --password password\n\n        \\b\n        # Run with username and prompted password\n        $ kh promptui run --username admin\n\n        # Run and share to promptui\n        # kh promptui run --username admin --password password --share --appname hey \\\n                --port 7861\n    \"\"\"\n    import sys\n\n    from kotaemon.contribs.promptui.ui import build_from_dict\n\n    sys.path.append(os.getcwd())\n\n    check_config_format(run_path)\n    demo = build_from_dict(run_path)\n\n    params: dict = {}\n    if username is not None:\n        if password is not None:\n            auth = (username, password)\n        else:\n            auth = (username, click.prompt(\"Password\", hide_input=True))\n        params[\"auth\"] = auth\n\n    port = int(port) if port else int(os.getenv(\"GRADIO_SERVER_PORT\", \"7860\"))\n    params[\"server_port\"] = port\n\n    if share:\n        if username is None:\n            raise ValueError(\n                \"Username must be provided to enable authentication for sharing\"\n            )\n        if appname:\n            from kotaemon.contribs.promptui.tunnel import Tunnel\n\n            tunnel = Tunnel(\n                appname=str(appname), username=str(username), local_port=port\n            )\n            url = tunnel.run()\n            print(f\"App is shared at {url}\")\n        else:\n            params[\"share\"] = True\n            print(\"App is shared at Gradio\")\n\n    demo.launch(**params)\n
    "},{"location":"reference/cli/#cli.makedoc","title":"makedoc","text":"
    makedoc(module, output, separation_level)\n

    Make documentation for module module

    Example:

    \n# Make component documentation for kotaemon library\n$ kh makedoc kotaemon\n
    Source code in libs/kotaemon/kotaemon/cli.py
    @main.command()\n@click.argument(\"module\", required=True)\n@click.option(\n    \"--output\", default=\"docs.md\", required=False, help=\"The output markdown file\"\n)\n@click.option(\n    \"--separation-level\", required=False, default=1, help=\"Organize markdown layout\"\n)\ndef makedoc(module, output, separation_level):\n    \"\"\"Make documentation for module `module`\n\n    Example:\n\n        \\b\n        # Make component documentation for kotaemon library\n        $ kh makedoc kotaemon\n    \"\"\"\n    from kotaemon.contribs.docs import make_doc\n\n    make_doc(module, output, separation_level)\n    print(f\"Documentation exported to {output}\")\n
    "},{"location":"reference/cli/#cli.start_project","title":"start_project","text":"
    start_project(template)\n

    Start a project from a template.

    Important: the value for --template corresponds to the name of the template folder, which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates. The default value is "project-default", which should work when you are starting a client project.

    Source code in libs/kotaemon/kotaemon/cli.py
    @main.command()\n@click.option(\n    \"--template\",\n    default=\"project-default\",\n    required=False,\n    help=\"Template name\",\n    show_default=True,\n)\ndef start_project(template):\n    \"\"\"Start a project from a template.\n\n    Important: the value for --template corresponds to the name of the template folder,\n    which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates\n    The default value is \"project-default\", which should work when you are starting a\n    client project.\n    \"\"\"\n\n    print(\"Retrieving template...\")\n    os.system(\n        \"cookiecutter git@github.com:Cinnamon/kotaemon.git \"\n        f\"--directory='templates/{template}'\"\n    )\n
    "},{"location":"reference/agents/","title":"Agents","text":""},{"location":"reference/agents/#agents.BaseAgent","title":"BaseAgent","text":"

    Bases: BaseComponent

    Define base agent interface

    Source code in libs/kotaemon/kotaemon/agents/base.py
    class BaseAgent(BaseComponent):\n    \"\"\"Define base agent interface\"\"\"\n\n    name: str = Param(help=\"Name of the agent.\")\n    agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n    description: str = Param(\n        help=(\n            \"Description used to tell the model how/when/why to use the agent. You can\"\n            \" provide few-shot examples as a part of the description. This will be\"\n            \" input to the prompt of LLM.\"\n        )\n    )\n    llm: Optional[BaseLLM] = Node(\n        help=(\n            \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n            \" interface.\"\n        )\n    )\n    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n        help=\"A prompt template or a dict to supply different prompt to the agent\"\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [],\n        help=\"List of plugins / tools to be used in the agent\",\n    )\n\n    @staticmethod\n    def safeguard_run(run_func, *args, **kwargs):\n        def wrapper(self, *args, **kwargs):\n            try:\n                return run_func(self, *args, **kwargs)\n            except Exception as e:\n                return AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"failed\",\n                    error=str(e),\n                )\n\n        return wrapper\n\n    def add_tools(self, tools: list[BaseTool]) -> None:\n        \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n        self.plugins.extend(tools)\n\n    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n        \"\"\"Run the component.\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"reference/agents/#agents.BaseAgent.add_tools","title":"add_tools","text":"
    add_tools(tools)\n

    Helper method to add tools and update agent state if needed

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def add_tools(self, tools: list[BaseTool]) -> None:\n    \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n    self.plugins.extend(tools)\n
    "},{"location":"reference/agents/#agents.BaseAgent.run","title":"run","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n    \"\"\"Run the component.\"\"\"\n    raise NotImplementedError()\n
    "},{"location":"reference/agents/#agents.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | return_values | dict | The return values of the agent. | required |
    | log | str | The log message. | required |

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
    "},{"location":"reference/agents/#agents.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | text | str | The text output from the agent. | required |
    | agent_type | AgentType | The type of agent. | required |
    | status | Literal["thinking", "finished", "stopped", "failed"] | The status after executing the agent. | required |
    | error | Optional[str] | The error message if any. | required |

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
    "},{"location":"reference/agents/#agents.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/#agents.BaseScratchPad--attributes","title":"Attributes:","text":"

    logger : logging.Logger The logger object to log messages.

    "},{"location":"reference/agents/#agents.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | item | Any | The item to log. | required |
    | title | str, optional | The title of the panel, defaults to "Output". | 'Output' |
    | stream | bool, optional |  | False |

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.LangchainAgent","title":"LangchainAgent","text":"

    Bases: BaseAgent

    Wrapper for Langchain Agent

    Source code in libs/kotaemon/kotaemon/agents/langchain_based.py
    class LangchainAgent(BaseAgent):\n    \"\"\"Wrapper for Langchain Agent\"\"\"\n\n    name: str = \"LangchainAgent\"\n    agent_type: AgentType\n    description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n    AGENT_TYPE_MAP = {\n        AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n        AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n        AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n        AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n    }\n    agent: Optional[LCAgentExecutor] = None\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        if self.agent_type not in self.AGENT_TYPE_MAP:\n            raise NotImplementedError(\n                f\"AgentType {self.agent_type} not supported by Langchain wrapper\"\n            )\n        self.update_agent_tools()\n\n    def update_agent_tools(self):\n        assert isinstance(self.llm, (ChatLLM, LLM))\n        langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n        # a fix for search_doc tool name:\n        # use \"Intermediate Answer\" for self-ask agent\n        found_search_tool = False\n        if self.agent_type == AgentType.self_ask:\n            for plugin in langchain_plugins:\n                if plugin.name == \"search_doc\":\n                    plugin.name = \"Intermediate Answer\"\n                    langchain_plugins = [plugin]\n                    found_search_tool = True\n                    break\n\n        if self.agent_type != AgentType.self_ask or found_search_tool:\n            # reinit Langchain AgentExecutor\n            self.agent = initialize_agent(\n                langchain_plugins,\n                self.llm.to_langchain_format(),\n                agent=self.AGENT_TYPE_MAP[self.agent_type],\n                handle_parsing_errors=True,\n                verbose=True,\n            )\n\n    def add_tools(self, tools: List[BaseTool]) -> None:\n        super().add_tools(tools)\n        self.update_agent_tools()\n        return\n\n    def run(self, instruction: str) -> AgentOutput:\n        assert (\n            self.agent is not None\n        ), \"Langchain AgentExecutor is not correctly initialized\"\n\n        # Langchain AgentExecutor call\n        output = self.agent(instruction)[\"output\"]\n\n        return AgentOutput(\n            text=output,\n            agent_type=self.agent_type,\n            status=\"finished\",\n        )\n
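    A usage sketch follows; the import paths and the ChatOpenAI wrapper are assumptions (any ChatLLM/LLM implementation should satisfy the assert in update_agent_tools), and the model name is hypothetical.

    from kotaemon.agents import AgentType, LangchainAgent  # assumed import paths
    from kotaemon.llms import ChatOpenAI                    # assumed ChatLLM implementation

    agent = LangchainAgent(
        llm=ChatOpenAI(model="gpt-4o-mini"),  # hypothetical model name
        agent_type=AgentType.react,           # must be a key of AGENT_TYPE_MAP
        plugins=[],                           # BaseTool instances to expose to the agent
    )
    result = agent.run("What is 2 + 2?")
    print(result.text, result.status)         # e.g. "4 finished"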
    "},{"location":"reference/agents/#agents.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

    Sequential ReactAgent class inherited from BaseAgent, implementing the ReAct agent paradigm: https://arxiv.org/pdf/2210.03629.pdf

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
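    A minimal run sketch, assuming the same hypothetical ChatOpenAI wrapper as above and the WikipediaTool documented later on this page; import paths are assumptions.

    from kotaemon.agents import ReactAgent            # assumed import path
    from kotaemon.agents.tools import WikipediaTool   # assumed import path
    from kotaemon.llms import ChatOpenAI              # assumed BaseLLM implementation

    agent = ReactAgent(
        llm=ChatOpenAI(model="gpt-4o-mini"),  # hypothetical model name
        plugins=[WikipediaTool()],
        max_iterations=5,
    )
    output = agent.run("Who proposed the ReAct prompting paradigm?")
    print(output.status)  # "finished", or "stopped" if max_iterations was exhausted
    for action, observation in output.intermediate_steps:
        print(type(action).__name__, str(observation)[:80])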
    "},{"location":"reference/agents/#agents.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/#agents.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

    Parameters:

    - instruction: Instruction to run the agent with. (required)
    - max_iterations: Maximum number of iterations of reasoning steps; when None, falls back to the agent's max_iterations (default 5). (default: None)

    Returns: an AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
    "},{"location":"reference/agents/#agents.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

    Parameters:

    - instruction: Instruction to run the agent with. (required)
    - max_iterations: Maximum number of iterations of reasoning steps; when None, falls back to the agent's max_iterations (default 5). (default: None)

    Returns: AgentOutput objects, yielded incrementally as the agent progresses.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
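    Continuing the ReactAgent sketch above, consuming the stream generator might look like this:

    for chunk in agent.stream("Who proposed the ReAct prompting paradigm?"):
        if chunk.status == "thinking":
            pass  # intermediate step; inspect chunk.intermediate_steps to render progress
        else:
            print(chunk.text)  # final answer when status is "finished" (empty if "stopped")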
    "},{"location":"reference/agents/#agents.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class inherited from BaseAgent, implementing the ReWOO paradigm: https://arxiv.org/pdf/2305.18323.pdf

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/#agents.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/#agents.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/#agents.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. 
Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
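    For illustration, a minimal concrete tool; EchoTool is hypothetical, and only _run_tool needs to be overridden. The import path is an assumption based on the source location above.

    from kotaemon.agents.tools import BaseTool  # assumed import path

    class EchoTool(BaseTool):
        name: str = "echo"
        description: str = "Return the input text unchanged. Input should be plain text."

        def _run_tool(self, query: str) -> str:
            return query

    tool = EchoTool()
    print(tool.run("hello"))  # run() parses the input, then dispatches to _run_tool -> "hello"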
    "},{"location":"reference/agents/#agents.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/#agents.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description. This will be input to the prompt of the LLM.

    "},{"location":"reference/agents/#agents.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/#agents.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/#agents.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    Handle the content of the ToolException thrown.

    "},{"location":"reference/agents/#agents.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to Langchain format to use with its agent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/#agents.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/#agents.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
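    A round-trip sketch with a Langchain tool, assuming the langchain package is installed; the tool itself is hypothetical.

    from langchain.tools import Tool as LCTool

    lc_tool = LCTool(name="echo", description="Echo the input.", func=lambda q: q)
    kt_tool = BaseTool.from_langchain_format(lc_tool)
    print(kt_tool.run("hi"))                  # "hi", dispatched through the wrapped _run
    lc_again = kt_tool.to_langchain_format()  # back to a Langchain Tool backed by kt_tool.run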
    "},{"location":"reference/agents/#agents.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent to use it as a tool

    Parameters:

    - component: BaseComponent-based component to wrap. (required)
    - postprocessor: Optional postprocessor for the component output. (required)

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
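    For example, wrapping an LLM component as a tool; this is a sketch in which ChatOpenAI stands in for any BaseComponent, and the tool name, description, and model name are hypothetical.

    from kotaemon.agents.tools import ComponentTool  # assumed import path
    from kotaemon.llms import ChatOpenAI             # assumed BaseComponent implementation

    summarizer = ComponentTool(
        name="summarize",
        description="Summarize a passage of text. Input should be the raw text.",
        component=ChatOpenAI(model="gpt-4o-mini"),  # hypothetical model name
        postprocessor=lambda resp: resp.text,       # keep only the text of the LLM response
    )
    print(summarizer.run("Photosynthesis is the process by which plants convert light into energy."))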
    "},{"location":"reference/agents/#agents.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
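    Usage is a single call; this sketch assumes network access to the Wikipedia API.

    wiki = WikipediaTool()
    evidence = wiki.run("Ada Lovelace")  # lazily creates the Wiki() doc store, then searches
    print(evidence)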
    "},{"location":"reference/agents/base/","title":"Base","text":""},{"location":"reference/agents/base/#agents.base.BaseAgent","title":"BaseAgent","text":"

    Bases: BaseComponent

    Define base agent interface

    Source code in libs/kotaemon/kotaemon/agents/base.py
    class BaseAgent(BaseComponent):\n    \"\"\"Define base agent interface\"\"\"\n\n    name: str = Param(help=\"Name of the agent.\")\n    agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n    description: str = Param(\n        help=(\n            \"Description used to tell the model how/when/why to use the agent. You can\"\n            \" provide few-shot examples as a part of the description. This will be\"\n            \" input to the prompt of LLM.\"\n        )\n    )\n    llm: Optional[BaseLLM] = Node(\n        help=(\n            \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n            \" interface.\"\n        )\n    )\n    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n        help=\"A prompt template or a dict to supply different prompt to the agent\"\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [],\n        help=\"List of plugins / tools to be used in the agent\",\n    )\n\n    @staticmethod\n    def safeguard_run(run_func, *args, **kwargs):\n        def wrapper(self, *args, **kwargs):\n            try:\n                return run_func(self, *args, **kwargs)\n            except Exception as e:\n                return AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"failed\",\n                    error=str(e),\n                )\n\n        return wrapper\n\n    def add_tools(self, tools: list[BaseTool]) -> None:\n        \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n        self.plugins.extend(tools)\n\n    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n        \"\"\"Run the component.\"\"\"\n        raise NotImplementedError()\n
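    A minimal concrete agent, for illustration; EchoAgent is hypothetical, and the import paths are assumptions.

    from kotaemon.agents import BaseAgent, AgentType  # assumed import paths
    from kotaemon.agents.io import AgentOutput        # assumed import path

    class EchoAgent(BaseAgent):
        name: str = "EchoAgent"
        agent_type: AgentType = AgentType.react  # any AgentType member
        description: str = "Echo the instruction back to the caller."

        def run(self, instruction: str) -> AgentOutput:
            return AgentOutput(
                text=instruction,
                agent_type=self.agent_type,
                status="finished",
            )

    print(EchoAgent().run("ping").text)  # "ping"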
    "},{"location":"reference/agents/base/#agents.base.BaseAgent.add_tools","title":"add_tools","text":"
    add_tools(tools)\n

    Helper method to add tools and update agent state if needed

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def add_tools(self, tools: list[BaseTool]) -> None:\n    \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n    self.plugins.extend(tools)\n
    "},{"location":"reference/agents/base/#agents.base.BaseAgent.run","title":"run","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n    \"\"\"Run the component.\"\"\"\n    raise NotImplementedError()\n
    "},{"location":"reference/agents/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/agents/langchain_based/#agents.langchain_based.LangchainAgent","title":"LangchainAgent","text":"

    Bases: BaseAgent

    Wrapper for Langchain Agent

    Source code in libs/kotaemon/kotaemon/agents/langchain_based.py
    class LangchainAgent(BaseAgent):\n    \"\"\"Wrapper for Langchain Agent\"\"\"\n\n    name: str = \"LangchainAgent\"\n    agent_type: AgentType\n    description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n    AGENT_TYPE_MAP = {\n        AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n        AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n        AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n        AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n    }\n    agent: Optional[LCAgentExecutor] = None\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        if self.agent_type not in self.AGENT_TYPE_MAP:\n            raise NotImplementedError(\n                f\"AgentType {self.agent_type} not supported by Langchain wrapper\"\n            )\n        self.update_agent_tools()\n\n    def update_agent_tools(self):\n        assert isinstance(self.llm, (ChatLLM, LLM))\n        langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n        # a fix for search_doc tool name:\n        # use \"Intermediate Answer\" for self-ask agent\n        found_search_tool = False\n        if self.agent_type == AgentType.self_ask:\n            for plugin in langchain_plugins:\n                if plugin.name == \"search_doc\":\n                    plugin.name = \"Intermediate Answer\"\n                    langchain_plugins = [plugin]\n                    found_search_tool = True\n                    break\n\n        if self.agent_type != AgentType.self_ask or found_search_tool:\n            # reinit Langchain AgentExecutor\n            self.agent = initialize_agent(\n                langchain_plugins,\n                self.llm.to_langchain_format(),\n                agent=self.AGENT_TYPE_MAP[self.agent_type],\n                handle_parsing_errors=True,\n                verbose=True,\n            )\n\n    def add_tools(self, tools: List[BaseTool]) -> None:\n        super().add_tools(tools)\n        self.update_agent_tools()\n        return\n\n    def run(self, instruction: str) -> AgentOutput:\n        assert (\n            self.agent is not None\n        ), \"Langchain AgentExecutor is not correctly initialized\"\n\n        # Langchain AgentExecutor call\n        output = self.agent(instruction)[\"output\"]\n\n        return AgentOutput(\n            text=output,\n            agent_type=self.agent_type,\n            status=\"finished\",\n        )\n
    "},{"location":"reference/agents/utils/","title":"Utils","text":""},{"location":"reference/agents/utils/#agents.utils.get_plugin_response_content","title":"get_plugin_response_content","text":"
    get_plugin_response_content(output)\n

Wrapper for returning the content of a plugin response for AgentOutput: Document outputs yield their text, anything else is stringified.

    Source code in libs/kotaemon/kotaemon/agents/utils.py
    def get_plugin_response_content(output) -> str:\n    \"\"\"\n    Wrapper for AgentOutput content return\n    \"\"\"\n    if isinstance(output, Document):\n        return output.text\n    else:\n        return str(output)\n
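For example (the import location of Document is an assumption about the package layout):

from kotaemon.agents.utils import get_plugin_response_content
from kotaemon.base import Document   # import location assumed

print(get_plugin_response_content(Document(text="tool said hi")))   # "tool said hi"
print(get_plugin_response_content({"answer": 42}))                   # "{'answer': 42}"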
    "},{"location":"reference/agents/utils/#agents.utils.calculate_cost","title":"calculate_cost","text":"
    calculate_cost(model_name, prompt_token, completion_token)\n

    Calculate the cost of a prompt and completion.

Returns:

float: Cost of the provided model name with the provided token information.

    Source code in libs/kotaemon/kotaemon/agents/utils.py
    def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float:\n    \"\"\"\n    Calculate the cost of a prompt and completion.\n\n    Returns:\n        float: Cost of the provided model name with provided token information\n    \"\"\"\n    # TODO: to be implemented\n    return 0.0\n
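The body is currently a TODO stub that always returns 0.0. If it were filled in, one plausible shape is a per-model price table; the sketch below is purely hypothetical, with invented model names and prices.

# Hypothetical sketch only; model names and prices are made up.
PRICE_PER_1K = {
    "model-a": (0.001, 0.002),   # (prompt, completion) USD per 1K tokens
    "model-b": (0.010, 0.030),
}

def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float:
    prompt_price, completion_price = PRICE_PER_1K.get(model_name, (0.0, 0.0))
    return (prompt_token * prompt_price + completion_token * completion_price) / 1000

# e.g. calculate_cost("model-a", 1000, 500) -> 0.002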
    "},{"location":"reference/agents/io/","title":"Io","text":""},{"location":"reference/agents/io/#agents.io.AgentAction","title":"AgentAction dataclass","text":"

    Agent's action to take.

Parameters:

tool (str, required): The tool to invoke.

tool_input (Union[str, dict], required): The input to the tool.

log (str, required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    @dataclass\nclass AgentAction:\n    \"\"\"Agent's action to take.\n\n    Args:\n        tool: The tool to invoke.\n        tool_input: The input to the tool.\n        log: The log message.\n    \"\"\"\n\n    tool: str\n    tool_input: Union[str, dict]\n    log: str\n
    "},{"location":"reference/agents/io/#agents.io.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

Parameters:

return_values (required): The return values of the agent.

log (required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
    "},{"location":"reference/agents/io/#agents.io.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

Parameters:

text (required): The text output from the agent.

agent_type (required): The type of agent.

status (required): The status after executing the agent.

error (required): The error message if any.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
    "},{"location":"reference/agents/io/#agents.io.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
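Putting these IO types together, a small round-trip sketch (the values are illustrative; the import path follows the module location above):

from kotaemon.agents.io import AgentAction, AgentFinish, AgentOutput, AgentType

# One reasoning step: the agent picks a tool, then finishes with an answer.
step = AgentAction(
    tool="search_doc",
    tool_input="what is RAG?",
    log="Thought: I should look this up.",
)
finish = AgentFinish(
    return_values={"output": "RAG stands for retrieval-augmented generation."},
    log="Final Answer: RAG stands for retrieval-augmented generation.",
)
result = AgentOutput(
    text=finish.return_values["output"],
    agent_type=AgentType.react,
    status="finished",
    intermediate_steps=[(step, "observed tool output")],
)
print(result.text, result.status)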
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad--attributes","title":"Attributes:","text":"

logger (logging.Logger): The logger object to log messages.

    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
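Note that every logging helper above is gated on check_log(), so nothing reaches the logger unless the LOG_PATH environment variable is set; entries passed to print and panel_print are still kept in the pad's log list either way. A quick sketch (the LOG_PATH value is arbitrary):

import os
from kotaemon.agents.io import BaseScratchPad

os.environ["LOG_PATH"] = "logs/agent.log"   # any assignment enables logging
pad = BaseScratchPad()
pad.thinking("ReactAgent")                  # emitted via the logging module
pad.panel_print({"step": 1}, title="Search")
pad.print("final answer")
print(pad.log)                              # entries are kept regardless of LOG_PATH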
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

Parameters:

item (Any, required): The item to log.

title (str, optional): The title of the panel, defaults to "Output".

stream (bool, optional): Defaults to False.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/","title":"Base","text":""},{"location":"reference/agents/io/base/#agents.io.base.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--attributes","title":"Attributes:","text":"

logger (logging.Logger): The logger object to log messages.

    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

Parameters:

item (Any, required): The item to log.

title (str, optional): The title of the panel, defaults to "Output".

stream (bool, optional): Defaults to False.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentAction","title":"AgentAction dataclass","text":"

    Agent's action to take.

Parameters:

tool (str, required): The tool to invoke.

tool_input (Union[str, dict], required): The input to the tool.

log (str, required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    @dataclass\nclass AgentAction:\n    \"\"\"Agent's action to take.\n\n    Args:\n        tool: The tool to invoke.\n        tool_input: The input to the tool.\n        log: The log message.\n    \"\"\"\n\n    tool: str\n    tool_input: Union[str, dict]\n    log: str\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

Parameters:

return_values (required): The return values of the agent.

log (required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

Parameters:

text (required): The text output from the agent.

agent_type (required): The type of agent.

status (required): The status after executing the agent.

error (required): The error message if any.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
    "},{"location":"reference/agents/io/base/#agents.io.base.check_log","title":"check_log","text":"
    check_log()\n

Checks if logging has been enabled. Returns True if logging has been enabled, False otherwise (bool).

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def check_log():\n    \"\"\"\n    Checks if logging has been enabled.\n    :return: True if logging has been enabled, False otherwise.\n    :rtype: bool\n    \"\"\"\n    return os.environ.get(\"LOG_PATH\", None) is not None\n
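The switch is purely environmental, as the sketch below shows:

import os
from kotaemon.agents.io.base import check_log

os.environ.pop("LOG_PATH", None)
assert check_log() is False
os.environ["LOG_PATH"] = "logs/agent.log"   # any assignment enables it
assert check_log() is True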
    "},{"location":"reference/agents/react/","title":"React","text":""},{"location":"reference/agents/react/#agents.react.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

Sequential ReactAgent class inherited from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
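A usage sketch follows. Here llm (any kotaemon BaseLLM) and wiki_tool (any BaseTool) are placeholders assumed to be constructed elsewhere, and the import assumes ReactAgent is re-exported from kotaemon.agents.

from kotaemon.agents import ReactAgent   # re-export assumed; module is kotaemon.agents.react.agent

# `llm` (a kotaemon BaseLLM) and `wiki_tool` (a BaseTool) are placeholders
# assumed to be constructed elsewhere.
agent = ReactAgent(llm=llm, plugins=[wiki_tool], max_iterations=5)
output = agent.run("In which year was the author of 'Dune' born?")
print(output.status)   # "finished" if a Final Answer was produced, else "stopped"
print(output.text)     # the last raw LLM response
for action, observation in output.intermediate_steps:
    print(type(action).__name__, str(observation)[:80])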
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

Parameters:

instruction (required): Instruction to run the agent with.

max_iterations (optional, default None): Maximum number of reasoning iterations; when not provided, falls back to the agent's max_iterations attribute (default 5).

Return

    AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

Parameters:

instruction (required): Instruction to run the agent with.

max_iterations (optional, default None): Maximum number of reasoning iterations; when not provided, falls back to the agent's max_iterations attribute (default 5).

Return

    AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
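Because stream is a generator that yields one AgentOutput per reasoning step (with intermediate_steps holding just the latest (action, observation) pair), consumption looks like the sketch below, reusing the placeholder agent from the run() sketch earlier:

# Reusing the placeholder `agent` from the run() sketch above.
for chunk in agent.stream("In which year was the author of 'Dune' born?"):
    if chunk.status == "thinking":
        action, observation = chunk.intermediate_steps   # latest (action, observation) pair
        print("step:", getattr(action, "tool", "?"), "->", str(observation)[:60])
    else:                                # "finished" or "stopped"
        print(chunk.status, chunk.text)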
    "},{"location":"reference/agents/react/agent/","title":"Agent","text":""},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

Sequential ReactAgent class inherited from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
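    For orientation, a minimal usage sketch follows. It is illustrative only: the import paths and the WikipediaTool plugin are assumptions inferred from this reference's module layout, not verified API.

    # Illustrative sketch only: import paths and the plugin choice are assumed
    # from this reference's layout (kotaemon.agents, kotaemon.llms).
    from kotaemon.agents import ReactAgent, WikipediaTool
    from kotaemon.llms import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4o-mini")   # any BaseLLM that accepts `stop=` works here
    agent = ReactAgent(
        llm=llm,
        plugins=[WikipediaTool()],          # each plugin is a BaseTool with a name/description
        max_iterations=5,                   # cap on Thought/Action/Observation rounds
    )
    output = agent.run("Who are the authors of the ReWOO paper?")
    print(output.status, output.text)       # "finished" on a Final Answer, "stopped" otherwise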
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

    Parameters:

        instruction (required): Instruction to run the agent with.
        max_iterations (default: None): Maximum number of iterations of reasoning steps, defaults to 10.

    Return:

        AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
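    One subtlety worth calling out: run relies on Python's for ... else. The else branch executes only when the loop finishes without a break, i.e. when max_iterations is exhausted before an AgentFinish is parsed, which is exactly how status becomes "stopped" rather than "finished". A standalone illustration of the idiom:

    # Plain-Python illustration of the for/else idiom used in `run` above.
    for step in range(1, 4):
        if step == 99:      # never true, so the loop is never broken
            break
    else:
        print("no break occurred")  # this prints: else runs only when no break happened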
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

    Parameters:

        instruction (required): Instruction to run the agent with.
        max_iterations (default: None): Maximum number of iterations of reasoning steps, defaults to 10.

    Return:

        AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
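    Note that stream is a generator: callers iterate over the yielded AgentOutput chunks, and the trailing return value is only reachable through StopIteration.value when driving the generator by hand. A consumption sketch, reusing the agent configured earlier:

    # Sketch: consuming ReactAgent.stream. `agent` is a configured ReactAgent.
    for chunk in agent.stream("Summarize the ReWOO paper in one sentence"):
        if chunk.status == "thinking":
            print("step:", chunk.intermediate_steps)  # the latest (action, result) pair
        else:                                         # "finished" or "stopped"
            print("final:", chunk.text)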
    "},{"location":"reference/agents/react/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/","title":"Rewoo","text":""},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class, inheriting from BaseAgent and implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
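    Passing use_citation=True runs a CitationPipeline over the worker log after solving; the result is exposed both as the citation field and under metadata["citation"] of the returned AgentOutput. A short sketch, continuing the agent configured above:

    # Sketch: requesting citations from a ReWOO run.
    output = agent.run("When was the ReWOO paper published?", use_citation=True)
    print(output.text)
    print(output.metadata["worker_log"])  # the #Plan/#E trace handed to the solver
    print(output.citation)                # None unless use_citation=True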
    "},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
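    As with ReactAgent.stream, this is a generator: the planner log, per-plan worker progress, and individual solver tokens all arrive as status="thinking" chunks, so a caller typically accumulates the token chunks:

    # Sketch: streaming a ReWOO run; solver tokens arrive incrementally.
    answer = ""
    for chunk in agent.stream("Compare the populations of Hanoi and Bangkok"):
        if chunk.intermediate_steps:  # planner_log / worker_log progress updates
            print(chunk.intermediate_steps)
        else:                         # a solver token
            answer += chunk.text
    print(answer)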
    "},{"location":"reference/agents/rewoo/agent/","title":"Agent","text":""},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class, inheriting from BaseAgent and implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/rewoo/planner/","title":"Planner","text":""},{"location":"reference/agents/rewoo/planner/#agents.rewoo.planner.Planner","title":"Planner","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/rewoo/planner.py
    class Planner(BaseComponent):\n    model: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    examples: Optional[Union[str, List[str]]] = None\n    plugins: List[BaseTool]\n\n    def _compose_worker_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for worker in self.plugins:\n                prompt += f\"{worker.name}[input]: {worker.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _compose_fewshot_prompt(self) -> str:\n        if self.examples is None:\n            return \"\"\n        if isinstance(self.examples, str):\n            return self.examples\n        else:\n            return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        worker_description = self._compose_worker_description()\n        fewshot = self._compose_fewshot_prompt()\n        if self.prompt_template is not None:\n            if \"fewshot\" in self.prompt_template.placeholders:\n                return self.prompt_template.populate(\n                    tool_description=worker_description,\n                    fewshot=fewshot,\n                    task=instruction,\n                )\n            else:\n                return self.prompt_template.populate(\n                    tool_description=worker_description, task=instruction\n                )\n        else:\n            if self.examples is not None:\n                return few_shot_planner_prompt.populate(\n                    tool_description=worker_description,\n                    fewshot=fewshot,\n                    task=instruction,\n                )\n            else:\n                return zero_shot_planner_prompt.populate(\n                    tool_description=worker_description, task=instruction\n                )\n\n    def run(self, instruction: str, output: BaseScratchPad = BaseScratchPad()) -> Any:\n        response = None\n        output.info(\"Running Planner\")\n        prompt = self._compose_prompt(instruction)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            response = self.model(prompt)\n            self.log_progress(\".planner\", response=response)\n            output.info(\"Planner run successful.\")\n        except ValueError as e:\n            output.error(\"Planner failed to retrieve response from LLM\")\n            raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n        return response\n\n    def stream(self, instruction: str, output: BaseScratchPad = BaseScratchPad()):\n        response = None\n        output.info(\"Running Planner\")\n        prompt = self._compose_prompt(instruction)\n        output.debug(f\"Prompt: {prompt}\")\n\n        response = \"\"\n        try:\n            for text in self.model.stream(prompt):\n                response += text\n                yield text\n            self.log_progress(\".planner\", response=response)\n            output.info(\"Planner run successful.\")\n        except NotImplementedError:\n            print(\"Streaming is not supported, falling back to normal run\")\n            response = self.model(prompt)\n            yield response\n        except ValueError as e:\n            output.error(\"Planner failed to retrieve response from LLM\")\n            raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n        return response\n
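    For reference, here is a hand-written example of the #Plan/#E format the planner is prompted to produce, together with what the RewooAgent parsers shown earlier would extract from it (the tool names are illustrative):

    # Hand-written illustration of the planner's expected output format.
    planner_response = (
        "#Plan1: Find the population of Hanoi\n"
        "#E1: WikipediaTool[Hanoi population]\n"
        "#Plan2: Find the population of Bangkok\n"
        "#E2: WikipediaTool[Bangkok population]\n"
        "#Plan3: Compare the two figures\n"
        "#E3: LLMTool[Compare #E1 with #E2]\n"
    )
    # _parse_plan_map          -> {"#Plan1": ["#E1"], "#Plan2": ["#E2"], "#Plan3": ["#E3"]}
    #                             plus {"#Plan1": "Find the population of Hanoi", ...}
    # _parse_planner_evidences -> evidences {"#E1": "WikipediaTool[...]", ...} and
    #                             levels [["#E1", "#E2"], ["#E3"]]: #E3 waits on #E1/#E2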
    "},{"location":"reference/agents/rewoo/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/solver/","title":"Solver","text":""},{"location":"reference/agents/rewoo/solver/#agents.rewoo.solver.Solver","title":"Solver","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/rewoo/solver.py
    class Solver(BaseComponent):\n    model: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    examples: Optional[Union[str, List[str]]] = None\n    output_lang: str = \"English\"\n\n    def _compose_fewshot_prompt(self) -> str:\n        if self.examples is None:\n            return \"\"\n        if isinstance(self.examples, str):\n            return self.examples\n        else:\n            return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n    def _compose_prompt(self, instruction, plan_evidence, output_lang) -> str:\n        \"\"\"\n        Compose the prompt from template, plan&evidence, examples and instruction.\n        \"\"\"\n        fewshot = self._compose_fewshot_prompt()\n        if self.prompt_template is not None:\n            if \"fewshot\" in self.prompt_template.placeholders:\n                return self.prompt_template.populate(\n                    plan_evidence=plan_evidence,\n                    fewshot=fewshot,\n                    task=instruction,\n                    lang=output_lang,\n                )\n            else:\n                return self.prompt_template.populate(\n                    plan_evidence=plan_evidence, task=instruction, lang=output_lang\n                )\n        else:\n            if self.examples is not None:\n                return few_shot_solver_prompt.populate(\n                    plan_evidence=plan_evidence,\n                    fewshot=fewshot,\n                    task=instruction,\n                    lang=output_lang,\n                )\n            else:\n                return zero_shot_solver_prompt.populate(\n                    plan_evidence=plan_evidence,\n                    task=instruction,\n                    lang=output_lang,\n                )\n\n    def run(\n        self,\n        instruction: str,\n        plan_evidence: str,\n        output: BaseScratchPad = BaseScratchPad(),\n    ) -> Any:\n        response = None\n        output.info(\"Running Solver\")\n        output.debug(f\"Instruction: {instruction}\")\n        output.debug(f\"Plan Evidence: {plan_evidence}\")\n        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            response = self.model(prompt)\n            output.info(\"Solver run successful.\")\n        except ValueError:\n            output.error(\"Solver failed to retrieve response from LLM\")\n\n        return response\n\n    def stream(\n        self,\n        instruction: str,\n        plan_evidence: str,\n        output: BaseScratchPad = BaseScratchPad(),\n    ) -> Any:\n        response = \"\"\n        output.info(\"Running Solver\")\n        output.debug(f\"Instruction: {instruction}\")\n        output.debug(f\"Plan Evidence: {plan_evidence}\")\n        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            for text in self.model.stream(prompt):\n                response += text.text\n                yield text\n            output.info(\"Solver run successful.\")\n        except NotImplementedError:\n            response = self.model(prompt).text\n            output.info(\"Solver run successful.\")\n        except ValueError:\n            output.error(\"Solver failed to retrieve response from LLM\")\n\n        return response\n
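    The solver can be exercised on its own with a pre-built worker log. The import path below follows the source location shown above (libs/kotaemon/kotaemon/agents/rewoo/solver.py) but is still an assumption, as is the ChatOpenAI wrapper and the `.text` attribute on the response:

    # Sketch: calling the Solver directly with a hand-written plan/evidence log.
    from kotaemon.agents.rewoo.solver import Solver
    from kotaemon.llms import ChatOpenAI

    solver = Solver(model=ChatOpenAI(model="gpt-4o-mini"), output_lang="English")
    worker_log = "#Plan1: Find Hanoi's population\n#E1: roughly 8 million people\n"
    response = solver.run("What is the population of Hanoi?", worker_log)
    print(response.text)  # assumes the LLM output object exposes `.text`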
    "},{"location":"reference/agents/tools/","title":"Tools","text":""},{"location":"reference/agents/tools/#agents.tools.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description; these are included in the LLM prompt.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    Handle the content of the ToolException thrown.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to Langchain format to use with its agent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
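    A minimal sketch of a custom tool derived from the class above; the import path follows the quoted source file and may differ from the public API:

    ```python
    from kotaemon.agents.tools.base import BaseTool, ToolException

    class EchoTool(BaseTool):
        # Hypothetical tool, defined only for illustration.
        name: str = "echo"
        description: str = "Repeats the input back. Input should be any text."

        def _run_tool(self, query: str) -> str:
            if not query:
                raise ToolException("empty input")
            return query

    tool = EchoTool(handle_tool_error=True)  # absorb ToolException as observation
    print(tool.run("hello"))  # -> "hello"
    print(tool.run(""))       # -> "empty input" (handled, not raised)
    ```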
    "},{"location":"reference/agents/tools/#agents.tools.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent so it can be used as a tool

    Parameters:

    - component (required): BaseComponent-based component to wrap
    - postprocessor (required): Optional postprocessor for the component output

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
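    A sketch of exposing a pipeline component as a tool; `WordCount` is a toy component invented here for illustration, and calling a component is assumed to dispatch to its `run`:

    ```python
    from kotaemon.agents.tools.base import ComponentTool
    from kotaemon.base import BaseComponent, Document

    class WordCount(BaseComponent):
        """Toy component, defined only for this example."""
        def run(self, text: str) -> Document:
            return Document(str(len(str(text).split())))

    count_tool = ComponentTool(
        name="word_count",
        description="Count the words in the input text. Input is a string.",
        component=WordCount(),
        postprocessor=lambda doc: doc.text,  # unwrap the Document to plain text
    )
    print(count_tool.run("one two three"))  # -> "3"
    ```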
    "},{"location":"reference/agents/tools/#agents.tools.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
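    Illustrative usage (requires the `wikipedia` package, as noted by the `Wiki` wrapper):

    ```python
    from kotaemon.agents.tools.wikipedia import WikipediaTool

    evidence = WikipediaTool().run("Alan Turing")
    # On success this is a Document with the page content; otherwise it is a
    # string listing similar page titles.
    print(str(evidence)[:200])
    ```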
    "},{"location":"reference/agents/tools/base/","title":"Base","text":""},{"location":"reference/agents/tools/base/#agents.tools.base.ToolException","title":"ToolException","text":"

    Bases: Exception

    An optional exception that a tool can throw when an execution error occurs.

    When this exception is thrown, the agent does not stop working; it handles the exception according to the tool's handle_tool_error setting, and the result is returned to the agent as an observation and printed in red on the console.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ToolException(Exception):\n    \"\"\"An optional exception that tool throws when execution error occurs.\n\n    When this exception is thrown, the agent will not stop working,\n    but will handle the exception according to the handle_tool_error\n    variable of the tool, and the processing result will be returned\n    to the agent as observation, and printed in red on the console.\n    \"\"\"\n
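    The three non-raising configurations, following the `_handle_tool_error` logic in `BaseTool` below; `FailingTool` is a hypothetical tool defined only for illustration:

    ```python
    from kotaemon.agents.tools.base import BaseTool, ToolException

    class FailingTool(BaseTool):
        # Always raises, to show how the exception is absorbed.
        name: str = "failing"
        description: str = "Demonstrates ToolException handling."

        def _run_tool(self, query: str) -> str:
            raise ToolException("boom")

    FailingTool(handle_tool_error=True).run("x")                   # -> "boom"
    FailingTool(handle_tool_error="tool failed").run("x")          # -> "tool failed"
    FailingTool(handle_tool_error=lambda e: f"err: {e}").run("x")  # -> "err: boom"
    # With the default (False), the ToolException propagates to the caller.
    ```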
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description; these are included in the LLM prompt.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    Handle the content of the ToolException thrown.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to Langchain format to use with its agent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent so it can be used as a tool

    Parameters:

    - component (required): BaseComponent-based component to wrap
    - postprocessor (required): Optional postprocessor for the component output

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
    "},{"location":"reference/agents/tools/google/","title":"Google","text":""},{"location":"reference/agents/tools/llm/","title":"Llm","text":""},{"location":"reference/agents/tools/wikipedia/","title":"Wikipedia","text":""},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki","title":"Wiki","text":"

    Wrapper around wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class Wiki:\n    \"\"\"Wrapper around wikipedia API.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"Check that wikipedia package is installed.\"\"\"\n        try:\n            import wikipedia  # noqa: F401\n        except ImportError:\n            raise ValueError(\n                \"Could not import wikipedia python package. \"\n                \"Please install it with `pip install wikipedia`.\"\n            )\n\n    def search(self, search: str) -> Union[str, Document]:\n        \"\"\"Try to search for wiki page.\n\n        If page exists, return the page summary, and a PageWithLookups object.\n        If page does not exist, return similar entries.\n        \"\"\"\n        import wikipedia\n\n        try:\n            page_content = wikipedia.page(search).content\n            url = wikipedia.page(search).url\n            result: Union[str, Document] = Document(\n                text=page_content, metadata={\"page\": url}\n            )\n        except wikipedia.PageError:\n            result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n        except wikipedia.DisambiguationError:\n            result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n        return result\n
    "},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki.search","title":"search","text":"
    search(search)\n

    Try to search for a wiki page.

    If the page exists, return its content as a Document. If it does not, return a string listing similar entries.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    def search(self, search: str) -> Union[str, Document]:\n    \"\"\"Try to search for wiki page.\n\n    If page exists, return the page summary, and a PageWithLookups object.\n    If page does not exist, return similar entries.\n    \"\"\"\n    import wikipedia\n\n    try:\n        page_content = wikipedia.page(search).content\n        url = wikipedia.page(search).url\n        result: Union[str, Document] = Document(\n            text=page_content, metadata={\"page\": url}\n        )\n    except wikipedia.PageError:\n        result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n    except wikipedia.DisambiguationError:\n        result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n    return result\n
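    A direct-use sketch, bypassing the tool layer:

    ```python
    from kotaemon.agents.tools.wikipedia import Wiki

    result = Wiki().search("Python (programming language)")
    # Document(text=<page content>, metadata={"page": <url>}) on success;
    # a "Could not find ..." string with similar titles otherwise.
    print(str(result)[:200])
    ```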
    "},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
    "},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#base.BaseComponent","title":"BaseComponent","text":"

    Bases: Function

    A component is a class that can be used to compose a pipeline.

    Benefits of component:

    - Auto caching, logging
    - Allow deployment

    For each component, the spirit is:

    - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
    - Enforce single output type. Hence, the output type of a component should be as generic as possible.

    Source code in libs/kotaemon/kotaemon/base/component.py
    class BaseComponent(Function):\n    \"\"\"A component is a class that can be used to compose a pipeline.\n\n    !!! tip \"Benefits of component\"\n        - Auto caching, logging\n        - Allow deployment\n\n    !!! tip \"For each component, the spirit is\"\n        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n        - Enforce single output type. Hence, the output type of a component should be\n    as generic as possible.\n    \"\"\"\n\n    inflow = None\n\n    def flow(self):\n        if self.inflow is None:\n            raise ValueError(\"No inflow provided.\")\n\n        if not isinstance(self.inflow, BaseComponent):\n            raise ValueError(\n                f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n            )\n\n        return self.__call__(self.inflow.flow())\n\n    def set_output_queue(self, queue):\n        self._queue = queue\n        for name in self._ff_nodes:\n            node = getattr(self, name)\n            if isinstance(node, BaseComponent):\n                node.set_output_queue(queue)\n\n    def report_output(self, output: Optional[Document]):\n        if self._queue is not None:\n            self._queue.put_nowait(output)\n\n    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n        ...\n\n    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n        ...\n\n    @abstractmethod\n    def run(\n        self, *args, **kwargs\n    ) -> Document | list[Document] | Iterator[Document] | None | Any:\n        \"\"\"Run the component.\"\"\"\n        ...\n
    "},{"location":"reference/base/#base.BaseComponent.run","title":"run abstractmethod","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/base/component.py
    @abstractmethod\ndef run(\n    self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n    \"\"\"Run the component.\"\"\"\n    ...\n
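    A minimal sketch of a custom component; only `run` is abstract, and the import path is assumed from the reference locations:

    ```python
    from kotaemon.base import BaseComponent, Document

    class UpperCase(BaseComponent):
        # Toy component, defined only for illustration.
        def run(self, doc: Document) -> Document:
            # Tolerant input, single generic output type (per the spirit above).
            return Document(str(doc).upper())

    print(UpperCase().run(Document("hello")))  # -> HELLO
    ```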
    "},{"location":"reference/base/#base.Document","title":"Document","text":"

    Bases: Document

    Base document class, mostly inherited from Document class from llama-index.

    This class accepts one positional argument content of an arbitrary type, which will store the raw content of the document. If specified, the class will use content to initialize the base llama_index class.

    Attributes:

    - content (Any): raw content of the document, can be anything
    - source (Optional[str]): id of the source of the Document. Optional.
    - channel (Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]): the channel to show the document. Optional.
        - chat: show in chat message
        - info: show in information panel
        - index: show in index panel
        - debug: show in debug panel

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class Document(BaseDocument):\n    \"\"\"\n    Base document class, mostly inherited from Document class from llama-index.\n\n    This class accept one positional argument `content` of an arbitrary type, which will\n        store the raw content of the document. If specified, the class will use\n        `content` to initialize the base llama_index class.\n\n    Attributes:\n        content: raw content of the document, can be anything\n        source: id of the source of the Document. Optional.\n        channel: the channel to show the document. Optional.:\n            - chat: show in chat message\n            - info: show in information panel\n            - index: show in index panel\n            - debug: show in debug panel\n    \"\"\"\n\n    content: Any = None\n    source: Optional[str] = None\n    channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n    def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n        if content is None:\n            if kwargs.get(\"text\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"text\"]\n            elif kwargs.get(\"embedding\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"embedding\"]\n                # default text indicating this document only contains embedding\n                kwargs[\"text\"] = \"<EMBEDDING>\"\n        elif isinstance(content, Document):\n            # TODO: simplify the Document class\n            temp_ = content.dict()\n            temp_.update(kwargs)\n            kwargs = temp_\n        else:\n            kwargs[\"content\"] = content\n            if content:\n                kwargs[\"text\"] = str(content)\n            else:\n                kwargs[\"text\"] = \"\"\n        super().__init__(*args, **kwargs)\n\n    def __bool__(self):\n        return bool(self.content)\n\n    @classmethod\n    def example(cls) -> \"Document\":\n        document = Document(\n            text=SAMPLE_TEXT,\n            metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n        )\n        return document\n\n    def to_haystack_format(self) -> \"HaystackDocument\":\n        \"\"\"Convert struct to Haystack document format.\"\"\"\n        from haystack.schema import Document as HaystackDocument\n\n        metadata = self.metadata or {}\n        text = self.text\n        return HaystackDocument(content=text, meta=metadata)\n\n    def __str__(self):\n        return str(self.content)\n
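    A sketch of the construction paths handled by `__init__` above (import path assumed from the reference locations):

    ```python
    from kotaemon.base import Document

    d1 = Document("raw text")            # positional content; text mirrors it
    d2 = Document(text="hello")          # content falls back to `text`
    d3 = Document(embedding=[0.1, 0.2])  # text becomes "<EMBEDDING>"
    d4 = Document(d1, channel="chat")    # copy another Document, override fields
    assert not Document("")              # truthiness follows `content`
    ```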
    "},{"location":"reference/base/#base.Document.to_haystack_format","title":"to_haystack_format","text":"
    to_haystack_format()\n

    Convert struct to Haystack document format.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    def to_haystack_format(self) -> \"HaystackDocument\":\n    \"\"\"Convert struct to Haystack document format.\"\"\"\n    from haystack.schema import Document as HaystackDocument\n\n    metadata = self.metadata or {}\n    text = self.text\n    return HaystackDocument(content=text, meta=metadata)\n
    "},{"location":"reference/base/#base.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":"

    Bases: Document

    Subclass of Document which must contain an embedding

    Use this if you want to enforce that a component's inputs and outputs contain an embedding.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contains embedding\n\n    Use this if you want to enforce component's IOs to must contain embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
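    Illustrative construction; `embedding` is the required first argument:

    ```python
    from kotaemon.base import DocumentWithEmbedding

    doc = DocumentWithEmbedding(embedding=[0.1, 0.2, 0.3], text="hello")
    ```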
    "},{"location":"reference/base/#base.ExtractorOutput","title":"ExtractorOutput","text":"

    Bases: Document

    Represents the output of an extractor.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class ExtractorOutput(Document):\n    \"\"\"\n    Represents the output of an extractor.\n    \"\"\"\n\n    matches: list[str]\n
    "},{"location":"reference/base/#base.RetrievedDocument","title":"RetrievedDocument","text":"

    Bases: Document

    Subclass of Document with retrieval-related information

    Attributes:

    - score (float): score of the document (from 0.0 to 1.0)
    - retrieval_metadata (dict): metadata from the retrieval process, can be used by different components in a retrieval pipeline to communicate with each other

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class RetrievedDocument(Document):\n    \"\"\"Subclass of Document with retrieval-related information\n\n    Attributes:\n        score (float): score of the document (from 0.0 to 1.0)\n        retrieval_metadata (dict): metadata from the retrieval process, can be used\n            by different components in a retrieved pipeline to communicate with each\n            other\n    \"\"\"\n\n    score: float = Field(default=0.0)\n    retrieval_metadata: dict = Field(default={})\n
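    Illustrative construction, e.g. as a retriever might produce:

    ```python
    from kotaemon.base import RetrievedDocument

    hit = RetrievedDocument(
        text="a relevant chunk",
        score=0.87,
        retrieval_metadata={"retriever": "bm25"},  # hypothetical metadata
    )
    ```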
    "},{"location":"reference/base/component/","title":"Component","text":""},{"location":"reference/base/component/#base.component.BaseComponent","title":"BaseComponent","text":"

    Bases: Function

    A component is a class that can be used to compose a pipeline.

    Benefits of component:

    - Auto caching, logging
    - Allow deployment

    For each component, the spirit is:

    - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
    - Enforce single output type. Hence, the output type of a component should be as generic as possible.

    Source code in libs/kotaemon/kotaemon/base/component.py
    class BaseComponent(Function):\n    \"\"\"A component is a class that can be used to compose a pipeline.\n\n    !!! tip \"Benefits of component\"\n        - Auto caching, logging\n        - Allow deployment\n\n    !!! tip \"For each component, the spirit is\"\n        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n        - Enforce single output type. Hence, the output type of a component should be\n    as generic as possible.\n    \"\"\"\n\n    inflow = None\n\n    def flow(self):\n        if self.inflow is None:\n            raise ValueError(\"No inflow provided.\")\n\n        if not isinstance(self.inflow, BaseComponent):\n            raise ValueError(\n                f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n            )\n\n        return self.__call__(self.inflow.flow())\n\n    def set_output_queue(self, queue):\n        self._queue = queue\n        for name in self._ff_nodes:\n            node = getattr(self, name)\n            if isinstance(node, BaseComponent):\n                node.set_output_queue(queue)\n\n    def report_output(self, output: Optional[Document]):\n        if self._queue is not None:\n            self._queue.put_nowait(output)\n\n    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n        ...\n\n    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n        ...\n\n    @abstractmethod\n    def run(\n        self, *args, **kwargs\n    ) -> Document | list[Document] | Iterator[Document] | None | Any:\n        \"\"\"Run the component.\"\"\"\n        ...\n
    "},{"location":"reference/base/component/#base.component.BaseComponent.run","title":"run abstractmethod","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/base/component.py
    @abstractmethod\ndef run(\n    self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n    \"\"\"Run the component.\"\"\"\n    ...\n
    "},{"location":"reference/base/schema/","title":"Schema","text":""},{"location":"reference/base/schema/#base.schema.Document","title":"Document","text":"

    Bases: Document

    Base document class, mostly inherited from Document class from llama-index.

    This class accepts one positional argument content of an arbitrary type, which will store the raw content of the document. If specified, the class will use content to initialize the base llama_index class.

    Attributes:

    - content (Any): raw content of the document, can be anything
    - source (Optional[str]): id of the source of the Document. Optional.
    - channel (Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]): the channel to show the document. Optional.
        - chat: show in chat message
        - info: show in information panel
        - index: show in index panel
        - debug: show in debug panel

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class Document(BaseDocument):\n    \"\"\"\n    Base document class, mostly inherited from Document class from llama-index.\n\n    This class accept one positional argument `content` of an arbitrary type, which will\n        store the raw content of the document. If specified, the class will use\n        `content` to initialize the base llama_index class.\n\n    Attributes:\n        content: raw content of the document, can be anything\n        source: id of the source of the Document. Optional.\n        channel: the channel to show the document. Optional.:\n            - chat: show in chat message\n            - info: show in information panel\n            - index: show in index panel\n            - debug: show in debug panel\n    \"\"\"\n\n    content: Any = None\n    source: Optional[str] = None\n    channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n    def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n        if content is None:\n            if kwargs.get(\"text\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"text\"]\n            elif kwargs.get(\"embedding\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"embedding\"]\n                # default text indicating this document only contains embedding\n                kwargs[\"text\"] = \"<EMBEDDING>\"\n        elif isinstance(content, Document):\n            # TODO: simplify the Document class\n            temp_ = content.dict()\n            temp_.update(kwargs)\n            kwargs = temp_\n        else:\n            kwargs[\"content\"] = content\n            if content:\n                kwargs[\"text\"] = str(content)\n            else:\n                kwargs[\"text\"] = \"\"\n        super().__init__(*args, **kwargs)\n\n    def __bool__(self):\n        return bool(self.content)\n\n    @classmethod\n    def example(cls) -> \"Document\":\n        document = Document(\n            text=SAMPLE_TEXT,\n            metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n        )\n        return document\n\n    def to_haystack_format(self) -> \"HaystackDocument\":\n        \"\"\"Convert struct to Haystack document format.\"\"\"\n        from haystack.schema import Document as HaystackDocument\n\n        metadata = self.metadata or {}\n        text = self.text\n        return HaystackDocument(content=text, meta=metadata)\n\n    def __str__(self):\n        return str(self.content)\n
    "},{"location":"reference/base/schema/#base.schema.Document.to_haystack_format","title":"to_haystack_format","text":"
    to_haystack_format()\n

    Convert struct to Haystack document format.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    def to_haystack_format(self) -> \"HaystackDocument\":\n    \"\"\"Convert struct to Haystack document format.\"\"\"\n    from haystack.schema import Document as HaystackDocument\n\n    metadata = self.metadata or {}\n    text = self.text\n    return HaystackDocument(content=text, meta=metadata)\n
    "},{"location":"reference/base/schema/#base.schema.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":"

    Bases: Document

    Subclass of Document which must contain an embedding

    Use this if you want to enforce that a component's inputs and outputs contain an embedding.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contains embedding\n\n    Use this if you want to enforce component's IOs to must contain embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
    "},{"location":"reference/base/schema/#base.schema.RetrievedDocument","title":"RetrievedDocument","text":"

    Bases: Document

    Subclass of Document with retrieval-related information

    Attributes:

    - score (float): score of the document (from 0.0 to 1.0)
    - retrieval_metadata (dict): metadata from the retrieval process, can be used by different components in a retrieval pipeline to communicate with each other

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class RetrievedDocument(Document):\n    \"\"\"Subclass of Document with retrieval-related information\n\n    Attributes:\n        score (float): score of the document (from 0.0 to 1.0)\n        retrieval_metadata (dict): metadata from the retrieval process, can be used\n            by different components in a retrieved pipeline to communicate with each\n            other\n    \"\"\"\n\n    score: float = Field(default=0.0)\n    retrieval_metadata: dict = Field(default={})\n
    "},{"location":"reference/base/schema/#base.schema.ExtractorOutput","title":"ExtractorOutput","text":"

    Bases: Document

    Represents the output of an extractor.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class ExtractorOutput(Document):\n    \"\"\"\n    Represents the output of an extractor.\n    \"\"\"\n\n    matches: list[str]\n
    "},{"location":"reference/chatbot/","title":"Chatbot","text":""},{"location":"reference/chatbot/#chatbot.ChatConversation","title":"ChatConversation","text":"

    Bases: SessionFunction

    Base implementation of a chat bot component

    A chatbot component should:

    - handle internal state, including history messages
    - return output for a given input

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    class ChatConversation(SessionFunction):\n    \"\"\"Base implementation of a chat bot component\n\n    A chatbot component should:\n        - handle internal state, including history messages\n        - return output for a given input\n    \"\"\"\n\n    class Config:\n        store_result = session_chat_storage\n\n    system_message: str = \"\"\n    bot: BaseChatBot\n\n    def __init__(self, *args, **kwargs):\n        self._history: List[BaseMessage] = []\n        self._store_result = (\n            f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n        )\n        super().__init__(*args, **kwargs)\n\n    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n        \"\"\"Chat, given a message, return a response\n\n        Args:\n            message: The message to respond to\n\n        Returns:\n            The response to the message. If None, no response is sent.\n        \"\"\"\n        user_message = (\n            HumanMessage(content=message) if isinstance(message, str) else message\n        )\n        self.history.append(user_message)\n\n        output = self.bot(self.history).text\n        output_message = None\n        if output is not None:\n            output_message = AIMessage(content=output)\n            self.history.append(output_message)\n\n        return output_message\n\n    def start_session(self):\n        self._store_result = self.bot.config.store_result\n        super().start_session()\n        if not self.history and self.system_message:\n            system_message = SystemMessage(content=self.system_message)\n            self.history.append(system_message)\n\n    def end_session(self):\n        super().end_session()\n        self._history = []\n\n    def check_end(\n        self,\n        history: Optional[List[BaseMessage]] = None,\n        user_message: Optional[HumanMessage] = None,\n        bot_message: Optional[AIMessage] = None,\n    ) -> bool:\n        \"\"\"Check if a conversation should end\"\"\"\n        if user_message is not None and user_message.content == \"\":\n            return True\n\n        return False\n\n    def terminal_session(self):\n        \"\"\"Create a terminal session\"\"\"\n        self.start_session()\n        print(\">> Start chat:\")\n\n        while True:\n            human = HumanMessage(content=input(\"Human: \"))\n            if self.check_end(history=self.history, user_message=human):\n                break\n\n            output = self(human)\n            if output is None:\n                print(\"AI: <No response>\")\n            else:\n                print(\"AI:\", output.content)\n\n            if self.check_end(history=self.history, bot_message=output):\n                break\n\n        self.end_session()\n\n    @property\n    def history(self):\n        return self._history\n\n    @history.setter\n    def history(self, value):\n        self._history = value\n        self._variablex()\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.run","title":"run","text":"
    run(message)\n

    Chat, given a message, return a response

    Parameters:

    - message (HumanMessage, required): The message to respond to

    Returns:

    - Optional[BaseMessage]: The response to the message. If None, no response is sent.

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n    \"\"\"Chat, given a message, return a response\n\n    Args:\n        message: The message to respond to\n\n    Returns:\n        The response to the message. If None, no response is sent.\n    \"\"\"\n    user_message = (\n        HumanMessage(content=message) if isinstance(message, str) else message\n    )\n    self.history.append(user_message)\n\n    output = self.bot(self.history).text\n    output_message = None\n    if output is not None:\n        output_message = AIMessage(content=output)\n        self.history.append(output_message)\n\n    return output_message\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.check_end","title":"check_end","text":"
    check_end(\n    history=None, user_message=None, bot_message=None\n)\n

    Check if a conversation should end

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def check_end(\n    self,\n    history: Optional[List[BaseMessage]] = None,\n    user_message: Optional[HumanMessage] = None,\n    bot_message: Optional[AIMessage] = None,\n) -> bool:\n    \"\"\"Check if a conversation should end\"\"\"\n    if user_message is not None and user_message.content == \"\":\n        return True\n\n    return False\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.terminal_session","title":"terminal_session","text":"
    terminal_session()\n

    Create a terminal session

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def terminal_session(self):\n    \"\"\"Create a terminal session\"\"\"\n    self.start_session()\n    print(\">> Start chat:\")\n\n    while True:\n        human = HumanMessage(content=input(\"Human: \"))\n        if self.check_end(history=self.history, user_message=human):\n            break\n\n        output = self(human)\n        if output is None:\n            print(\"AI: <No response>\")\n        else:\n            print(\"AI:\", output.content)\n\n        if self.check_end(history=self.history, bot_message=output):\n            break\n\n    self.end_session()\n
    "},{"location":"reference/chatbot/#chatbot.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":"

    Bases: BaseChatBot

    Simple text respondent chatbot that essentially wraps around a chat LLM

    Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
    class SimpleRespondentChatbot(BaseChatBot):\n    \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n    llm: ChatLLM\n\n    def _get_message(self) -> str:\n        return self.llm(self.history).text\n
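    A wiring sketch for the module; `chat_llm` is a placeholder for any configured kotaemon ChatLLM, and the import paths follow the quoted source files:

    ```python
    from kotaemon.chatbot.base import ChatConversation
    from kotaemon.chatbot.simple_respondent import SimpleRespondentChatbot

    conv = ChatConversation(
        bot=SimpleRespondentChatbot(llm=chat_llm),  # chat_llm: assumed ChatLLM
        system_message="You are a terse assistant.",
    )
    conv.terminal_session()  # interactive loop; an empty input ends the chat
    ```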
    "},{"location":"reference/chatbot/base/","title":"Base","text":""},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation","title":"ChatConversation","text":"

    Bases: SessionFunction

    Base implementation of a chat bot component

    A chatbot component should:

    - handle internal state, including history messages
    - return output for a given input

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    class ChatConversation(SessionFunction):\n    \"\"\"Base implementation of a chat bot component\n\n    A chatbot component should:\n        - handle internal state, including history messages\n        - return output for a given input\n    \"\"\"\n\n    class Config:\n        store_result = session_chat_storage\n\n    system_message: str = \"\"\n    bot: BaseChatBot\n\n    def __init__(self, *args, **kwargs):\n        self._history: List[BaseMessage] = []\n        self._store_result = (\n            f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n        )\n        super().__init__(*args, **kwargs)\n\n    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n        \"\"\"Chat, given a message, return a response\n\n        Args:\n            message: The message to respond to\n\n        Returns:\n            The response to the message. If None, no response is sent.\n        \"\"\"\n        user_message = (\n            HumanMessage(content=message) if isinstance(message, str) else message\n        )\n        self.history.append(user_message)\n\n        output = self.bot(self.history).text\n        output_message = None\n        if output is not None:\n            output_message = AIMessage(content=output)\n            self.history.append(output_message)\n\n        return output_message\n\n    def start_session(self):\n        self._store_result = self.bot.config.store_result\n        super().start_session()\n        if not self.history and self.system_message:\n            system_message = SystemMessage(content=self.system_message)\n            self.history.append(system_message)\n\n    def end_session(self):\n        super().end_session()\n        self._history = []\n\n    def check_end(\n        self,\n        history: Optional[List[BaseMessage]] = None,\n        user_message: Optional[HumanMessage] = None,\n        bot_message: Optional[AIMessage] = None,\n    ) -> bool:\n        \"\"\"Check if a conversation should end\"\"\"\n        if user_message is not None and user_message.content == \"\":\n            return True\n\n        return False\n\n    def terminal_session(self):\n        \"\"\"Create a terminal session\"\"\"\n        self.start_session()\n        print(\">> Start chat:\")\n\n        while True:\n            human = HumanMessage(content=input(\"Human: \"))\n            if self.check_end(history=self.history, user_message=human):\n                break\n\n            output = self(human)\n            if output is None:\n                print(\"AI: <No response>\")\n            else:\n                print(\"AI:\", output.content)\n\n            if self.check_end(history=self.history, bot_message=output):\n                break\n\n        self.end_session()\n\n    @property\n    def history(self):\n        return self._history\n\n    @history.setter\n    def history(self, value):\n        self._history = value\n        self._variablex()\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.run","title":"run","text":"
    run(message)\n

    Chat, given a message, return a response

    Parameters:

    - message (HumanMessage, required): The message to respond to

    Returns:

    - Optional[BaseMessage]: The response to the message. If None, no response is sent.

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n    \"\"\"Chat, given a message, return a response\n\n    Args:\n        message: The message to respond to\n\n    Returns:\n        The response to the message. If None, no response is sent.\n    \"\"\"\n    user_message = (\n        HumanMessage(content=message) if isinstance(message, str) else message\n    )\n    self.history.append(user_message)\n\n    output = self.bot(self.history).text\n    output_message = None\n    if output is not None:\n        output_message = AIMessage(content=output)\n        self.history.append(output_message)\n\n    return output_message\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.check_end","title":"check_end","text":"
    check_end(\n    history=None, user_message=None, bot_message=None\n)\n

    Check if a conversation should end

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def check_end(\n    self,\n    history: Optional[List[BaseMessage]] = None,\n    user_message: Optional[HumanMessage] = None,\n    bot_message: Optional[AIMessage] = None,\n) -> bool:\n    \"\"\"Check if a conversation should end\"\"\"\n    if user_message is not None and user_message.content == \"\":\n        return True\n\n    return False\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.terminal_session","title":"terminal_session","text":"
    terminal_session()\n

    Create a terminal session

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def terminal_session(self):\n    \"\"\"Create a terminal session\"\"\"\n    self.start_session()\n    print(\">> Start chat:\")\n\n    while True:\n        human = HumanMessage(content=input(\"Human: \"))\n        if self.check_end(history=self.history, user_message=human):\n            break\n\n        output = self(human)\n        if output is None:\n            print(\"AI: <No response>\")\n        else:\n            print(\"AI:\", output.content)\n\n        if self.check_end(history=self.history, bot_message=output):\n            break\n\n    self.end_session()\n
    "},{"location":"reference/chatbot/base/#chatbot.base.session_chat_storage","title":"session_chat_storage","text":"
    session_chat_storage(obj)\n

    Store using the bot location rather than the session location

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def session_chat_storage(obj):\n    \"\"\"Store using the bot location rather than the session location\"\"\"\n    return obj._store_result\n
    "},{"location":"reference/chatbot/simple_respondent/","title":"Simple Respondent","text":""},{"location":"reference/chatbot/simple_respondent/#chatbot.simple_respondent.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":"

    Bases: BaseChatBot

    Simple text respondent chatbot that essentially wraps around a chat LLM

    Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
    class SimpleRespondentChatbot(BaseChatBot):\n    \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n    llm: ChatLLM\n\n    def _get_message(self) -> str:\n        return self.llm(self.history).text\n
    "},{"location":"reference/embeddings/","title":"Embeddings","text":""},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings","title":"EndpointEmbeddings","text":"

    Bases: BaseEmbeddings

    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:

    - endpoint_url (str): The url of an OpenAI API compatible endpoint.

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    class EndpointEmbeddings(BaseEmbeddings):\n    \"\"\"\n    An Embeddings component that uses an OpenAI API compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of an OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"\n        Generate embeddings from text Args:\n            text (str | list[str] | Document | list[Document]): text to generate\n            embeddings from\n        Returns:\n            list[DocumentWithEmbedding]: embeddings\n        \"\"\"\n        if not isinstance(text, list):\n            text = [text]\n\n        outputs = []\n\n        for item in text:\n            response = requests.post(\n                self.endpoint_url, json={\"input\": str(item)}\n            ).json()\n            outputs.append(\n                DocumentWithEmbedding(\n                    text=str(item),\n                    embedding=response[\"data\"][0][\"embedding\"],\n                    total_tokens=response[\"usage\"][\"total_tokens\"],\n                    prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n                )\n            )\n\n        return outputs\n
    "},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings.run","title":"run","text":"
    run(text)\n
    Generate embeddings from text.

    Parameters:

    - text (str | list[str] | Document | list[Document]): text to generate embeddings from

    Returns:

    - list[DocumentWithEmbedding]: embeddings

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n    \"\"\"\n    Generate embeddings from text Args:\n        text (str | list[str] | Document | list[Document]): text to generate\n        embeddings from\n    Returns:\n        list[DocumentWithEmbedding]: embeddings\n    \"\"\"\n    if not isinstance(text, list):\n        text = [text]\n\n    outputs = []\n\n    for item in text:\n        response = requests.post(\n            self.endpoint_url, json={\"input\": str(item)}\n        ).json()\n        outputs.append(\n            DocumentWithEmbedding(\n                text=str(item),\n                embedding=response[\"data\"][0][\"embedding\"],\n                total_tokens=response[\"usage\"][\"total_tokens\"],\n                prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n            )\n        )\n\n    return outputs\n
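    A usage sketch; the URL is a placeholder, and the endpoint must return the OpenAI-style response shape ({"data": [{"embedding": ...}], "usage": {...}}) that `run` parses above:

    ```python
    from kotaemon.embeddings.endpoint_based import EndpointEmbeddings

    embedder = EndpointEmbeddings(endpoint_url="http://localhost:8000/v1/embeddings")
    docs = embedder.run(["first text", "second text"])
    print(len(docs), len(docs[0].embedding))
    ```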
    "},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":"

    Bases: BaseEmbeddings

    Utilize the fastembed library for embeddings locally, without a GPU.

    Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/ Code: https://github.com/qdrant/fastembed

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    class FastEmbedEmbeddings(BaseEmbeddings):\n    \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n    Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n    Code: https://github.com/qdrant/fastembed\n    \"\"\"\n\n    model_name: str = Param(\n        \"BAAI/bge-small-en-v1.5\",\n        help=(\n            \"Model name for fastembed. Please refer \"\n            \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n            \"for the list of supported models.\"\n        ),\n        required=True,\n    )\n    batch_size: int = Param(\n        256,\n        help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n    )\n    parallel: Optional[int] = Param(\n        None,\n        help=(\n            \"Number of threads to use for embeddings. \"\n            \"If > 1, data-parallel encoding will be used. \"\n            \"If 0, use all available CPUs. \"\n            \"If None, use default onnxruntime threading. \"\n            \"Defaults to None.\"\n        ),\n    )\n\n    @Param.auto()\n    def client_(self) -> \"TextEmbedding\":\n        try:\n            from fastembed import TextEmbedding\n        except ImportError:\n            raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n        return TextEmbedding(model_name=self.model_name)\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        embeddings = self.client_.embed(\n            [_.content for _ in input_],\n            batch_size=self.batch_size,\n            parallel=self.parallel,\n        )\n        return [\n            DocumentWithEmbedding(\n                content=doc,\n                embedding=list(embedding),\n            )\n            for doc, embedding in zip(input_, embeddings)\n        ]\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"Fastembed does not support async API.\"\"\"\n        return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings.ainvoke","title":"ainvoke async","text":"
    ainvoke(text, *args, **kwargs)\n

    Fastembed does not support async API.

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    async def ainvoke(\n    self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n    \"\"\"Fastembed does not support async API.\"\"\"\n    return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/#embeddings.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        api_version: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment=deployment,\n            api_version=api_version,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import AzureOpenAIEmbeddings\n\n        return AzureOpenAIEmbeddings\n
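    A hedged constructor sketch using the parameters shown above; the endpoint, deployment, key, and API version are placeholders you must replace:

    ```python
    from kotaemon.embeddings import LCAzureOpenAIEmbeddings  # assumed export path

    model = LCAzureOpenAIEmbeddings(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        deployment="<your-embedding-deployment>",                    # placeholder
        openai_api_key="<your-azure-openai-key>",                    # placeholder
        api_version="2024-02-01",  # check your Azure resource for valid versions
    )
    embedded = model("some text to embed")
    ```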
    "},{"location":"reference/embeddings/#embeddings.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's Cohere embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n    cohere_api_key: str = Param(\n        help=\"API key (https://dashboard.cohere.com/api-keys)\",\n        default=None,\n        required=True,\n    )\n    model: str = Param(\n        help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n        default=None,\n        required=True,\n    )\n    user_agent: str = Param(\n        help=\"User agent (leave default)\", default=\"default\", required=True\n    )\n\n    def __init__(\n        self,\n        model: str = \"embed-english-v2.0\",\n        cohere_api_key: Optional[str] = None,\n        truncate: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            cohere_api_key=cohere_api_key,\n            truncate=truncate,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_cohere import CohereEmbeddings\n        except ImportError:\n            from langchain.embeddings import CohereEmbeddings\n\n        return CohereEmbeddings\n
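    A minimal sketch with the documented default model (the API key is a placeholder):

    ```python
    from kotaemon.embeddings import LCCohereEmbeddings  # assumed export path

    model = LCCohereEmbeddings(
        model="embed-english-v2.0",          # the documented default
        cohere_api_key="<your-cohere-key>",  # placeholder
    )
    embedded = model(["first passage", "second passage"])
    ```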
    "},{"location":"reference/embeddings/#embeddings.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's HuggingFace embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n    model_name: str = Param(\n        help=(\n            \"Model name to use (https://huggingface.co/models?\"\n            \"pipeline_tag=sentence-similarity&sort=trending)\"\n        ),\n        default=None,\n        required=True,\n    )\n\n    def __init__(\n        self,\n        model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n        **params,\n    ):\n        super().__init__(\n            model_name=model_name,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n        except ImportError:\n            from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n        return HuggingFaceBgeEmbeddings\n
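    A minimal sketch with the documented default model; this runs locally through sentence-transformers, so no API key is needed:

    ```python
    from kotaemon.embeddings import LCHuggingFaceEmbeddings  # assumed export path

    model = LCHuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"  # the documented default
    )
    embedded = model("embed this locally")
    ```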
    "},{"location":"reference/embeddings/#embeddings.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's OpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model: str = \"text-embedding-ada-002\",\n        openai_api_version: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        openai_api_type: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            openai_api_version=openai_api_version,\n            openai_api_base=openai_api_base,\n            openai_api_type=openai_api_type,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import OpenAIEmbeddings\n\n        return OpenAIEmbeddings\n
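    A minimal sketch with the documented default model (the API key is a placeholder):

    ```python
    from kotaemon.embeddings import LCOpenAIEmbeddings  # assumed export path

    model = LCOpenAIEmbeddings(
        model="text-embedding-ada-002",      # the documented default
        openai_api_key="<your-openai-key>",  # placeholder
    )
    embedded = model("some text")
    ```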
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n    azure_endpoint: str = Param(\n        None,\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n    api_version: str = Param(None, help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.azure_deployment,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
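    A constructor sketch using the required parameters listed above (all values are placeholders):

    ```python
    from kotaemon.embeddings import AzureOpenAIEmbeddings  # assumed export path

    model = AzureOpenAIEmbeddings(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        azure_deployment="<your-embedding-deployment>",              # placeholder
        api_version="2024-02-01",                                    # placeholder
        api_key="<your-azure-openai-key>",                           # placeholder
    )
    embedded = model("text to embed")
    ```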
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    - async_version (bool): Whether to get the async version of the client. Default: False

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.azure_deployment,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    OpenAI embedding model

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(\n        None,\n        help=(\n            \"ID of the model to use. You can go to [Model overview](https://platform.\"\n            \"openai.com/docs/models/overview) to see the available models.\"\n        ),\n        required=True,\n    )\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.model,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
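    A usage sketch showing the `dimensions` parameter inherited from BaseOpenAIEmbeddings (the API key is a placeholder):

    ```python
    from kotaemon.embeddings import OpenAIEmbeddings  # assumed export path

    model = OpenAIEmbeddings(
        api_key="<your-openai-key>",  # placeholder
        model="text-embedding-3-small",
        dimensions=256,  # only supported by text-embedding-3 and later models
    )
    embedded = model(["first", "second"])
    print(len(embedded[0].embedding))  # 256, when the API honors `dimensions`
    ```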
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    - async_version (bool): Whether to get the async version of the client. Default: False

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.model,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/base/","title":"Base","text":""},{"location":"reference/embeddings/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings","title":"EndpointEmbeddings","text":"

    Bases: BaseEmbeddings

    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:

    - endpoint_url (str): The URL of an OpenAI API compatible endpoint.

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    class EndpointEmbeddings(BaseEmbeddings):\n    \"\"\"\n    An Embeddings component that uses an OpenAI API compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of an OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"\n        Generate embeddings from text Args:\n            text (str | list[str] | Document | list[Document]): text to generate\n            embeddings from\n        Returns:\n            list[DocumentWithEmbedding]: embeddings\n        \"\"\"\n        if not isinstance(text, list):\n            text = [text]\n\n        outputs = []\n\n        for item in text:\n            response = requests.post(\n                self.endpoint_url, json={\"input\": str(item)}\n            ).json()\n            outputs.append(\n                DocumentWithEmbedding(\n                    text=str(item),\n                    embedding=response[\"data\"][0][\"embedding\"],\n                    total_tokens=response[\"usage\"][\"total_tokens\"],\n                    prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n                )\n            )\n\n        return outputs\n
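    A minimal sketch against a local OpenAI-compatible server (the URL is a placeholder; any server implementing the /v1/embeddings schema should work):

    ```python
    from kotaemon.embeddings.endpoint_based import EndpointEmbeddings  # assumed import path

    model = EndpointEmbeddings(
        endpoint_url="http://localhost:8000/v1/embeddings"  # placeholder
    )
    embedded = model.run("some text")  # one POST request per input item
    ```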
    "},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings.run","title":"run","text":"
    run(text)\n
    Generate embeddings from text.

    Parameters:

    - text (str | list[str] | Document | list[Document]): text to generate embeddings from

    Returns:

    - list[DocumentWithEmbedding]: embeddings

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n    \"\"\"\n    Generate embeddings from text Args:\n        text (str | list[str] | Document | list[Document]): text to generate\n        embeddings from\n    Returns:\n        list[DocumentWithEmbedding]: embeddings\n    \"\"\"\n    if not isinstance(text, list):\n        text = [text]\n\n    outputs = []\n\n    for item in text:\n        response = requests.post(\n            self.endpoint_url, json={\"input\": str(item)}\n        ).json()\n        outputs.append(\n            DocumentWithEmbedding(\n                text=str(item),\n                embedding=response[\"data\"][0][\"embedding\"],\n                total_tokens=response[\"usage\"][\"total_tokens\"],\n                prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n            )\n        )\n\n    return outputs\n
    "},{"location":"reference/embeddings/fastembed/","title":"Fastembed","text":""},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":"

    Bases: BaseEmbeddings

    Utilize the fastembed library to compute embeddings locally, without a GPU.

    Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/ (code: https://github.com/qdrant/fastembed)

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    class FastEmbedEmbeddings(BaseEmbeddings):\n    \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n    Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n    Code: https://github.com/qdrant/fastembed\n    \"\"\"\n\n    model_name: str = Param(\n        \"BAAI/bge-small-en-v1.5\",\n        help=(\n            \"Model name for fastembed. Please refer \"\n            \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n            \"for the list of supported models.\"\n        ),\n        required=True,\n    )\n    batch_size: int = Param(\n        256,\n        help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n    )\n    parallel: Optional[int] = Param(\n        None,\n        help=(\n            \"Number of threads to use for embeddings. \"\n            \"If > 1, data-parallel encoding will be used. \"\n            \"If 0, use all available CPUs. \"\n            \"If None, use default onnxruntime threading. \"\n            \"Defaults to None.\"\n        ),\n    )\n\n    @Param.auto()\n    def client_(self) -> \"TextEmbedding\":\n        try:\n            from fastembed import TextEmbedding\n        except ImportError:\n            raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n        return TextEmbedding(model_name=self.model_name)\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        embeddings = self.client_.embed(\n            [_.content for _ in input_],\n            batch_size=self.batch_size,\n            parallel=self.parallel,\n        )\n        return [\n            DocumentWithEmbedding(\n                content=doc,\n                embedding=list(embedding),\n            )\n            for doc, embedding in zip(input_, embeddings)\n        ]\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"Fastembed does not support async API.\"\"\"\n        return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings.ainvoke","title":"ainvoke async","text":"
    ainvoke(text, *args, **kwargs)\n

    Fastembed does not support async API.

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    async def ainvoke(\n    self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n    \"\"\"Fastembed does not support async API.\"\"\"\n    return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's OpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model: str = \"text-embedding-ada-002\",\n        openai_api_version: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        openai_api_type: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            openai_api_version=openai_api_version,\n            openai_api_base=openai_api_base,\n            openai_api_type=openai_api_type,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import OpenAIEmbeddings\n\n        return OpenAIEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        api_version: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment=deployment,\n            api_version=api_version,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import AzureOpenAIEmbeddings\n\n        return AzureOpenAIEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's Cohere embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n    cohere_api_key: str = Param(\n        help=\"API key (https://dashboard.cohere.com/api-keys)\",\n        default=None,\n        required=True,\n    )\n    model: str = Param(\n        help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n        default=None,\n        required=True,\n    )\n    user_agent: str = Param(\n        help=\"User agent (leave default)\", default=\"default\", required=True\n    )\n\n    def __init__(\n        self,\n        model: str = \"embed-english-v2.0\",\n        cohere_api_key: Optional[str] = None,\n        truncate: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            cohere_api_key=cohere_api_key,\n            truncate=truncate,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_cohere import CohereEmbeddings\n        except ImportError:\n            from langchain.embeddings import CohereEmbeddings\n\n        return CohereEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's HuggingFace embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n    model_name: str = Param(\n        help=(\n            \"Model name to use (https://huggingface.co/models?\"\n            \"pipeline_tag=sentence-similarity&sort=trending)\"\n        ),\n        default=None,\n        required=True,\n    )\n\n    def __init__(\n        self,\n        model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n        **params,\n    ):\n        super().__init__(\n            model_name=model_name,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n        except ImportError:\n            from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n        return HuggingFaceBgeEmbeddings\n
    "},{"location":"reference/embeddings/openai/","title":"Openai","text":""},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings","title":"BaseOpenAIEmbeddings","text":"

    Bases: BaseEmbeddings

    Base interface for OpenAI embedding model, using the openai library.

    This class exposes the parameters in resources.Chat. To subclass this class:

    - Implement the `prepare_client` method to return the OpenAI client
    - Implement the `openai_response` method to return the OpenAI response
    - Implement the params related to the OpenAI client (see the subclass sketch after the source listing below)
    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class BaseOpenAIEmbeddings(BaseEmbeddings):\n    \"\"\"Base interface for OpenAI embedding model, using the openai library.\n\n    This class exposes the parameters in resources.Chat. To subclass this class:\n\n        - Implement the `prepare_client` method to return the OpenAI client\n        - Implement the `openai_response` method to return the OpenAI response\n        - Implement the params related to the OpenAI client\n    \"\"\"\n\n    _dependencies = [\"openai\"]\n\n    api_key: str = Param(None, help=\"API key\", required=True)\n    timeout: Optional[float] = Param(None, help=\"Timeout for the API request.\")\n    max_retries: Optional[int] = Param(\n        None, help=\"Maximum number of retries for the API request.\"\n    )\n\n    dimensions: Optional[int] = Param(\n        None,\n        help=(\n            \"The number of dimensions the resulting output embeddings should have. \"\n            \"Only supported in `text-embedding-3` and later models.\"\n        ),\n    )\n    context_length: Optional[int] = Param(\n        None, help=\"The maximum context length of the embedding model\"\n    )\n\n    @Param.auto(depends_on=[\"max_retries\"])\n    def max_retries_(self):\n        if self.max_retries is None:\n            from openai._constants import DEFAULT_MAX_RETRIES\n\n            return DEFAULT_MAX_RETRIES\n        return self.max_retries\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        raise NotImplementedError\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        raise NotImplementedError\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_doc = self.prepare_input(text)\n        client = self.prepare_client(async_version=False)\n\n        input_: list[str | list[int]] = []\n        splitted_indices = {}\n        for idx, text in enumerate(input_doc):\n            if self.context_length:\n                chunks = split_text_by_chunk_size(text.text or \" \", self.context_length)\n                splitted_indices[idx] = (len(input_), len(input_) + len(chunks))\n                input_.extend(chunks)\n            else:\n                splitted_indices[idx] = (len(input_), len(input_) + 1)\n                input_.append(text.text)\n\n        resp = self.openai_response(client, input=input_, **kwargs).dict()\n        output_ = list(sorted(resp[\"data\"], key=lambda x: x[\"index\"]))\n\n        output = []\n        for idx, doc in enumerate(input_doc):\n            embs = output_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n            if len(embs) == 1:\n                output.append(\n                    DocumentWithEmbedding(embedding=embs[0][\"embedding\"], content=doc)\n                )\n                continue\n\n            chunk_lens = [\n                len(_)\n                for _ in input_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n            ]\n            vs: list[list[float]] = [_[\"embedding\"] for _ in embs]\n            emb = np.average(vs, axis=0, weights=chunk_lens)\n            emb = emb / np.linalg.norm(emb)\n            output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))\n\n        return output\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        client = self.prepare_client(async_version=True)\n        resp = (\n            await self.openai_response(\n                client, input=[_.text if _.text else \" \" for _ in input_], **kwargs\n            )\n        ).dict()\n        output_ = sorted(resp[\"data\"], key=lambda x: x[\"index\"])\n        return [\n            DocumentWithEmbedding(embedding=o[\"embedding\"], content=i)\n            for i, o in zip(input_, output_)\n        ]\n
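    Following those three steps, a minimal subclass sketch for a hypothetical OpenAI-compatible server; the class name and its `base_url`/`model` fields are illustrative, not part of the library:

    ```python
    from typing import Optional

    from kotaemon.base import Param  # assumed import path for Param
    from kotaemon.embeddings.openai import BaseOpenAIEmbeddings


    class CompatServerEmbeddings(BaseOpenAIEmbeddings):
        base_url: Optional[str] = Param(None, help="URL of the OpenAI-compatible server")
        model: str = Param(None, help="Model ID on the server", required=True)

        def prepare_client(self, async_version: bool = False):
            # Step 1: return the (async) OpenAI client pointed at the server
            params = {
                "api_key": self.api_key,
                "base_url": self.base_url,
                "timeout": self.timeout,
                "max_retries": self.max_retries_,
            }
            if async_version:
                from openai import AsyncOpenAI

                return AsyncOpenAI(**params)
            from openai import OpenAI

            return OpenAI(**params)

        def openai_response(self, client, **kwargs):
            # Step 2: issue the embeddings request with the client-specific params
            params: dict = {"model": self.model}
            if self.dimensions:
                params["dimensions"] = self.dimensions
            params.update(kwargs)
            return client.embeddings.create(**params)
    ```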
    "},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    - async_version (bool): Whether to get the async version of the client. Default: False

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    OpenAI embedding model

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(\n        None,\n        help=(\n            \"ID of the model to use. You can go to [Model overview](https://platform.\"\n            \"openai.com/docs/models/overview) to see the available models.\"\n        ),\n        required=True,\n    )\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.model,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    - async_version (bool): Whether to get the async version of the client. Default: False

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.model,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n    azure_endpoint: str = Param(\n        None,\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n    api_version: str = Param(None, help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.azure_deployment,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    - async_version (bool): Whether to get the async version of the client. Default: False

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.azure_deployment,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.split_text_by_chunk_size","title":"split_text_by_chunk_size","text":"
    split_text_by_chunk_size(text, chunk_size)\n

    Split the text into chunks of a given size

    Parameters:

    - text (str): text to split. Required.
    - chunk_size (int): size of each chunk. Required.

    Returns:

    - list[list[int]]: list of chunks (as tokens)

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def split_text_by_chunk_size(text: str, chunk_size: int) -> list[list[int]]:\n    \"\"\"Split the text into chunks of a given size\n\n    Args:\n        text: text to split\n        chunk_size: size of each chunk\n\n    Returns:\n        list of chunks (as tokens)\n    \"\"\"\n    encoding = tiktoken.get_encoding(\"cl100k_base\")\n    tokens = iter(encoding.encode(text))\n    result = []\n    while chunk := list(islice(tokens, chunk_size)):\n        result.append(chunk)\n    return result\n
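    An illustration of the chunking behaviour (the import path is assumed from the source file shown above):

    ```python
    import tiktoken

    from kotaemon.embeddings.openai import split_text_by_chunk_size  # assumed path

    chunks = split_text_by_chunk_size("one two three four five six", chunk_size=3)
    enc = tiktoken.get_encoding("cl100k_base")
    for chunk in chunks:
        # Each chunk holds at most 3 cl100k_base token ids.
        print(chunk, "->", enc.decode(chunk))
    ```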
    "},{"location":"reference/indices/","title":"Indices","text":""},{"location":"reference/indices/#indices.VectorIndexing","title":"VectorIndexing","text":"

    Bases: BaseIndexing

    Ingest the document, run through the embedding, and store the embedding in a vector store.

    This pipeline supports the following set of inputs:

    - List of documents
    - List of texts

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorIndexing(BaseIndexing):\n    \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n    vector store.\n\n    This pipeline supports the following set of inputs:\n        - List of documents\n        - List of texts\n    \"\"\"\n\n    cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    count_: int = 0\n\n    def to_retrieval_pipeline(self, *args, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        return VectorRetrieval(\n            vector_store=self.vector_store,\n            doc_store=self.doc_store,\n            embedding=self.embedding,\n            **kwargs,\n        )\n\n    def to_qa_pipeline(self, *args, **kwargs):\n        from .qa import CitationQAPipeline\n\n        return TextVectorQA(\n            retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n            qa_pipeline=CitationQAPipeline(**kwargs),\n        )\n\n    def write_chunk_to_file(self, docs: list[Document]):\n        # save the chunks content into markdown format\n        if self.cache_dir:\n            file_name = Path(docs[0].metadata[\"file_name\"])\n            for i in range(len(docs)):\n                markdown_content = \"\"\n                if \"page_label\" in docs[i].metadata:\n                    page_label = str(docs[i].metadata[\"page_label\"])\n                    markdown_content += f\"Page label: {page_label}\"\n                if \"file_name\" in docs[i].metadata:\n                    filename = docs[i].metadata[\"file_name\"]\n                    markdown_content += f\"\\nFile name: {filename}\"\n                if \"section\" in docs[i].metadata:\n                    section = docs[i].metadata[\"section\"]\n                    markdown_content += f\"\\nSection: {section}\"\n                if \"type\" in docs[i].metadata:\n                    if docs[i].metadata[\"type\"] == \"image\":\n                        image_origin = docs[i].metadata[\"image_origin\"]\n                        image_origin = f'<p><img src=\"{image_origin}\"></p>'\n                        markdown_content += f\"\\nImage origin: {image_origin}\"\n                if docs[i].text:\n                    markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n                with open(\n                    Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n                    \"w\",\n                    encoding=\"utf-8\",\n                ) as f:\n                    f.write(markdown_content)\n\n    def add_to_docstore(self, docs: list[Document]):\n        if self.doc_store:\n            print(\"Adding documents to doc store\")\n            self.doc_store.add(docs)\n\n    def add_to_vectorstore(self, docs: list[Document]):\n        # in case we want to skip embedding\n        if self.vector_store:\n            print(f\"Getting embeddings for {len(docs)} nodes\")\n            embeddings = self.embedding(docs)\n            print(\"Adding embeddings to vector store\")\n            self.vector_store.add(\n                embeddings=embeddings,\n                ids=[t.doc_id for t in docs],\n            )\n\n    def run(self, text: str | list[str] | Document | list[Document]):\n        input_: list[Document] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in cast(list, text):\n            if isinstance(item, str):\n                input_.append(Document(text=item, id_=str(uuid.uuid4())))\n            elif isinstance(item, Document):\n                input_.append(item)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        self.add_to_vectorstore(input_)\n        self.add_to_docstore(input_)\n        self.write_chunk_to_file(input_)\n        self.count_ += len(input_)\n
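    A minimal end-to-end sketch; the in-memory store names are assumptions about `kotaemon.storages` and may differ in your installed version:

    ```python
    from kotaemon.embeddings import FastEmbedEmbeddings
    from kotaemon.indices import VectorIndexing
    from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore  # assumed

    index = VectorIndexing(
        vector_store=InMemoryVectorStore(),
        doc_store=InMemoryDocumentStore(),
        embedding=FastEmbedEmbeddings(),
        cache_dir=None,  # skip writing chunk files in this sketch
    )
    index(["first document", "second document"])  # embeds and stores both inputs
    ```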
    "},{"location":"reference/indices/#indices.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(*args, **kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def to_retrieval_pipeline(self, *args, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    return VectorRetrieval(\n        vector_store=self.vector_store,\n        doc_store=self.doc_store,\n        embedding=self.embedding,\n        **kwargs,\n    )\n
    "},{"location":"reference/indices/#indices.VectorRetrieval","title":"VectorRetrieval","text":"

    Bases: BaseRetrieval

    Retrieve a list of documents from the vector store

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorRetrieval(BaseRetrieval):\n    \"\"\"Retrieve list of documents from vector store\"\"\"\n\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    rerankers: Sequence[BaseReranking] = []\n    top_k: int = 5\n    first_round_top_k_mult: int = 10\n    retrieval_mode: str = \"hybrid\"  # vector, text, hybrid\n\n    def _filter_docs(\n        self, documents: list[RetrievedDocument], top_k: int | None = None\n    ):\n        if top_k:\n            documents = documents[:top_k]\n        return documents\n\n    def run(\n        self, text: str | Document, top_k: Optional[int] = None, **kwargs\n    ) -> list[RetrievedDocument]:\n        \"\"\"Retrieve a list of documents from vector store\n\n        Args:\n            text: the text to retrieve similar documents\n            top_k: number of top similar documents to return\n\n        Returns:\n            list[RetrievedDocument]: list of retrieved documents\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n\n        do_extend = kwargs.pop(\"do_extend\", False)\n        thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n        if do_extend:\n            top_k_first_round = top_k * self.first_round_top_k_mult\n        else:\n            top_k_first_round = top_k\n\n        if self.doc_store is None:\n            raise ValueError(\n                \"doc_store is not provided. Please provide a doc_store to \"\n                \"retrieve the documents\"\n            )\n\n        result: list[RetrievedDocument] = []\n        # TODO: should declare scope directly in the run params\n        scope = kwargs.pop(\"scope\", None)\n        emb: list[float]\n\n        if self.retrieval_mode == \"vector\":\n            emb = self.embedding(text)[0].embedding\n            _, scores, ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            docs = self.doc_store.get(ids)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(docs, scores)\n            ]\n        elif self.retrieval_mode == \"text\":\n            query = text.text if isinstance(text, Document) else text\n            docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n            result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n        elif self.retrieval_mode == \"hybrid\":\n            # similarity search section\n            emb = self.embedding(text)[0].embedding\n            vs_docs: list[RetrievedDocument] = []\n            vs_ids: list[str] = []\n            vs_scores: list[float] = []\n\n            def query_vectorstore():\n                nonlocal vs_docs\n                nonlocal vs_scores\n                nonlocal vs_ids\n\n                assert self.doc_store is not None\n                _, vs_scores, vs_ids = self.vector_store.query(\n                    embedding=emb, top_k=top_k_first_round, **kwargs\n                )\n                if vs_ids:\n                    vs_docs = self.doc_store.get(vs_ids)\n\n            # full-text search section\n            ds_docs: list[RetrievedDocument] = []\n\n            def query_docstore():\n                nonlocal ds_docs\n\n                assert self.doc_store is not None\n                query = text.text if isinstance(text, Document) else text\n                ds_docs = self.doc_store.query(\n                    query, top_k=top_k_first_round, doc_ids=scope\n                )\n\n            vs_query_thread = threading.Thread(target=query_vectorstore)\n            ds_query_thread = threading.Thread(target=query_docstore)\n\n            vs_query_thread.start()\n            ds_query_thread.start()\n\n            vs_query_thread.join()\n            ds_query_thread.join()\n\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=-1.0)\n                for doc in ds_docs\n                if doc not in vs_ids\n            ]\n            result += [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(vs_docs, vs_scores)\n            ]\n            print(f\"Got {len(vs_docs)} from vectorstore\")\n            print(f\"Got {len(ds_docs)} from docstore\")\n\n        # use additional reranker to re-order the document list\n        if self.rerankers and text:\n            for reranker in self.rerankers:\n                # if reranker is LLMReranking, limit the document with top_k items only\n                if isinstance(reranker, LLMReranking):\n                    result = self._filter_docs(result, top_k=top_k)\n                result = reranker(documents=result, query=text)\n\n        result = self._filter_docs(result, top_k=top_k)\n        print(f\"Got raw {len(result)} retrieved documents\")\n\n        # add page thumbnails to the result if exists\n        thumbnail_doc_ids: set[str] = set()\n        # we should copy the text from retrieved text chunk\n        # to the thumbnail to get relevant LLM score correctly\n        text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n        non_thumbnail_docs = []\n        raw_thumbnail_docs = []\n        for doc in result:\n            if doc.metadata.get(\"type\") == \"thumbnail\":\n                # change type to image to display on UI\n                doc.metadata[\"type\"] = \"image\"\n                raw_thumbnail_docs.append(doc)\n                continue\n            if (\n                \"thumbnail_doc_id\" in doc.metadata\n                and len(thumbnail_doc_ids) < thumbnail_count\n            ):\n                thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n                thumbnail_doc_ids.add(thumbnail_id)\n                text_thumbnail_docs[thumbnail_id] = doc\n            else:\n                non_thumbnail_docs.append(doc)\n\n        linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n        print(\n            \"thumbnail docs\",\n            len(linked_thumbnail_docs),\n            \"non-thumbnail docs\",\n            len(non_thumbnail_docs),\n            \"raw-thumbnail docs\",\n            len(raw_thumbnail_docs),\n        )\n        additional_docs = []\n\n        for thumbnail_doc in linked_thumbnail_docs:\n            text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n            doc_dict = thumbnail_doc.to_dict()\n            doc_dict[\"_id\"] = text_doc.doc_id\n            doc_dict[\"content\"] = text_doc.content\n            doc_dict[\"metadata\"][\"type\"] = \"image\"\n            for key in text_doc.metadata:\n                if key not in doc_dict[\"metadata\"]:\n                    doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n            additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n        result = additional_docs + non_thumbnail_docs\n\n        if not result:\n            # return output from raw retrieved thumbnails\n            result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n        return result\n
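    Continuing the indexing sketch above, a retriever can be built with `to_retrieval_pipeline`; "vector" mode is used here so the doc store does not need full-text search support:

    ```python
    # `index` is the VectorIndexing instance from the previous sketch.
    retriever = index.to_retrieval_pipeline(retrieval_mode="vector", top_k=2)

    for doc in retriever("first"):
        print(doc.score, doc.text)
    ```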
    "},{"location":"reference/indices/#indices.VectorRetrieval.run","title":"run","text":"
    run(text, top_k=None, **kwargs)\n

    Retrieve a list of documents from vector store

    Parameters:

    - text (str | Document): the text to retrieve similar documents for. Required.
    - top_k (Optional[int]): number of top similar documents to return. Default: None

    Returns:

    - list[RetrievedDocument]: list of retrieved documents

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def run(\n    self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n    \"\"\"Retrieve a list of documents from vector store\n\n    Args:\n        text: the text to retrieve similar documents\n        top_k: number of top similar documents to return\n\n    Returns:\n        list[RetrievedDocument]: list of retrieved documents\n    \"\"\"\n    if top_k is None:\n        top_k = self.top_k\n\n    do_extend = kwargs.pop(\"do_extend\", False)\n    thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n    if do_extend:\n        top_k_first_round = top_k * self.first_round_top_k_mult\n    else:\n        top_k_first_round = top_k\n\n    if self.doc_store is None:\n        raise ValueError(\n            \"doc_store is not provided. Please provide a doc_store to \"\n            \"retrieve the documents\"\n        )\n\n    result: list[RetrievedDocument] = []\n    # TODO: should declare scope directly in the run params\n    scope = kwargs.pop(\"scope\", None)\n    emb: list[float]\n\n    if self.retrieval_mode == \"vector\":\n        emb = self.embedding(text)[0].embedding\n        _, scores, ids = self.vector_store.query(\n            embedding=emb, top_k=top_k_first_round, **kwargs\n        )\n        docs = self.doc_store.get(ids)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(docs, scores)\n        ]\n    elif self.retrieval_mode == \"text\":\n        query = text.text if isinstance(text, Document) else text\n        docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n    elif self.retrieval_mode == \"hybrid\":\n        # similarity search section\n        emb = self.embedding(text)[0].embedding\n        vs_docs: list[RetrievedDocument] = []\n        vs_ids: list[str] = []\n        vs_scores: list[float] = []\n\n        def query_vectorstore():\n            nonlocal vs_docs\n            nonlocal vs_scores\n            nonlocal vs_ids\n\n            assert self.doc_store is not None\n            _, vs_scores, vs_ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            if vs_ids:\n                vs_docs = self.doc_store.get(vs_ids)\n\n        # full-text search section\n        ds_docs: list[RetrievedDocument] = []\n\n        def query_docstore():\n            nonlocal ds_docs\n\n            assert self.doc_store is not None\n            query = text.text if isinstance(text, Document) else text\n            ds_docs = self.doc_store.query(\n                query, top_k=top_k_first_round, doc_ids=scope\n            )\n\n        vs_query_thread = threading.Thread(target=query_vectorstore)\n        ds_query_thread = threading.Thread(target=query_docstore)\n\n        vs_query_thread.start()\n        ds_query_thread.start()\n\n        vs_query_thread.join()\n        ds_query_thread.join()\n\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=-1.0)\n            for doc in ds_docs\n            if doc not in vs_ids\n        ]\n        result += [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(vs_docs, vs_scores)\n        ]\n        print(f\"Got {len(vs_docs)} from vectorstore\")\n        print(f\"Got {len(ds_docs)} from docstore\")\n\n    # use additional reranker to re-order the document list\n    if self.rerankers and text:\n        for reranker in self.rerankers:\n            # if reranker is LLMReranking, limit the document with top_k items only\n            if isinstance(reranker, LLMReranking):\n                result = self._filter_docs(result, top_k=top_k)\n            result = reranker(documents=result, query=text)\n\n    result = self._filter_docs(result, top_k=top_k)\n    print(f\"Got raw {len(result)} retrieved documents\")\n\n    # add page thumbnails to the result if exists\n    thumbnail_doc_ids: set[str] = set()\n    # we should copy the text from retrieved text chunk\n    # to the thumbnail to get relevant LLM score correctly\n    text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n    non_thumbnail_docs = []\n    raw_thumbnail_docs = []\n    for doc in result:\n        if doc.metadata.get(\"type\") == \"thumbnail\":\n            # change type to image to display on UI\n            doc.metadata[\"type\"] = \"image\"\n            raw_thumbnail_docs.append(doc)\n            continue\n        if (\n            \"thumbnail_doc_id\" in doc.metadata\n            and len(thumbnail_doc_ids) < thumbnail_count\n        ):\n            thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n            thumbnail_doc_ids.add(thumbnail_id)\n            text_thumbnail_docs[thumbnail_id] = doc\n        else:\n            non_thumbnail_docs.append(doc)\n\n    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n    print(\n        \"thumbnail docs\",\n        len(linked_thumbnail_docs),\n        \"non-thumbnail docs\",\n        len(non_thumbnail_docs),\n        \"raw-thumbnail docs\",\n        len(raw_thumbnail_docs),\n    )\n    additional_docs = []\n\n    for thumbnail_doc in linked_thumbnail_docs:\n        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n        doc_dict = thumbnail_doc.to_dict()\n        doc_dict[\"_id\"] = text_doc.doc_id\n        doc_dict[\"content\"] = text_doc.content\n        doc_dict[\"metadata\"][\"type\"] = \"image\"\n        for key in text_doc.metadata:\n            if key not in doc_dict[\"metadata\"]:\n                doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n    result = additional_docs + non_thumbnail_docs\n\n    if not result:\n        # return output from raw retrieved thumbnails\n        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n    return result\n
    "},{"location":"reference/indices/base/","title":"Base","text":""},{"location":"reference/indices/base/#indices.base.DocTransformer","title":"DocTransformer","text":"

    Bases: BaseComponent

    This is a base class for document transformers

    A document transformer transforms a list of documents into another list of documents. Transforming can mean splitting a document into multiple documents, reducing a large list of documents into a smaller list of documents, or adding metadata to each document in a list of documents, etc.

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class DocTransformer(BaseComponent):\n    \"\"\"This is a base class for document transformers\n\n    A document transformer transforms a list of documents into another list\n    of documents. Transforming can mean splitting a document into multiple documents,\n    reducing a large list of documents into a smaller list of documents, or adding\n    metadata to each document in a list of documents, etc.\n    \"\"\"\n\n    @abstractmethod\n    def run(\n        self,\n        documents: list[Document],\n        **kwargs,\n    ) -> list[Document]:\n        ...\n
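    A minimal sketch of a concrete transformer, assuming the import paths kotaemon.base.Document and kotaemon.indices.base.DocTransformer implied by the source location above; the MetadataTagger class itself is hypothetical.

        from kotaemon.base import Document
        from kotaemon.indices.base import DocTransformer


        class MetadataTagger(DocTransformer):
            """Hypothetical transformer: stamp a static tag onto each document."""

            tag: str = "ingested"

            def run(self, documents: list[Document], **kwargs) -> list[Document]:
                # "transforming" here means adding metadata to each document
                for doc in documents:
                    doc.metadata["tag"] = self.tag
                return documents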
    "},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin","title":"LlamaIndexDocTransformerMixin","text":"

    Allows automatically wrapping a LlamaIndex component into a kotaemon component

    Example

    class TokenSplitter(LlamaIndexMixin, BaseSplitter):
        def _get_li_class(self):
            from llama_index.core.text_splitter import TokenTextSplitter
            return TokenTextSplitter

    To use this mixin, please:

    1. Use this class as the first parent class, so that Python will prefer to use the attributes and methods of this class whenever possible.
    2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class LlamaIndexDocTransformerMixin:\n    \"\"\"Allow automatically wrapping a Llama-index component into kotaemon component\n\n    Example:\n        class TokenSplitter(LlamaIndexMixin, BaseSplitter):\n            def _get_li_class(self):\n                from llama_index.core.text_splitter import TokenTextSplitter\n                return TokenTextSplitter\n\n    To use this mixin, please:\n        1. Use this class as the 1st parent class, so that Python will prefer to use\n        the attributes and methods of this class whenever possible.\n        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.\n    \"\"\"\n\n    def _get_li_class(self) -> Type[NodeParser]:\n        raise NotImplementedError(\n            \"Please return the relevant LlamaIndex class in _get_li_class\"\n        )\n\n    def __init__(self, **params):\n        self._li_cls = self._get_li_class()\n        self._obj = self._li_cls(**params)\n        self._kwargs = params\n        super().__init__()\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\") or name in self._protected_keywords():\n            return super().__setattr__(name, value)\n\n        self._kwargs[name] = value\n        return setattr(self._obj, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def run(\n        self,\n        documents: list[Document],\n        **kwargs,\n    ) -> list[Document]:\n        \"\"\"Run Llama-index node parser and convert the output to Document from\n        kotaemon\n        \"\"\"\n        docs = self._obj(documents, **kwargs)  # type: ignore\n        return [Document.from_dict(doc.to_dict()) for doc in docs]\n
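    A concrete wrapper then behaves like any other kotaemon component: constructor params are stored, forwarded to the wrapped LlamaIndex object, and echoed back by repr. A small usage sketch based on the docstring example, assuming BaseSplitter is importable from kotaemon.indices.splitters:

        from kotaemon.indices.base import LlamaIndexDocTransformerMixin
        from kotaemon.indices.splitters import BaseSplitter


        class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
            def _get_li_class(self):
                from llama_index.core.text_splitter import TokenTextSplitter

                return TokenTextSplitter


        # params are kept in _kwargs and forwarded to the wrapped TokenTextSplitter
        splitter = TokenSplitter(chunk_size=1024, chunk_overlap=256)
        print(splitter)  # TokenSplitter(chunk_size=1024, chunk_overlap=256)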
    "},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin.run","title":"run","text":"
    run(documents, **kwargs)\n

    Run the LlamaIndex node parser and convert the output into kotaemon Document objects

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def run(\n    self,\n    documents: list[Document],\n    **kwargs,\n) -> list[Document]:\n    \"\"\"Run Llama-index node parser and convert the output to Document from\n    kotaemon\n    \"\"\"\n    docs = self._obj(documents, **kwargs)  # type: ignore\n    return [Document.from_dict(doc.to_dict()) for doc in docs]\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing","title":"BaseIndexing","text":"

    Bases: BaseComponent

    Define the base interface for indexing pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class BaseIndexing(BaseComponent):\n    \"\"\"Define the base interface for indexing pipeline\"\"\"\n\n    def to_retrieval_pipeline(self, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        raise NotImplementedError\n\n    def to_qa_pipeline(self, **kwargs):\n        \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n        raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(**kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def to_retrieval_pipeline(self, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_qa_pipeline","title":"to_qa_pipeline","text":"
    to_qa_pipeline(**kwargs)\n

    Convert the indexing pipeline to a QA pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def to_qa_pipeline(self, **kwargs):\n    \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseRetrieval","title":"BaseRetrieval","text":"

    Bases: BaseComponent

    Define the base interface for retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class BaseRetrieval(BaseComponent):\n    \"\"\"Define the base interface for retrieval pipeline\"\"\"\n\n    @abstractmethod\n    def run(self, *args, **kwargs) -> list[RetrievedDocument]:\n        ...\n
    "},{"location":"reference/indices/vectorindex/","title":"Vectorindex","text":""},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing","title":"VectorIndexing","text":"

    Bases: BaseIndexing

    Ingest the document, run through the embedding, and store the embedding in a vector store.

    This pipeline supports the following set of inputs:

    - List of documents
    - List of texts

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorIndexing(BaseIndexing):\n    \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n    vector store.\n\n    This pipeline supports the following set of inputs:\n        - List of documents\n        - List of texts\n    \"\"\"\n\n    cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    count_: int = 0\n\n    def to_retrieval_pipeline(self, *args, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        return VectorRetrieval(\n            vector_store=self.vector_store,\n            doc_store=self.doc_store,\n            embedding=self.embedding,\n            **kwargs,\n        )\n\n    def to_qa_pipeline(self, *args, **kwargs):\n        from .qa import CitationQAPipeline\n\n        return TextVectorQA(\n            retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n            qa_pipeline=CitationQAPipeline(**kwargs),\n        )\n\n    def write_chunk_to_file(self, docs: list[Document]):\n        # save the chunks content into markdown format\n        if self.cache_dir:\n            file_name = Path(docs[0].metadata[\"file_name\"])\n            for i in range(len(docs)):\n                markdown_content = \"\"\n                if \"page_label\" in docs[i].metadata:\n                    page_label = str(docs[i].metadata[\"page_label\"])\n                    markdown_content += f\"Page label: {page_label}\"\n                if \"file_name\" in docs[i].metadata:\n                    filename = docs[i].metadata[\"file_name\"]\n                    markdown_content += f\"\\nFile name: {filename}\"\n                if \"section\" in docs[i].metadata:\n                    section = docs[i].metadata[\"section\"]\n                    markdown_content += f\"\\nSection: {section}\"\n                if \"type\" in docs[i].metadata:\n                    if docs[i].metadata[\"type\"] == \"image\":\n                        image_origin = docs[i].metadata[\"image_origin\"]\n                        image_origin = f'<p><img src=\"{image_origin}\"></p>'\n                        markdown_content += f\"\\nImage origin: {image_origin}\"\n                if docs[i].text:\n                    markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n                with open(\n                    Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n                    \"w\",\n                    encoding=\"utf-8\",\n                ) as f:\n                    f.write(markdown_content)\n\n    def add_to_docstore(self, docs: list[Document]):\n        if self.doc_store:\n            print(\"Adding documents to doc store\")\n            self.doc_store.add(docs)\n\n    def add_to_vectorstore(self, docs: list[Document]):\n        # in case we want to skip embedding\n        if self.vector_store:\n            print(f\"Getting embeddings for {len(docs)} nodes\")\n            embeddings = self.embedding(docs)\n            print(\"Adding embeddings to vector store\")\n            self.vector_store.add(\n                embeddings=embeddings,\n                ids=[t.doc_id for t in docs],\n            )\n\n    def run(self, text: str | list[str] | Document | list[Document]):\n        input_: list[Document] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in cast(list, text):\n            if isinstance(item, str):\n                
input_.append(Document(text=item, id_=str(uuid.uuid4())))\n            elif isinstance(item, Document):\n                input_.append(item)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        self.add_to_vectorstore(input_)\n        self.add_to_docstore(input_)\n        self.write_chunk_to_file(input_)\n        self.count_ += len(input_)\n
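    A usage sketch; the concrete backends below (ChromaVectorStore, InMemoryDocumentStore, OpenAIEmbeddings) are assumed to be the classes exported from kotaemon.storages and kotaemon.embeddings, configured with whatever credentials your setup requires:

        from kotaemon.embeddings import OpenAIEmbeddings
        from kotaemon.indices.vectorindex import VectorIndexing
        from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

        indexing = VectorIndexing(
            vector_store=ChromaVectorStore(path="./chroma_db"),
            doc_store=InMemoryDocumentStore(),
            embedding=OpenAIEmbeddings(),
        )

        # run() accepts a single text, a Document, or lists of either
        indexing.run(["first chunk of text", "second chunk of text"])

        # reuse the same stores and embedding for retrieval
        retrieval = indexing.to_retrieval_pipeline(top_k=5)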
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(*args, **kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def to_retrieval_pipeline(self, *args, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    return VectorRetrieval(\n        vector_store=self.vector_store,\n        doc_store=self.doc_store,\n        embedding=self.embedding,\n        **kwargs,\n    )\n
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval","title":"VectorRetrieval","text":"

    Bases: BaseRetrieval

    Retrieve list of documents from vector store

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorRetrieval(BaseRetrieval):\n    \"\"\"Retrieve list of documents from vector store\"\"\"\n\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    rerankers: Sequence[BaseReranking] = []\n    top_k: int = 5\n    first_round_top_k_mult: int = 10\n    retrieval_mode: str = \"hybrid\"  # vector, text, hybrid\n\n    def _filter_docs(\n        self, documents: list[RetrievedDocument], top_k: int | None = None\n    ):\n        if top_k:\n            documents = documents[:top_k]\n        return documents\n\n    def run(\n        self, text: str | Document, top_k: Optional[int] = None, **kwargs\n    ) -> list[RetrievedDocument]:\n        \"\"\"Retrieve a list of documents from vector store\n\n        Args:\n            text: the text to retrieve similar documents\n            top_k: number of top similar documents to return\n\n        Returns:\n            list[RetrievedDocument]: list of retrieved documents\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n\n        do_extend = kwargs.pop(\"do_extend\", False)\n        thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n        if do_extend:\n            top_k_first_round = top_k * self.first_round_top_k_mult\n        else:\n            top_k_first_round = top_k\n\n        if self.doc_store is None:\n            raise ValueError(\n                \"doc_store is not provided. Please provide a doc_store to \"\n                \"retrieve the documents\"\n            )\n\n        result: list[RetrievedDocument] = []\n        # TODO: should declare scope directly in the run params\n        scope = kwargs.pop(\"scope\", None)\n        emb: list[float]\n\n        if self.retrieval_mode == \"vector\":\n            emb = self.embedding(text)[0].embedding\n            _, scores, ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            docs = self.doc_store.get(ids)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(docs, scores)\n            ]\n        elif self.retrieval_mode == \"text\":\n            query = text.text if isinstance(text, Document) else text\n            docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n            result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n        elif self.retrieval_mode == \"hybrid\":\n            # similarity search section\n            emb = self.embedding(text)[0].embedding\n            vs_docs: list[RetrievedDocument] = []\n            vs_ids: list[str] = []\n            vs_scores: list[float] = []\n\n            def query_vectorstore():\n                nonlocal vs_docs\n                nonlocal vs_scores\n                nonlocal vs_ids\n\n                assert self.doc_store is not None\n                _, vs_scores, vs_ids = self.vector_store.query(\n                    embedding=emb, top_k=top_k_first_round, **kwargs\n                )\n                if vs_ids:\n                    vs_docs = self.doc_store.get(vs_ids)\n\n            # full-text search section\n            ds_docs: list[RetrievedDocument] = []\n\n            def query_docstore():\n                nonlocal ds_docs\n\n                assert self.doc_store is not None\n                query = text.text if isinstance(text, Document) else text\n                ds_docs = self.doc_store.query(\n                    query, 
top_k=top_k_first_round, doc_ids=scope\n                )\n\n            vs_query_thread = threading.Thread(target=query_vectorstore)\n            ds_query_thread = threading.Thread(target=query_docstore)\n\n            vs_query_thread.start()\n            ds_query_thread.start()\n\n            vs_query_thread.join()\n            ds_query_thread.join()\n\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=-1.0)\n                for doc in ds_docs\n                if doc not in vs_ids\n            ]\n            result += [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(vs_docs, vs_scores)\n            ]\n            print(f\"Got {len(vs_docs)} from vectorstore\")\n            print(f\"Got {len(ds_docs)} from docstore\")\n\n        # use additional reranker to re-order the document list\n        if self.rerankers and text:\n            for reranker in self.rerankers:\n                # if reranker is LLMReranking, limit the document with top_k items only\n                if isinstance(reranker, LLMReranking):\n                    result = self._filter_docs(result, top_k=top_k)\n                result = reranker(documents=result, query=text)\n\n        result = self._filter_docs(result, top_k=top_k)\n        print(f\"Got raw {len(result)} retrieved documents\")\n\n        # add page thumbnails to the result if exists\n        thumbnail_doc_ids: set[str] = set()\n        # we should copy the text from retrieved text chunk\n        # to the thumbnail to get relevant LLM score correctly\n        text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n        non_thumbnail_docs = []\n        raw_thumbnail_docs = []\n        for doc in result:\n            if doc.metadata.get(\"type\") == \"thumbnail\":\n                # change type to image to display on UI\n                doc.metadata[\"type\"] = \"image\"\n                raw_thumbnail_docs.append(doc)\n                continue\n            if (\n                \"thumbnail_doc_id\" in doc.metadata\n                and len(thumbnail_doc_ids) < thumbnail_count\n            ):\n                thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n                thumbnail_doc_ids.add(thumbnail_id)\n                text_thumbnail_docs[thumbnail_id] = doc\n            else:\n                non_thumbnail_docs.append(doc)\n\n        linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n        print(\n            \"thumbnail docs\",\n            len(linked_thumbnail_docs),\n            \"non-thumbnail docs\",\n            len(non_thumbnail_docs),\n            \"raw-thumbnail docs\",\n            len(raw_thumbnail_docs),\n        )\n        additional_docs = []\n\n        for thumbnail_doc in linked_thumbnail_docs:\n            text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n            doc_dict = thumbnail_doc.to_dict()\n            doc_dict[\"_id\"] = text_doc.doc_id\n            doc_dict[\"content\"] = text_doc.content\n            doc_dict[\"metadata\"][\"type\"] = \"image\"\n            for key in text_doc.metadata:\n                if key not in doc_dict[\"metadata\"]:\n                    doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n            additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n        result = additional_docs + non_thumbnail_docs\n\n        if not result:\n            # return output from raw retrieved thumbnails\n            result = self._filter_docs(raw_thumbnail_docs, 
top_k=thumbnail_count)\n\n        return result\n
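    Besides top_k, run() pops a few keyword arguments before forwarding the rest to the vector store, as the source above shows. A call sketch, reusing the retrieval pipeline from the indexing example:

        docs = retrieval.run(
            "how are embeddings stored?",
            top_k=5,
            do_extend=True,      # first round fetches top_k * first_round_top_k_mult
            thumbnail_count=3,   # cap on page thumbnails attached to the result
            scope=None,          # optional list of doc_ids to restrict full-text search
        )
        for doc in docs:
            print(doc.score, doc.metadata.get("type"), doc.text[:60])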
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval.run","title":"run","text":"
    run(text, top_k=None, **kwargs)\n

    Retrieve a list of documents from vector store

    Parameters:

    | Name | Type | Description | Default |
    | ---- | ---- | ----------- | ------- |
    | text | str \| Document | the text to retrieve similar documents | required |
    | top_k | Optional[int] | number of top similar documents to return | None |

    Returns:

    | Type | Description |
    | ---- | ----------- |
    | list[RetrievedDocument] | list of retrieved documents |

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def run(\n    self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n    \"\"\"Retrieve a list of documents from vector store\n\n    Args:\n        text: the text to retrieve similar documents\n        top_k: number of top similar documents to return\n\n    Returns:\n        list[RetrievedDocument]: list of retrieved documents\n    \"\"\"\n    if top_k is None:\n        top_k = self.top_k\n\n    do_extend = kwargs.pop(\"do_extend\", False)\n    thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n    if do_extend:\n        top_k_first_round = top_k * self.first_round_top_k_mult\n    else:\n        top_k_first_round = top_k\n\n    if self.doc_store is None:\n        raise ValueError(\n            \"doc_store is not provided. Please provide a doc_store to \"\n            \"retrieve the documents\"\n        )\n\n    result: list[RetrievedDocument] = []\n    # TODO: should declare scope directly in the run params\n    scope = kwargs.pop(\"scope\", None)\n    emb: list[float]\n\n    if self.retrieval_mode == \"vector\":\n        emb = self.embedding(text)[0].embedding\n        _, scores, ids = self.vector_store.query(\n            embedding=emb, top_k=top_k_first_round, **kwargs\n        )\n        docs = self.doc_store.get(ids)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(docs, scores)\n        ]\n    elif self.retrieval_mode == \"text\":\n        query = text.text if isinstance(text, Document) else text\n        docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n    elif self.retrieval_mode == \"hybrid\":\n        # similarity search section\n        emb = self.embedding(text)[0].embedding\n        vs_docs: list[RetrievedDocument] = []\n        vs_ids: list[str] = []\n        vs_scores: list[float] = []\n\n        def query_vectorstore():\n            nonlocal vs_docs\n            nonlocal vs_scores\n            nonlocal vs_ids\n\n            assert self.doc_store is not None\n            _, vs_scores, vs_ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            if vs_ids:\n                vs_docs = self.doc_store.get(vs_ids)\n\n        # full-text search section\n        ds_docs: list[RetrievedDocument] = []\n\n        def query_docstore():\n            nonlocal ds_docs\n\n            assert self.doc_store is not None\n            query = text.text if isinstance(text, Document) else text\n            ds_docs = self.doc_store.query(\n                query, top_k=top_k_first_round, doc_ids=scope\n            )\n\n        vs_query_thread = threading.Thread(target=query_vectorstore)\n        ds_query_thread = threading.Thread(target=query_docstore)\n\n        vs_query_thread.start()\n        ds_query_thread.start()\n\n        vs_query_thread.join()\n        ds_query_thread.join()\n\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=-1.0)\n            for doc in ds_docs\n            if doc not in vs_ids\n        ]\n        result += [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(vs_docs, vs_scores)\n        ]\n        print(f\"Got {len(vs_docs)} from vectorstore\")\n        print(f\"Got {len(ds_docs)} from docstore\")\n\n    # use additional reranker to re-order the document list\n    if self.rerankers and text:\n        for reranker 
in self.rerankers:\n            # if reranker is LLMReranking, limit the document with top_k items only\n            if isinstance(reranker, LLMReranking):\n                result = self._filter_docs(result, top_k=top_k)\n            result = reranker(documents=result, query=text)\n\n    result = self._filter_docs(result, top_k=top_k)\n    print(f\"Got raw {len(result)} retrieved documents\")\n\n    # add page thumbnails to the result if exists\n    thumbnail_doc_ids: set[str] = set()\n    # we should copy the text from retrieved text chunk\n    # to the thumbnail to get relevant LLM score correctly\n    text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n    non_thumbnail_docs = []\n    raw_thumbnail_docs = []\n    for doc in result:\n        if doc.metadata.get(\"type\") == \"thumbnail\":\n            # change type to image to display on UI\n            doc.metadata[\"type\"] = \"image\"\n            raw_thumbnail_docs.append(doc)\n            continue\n        if (\n            \"thumbnail_doc_id\" in doc.metadata\n            and len(thumbnail_doc_ids) < thumbnail_count\n        ):\n            thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n            thumbnail_doc_ids.add(thumbnail_id)\n            text_thumbnail_docs[thumbnail_id] = doc\n        else:\n            non_thumbnail_docs.append(doc)\n\n    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n    print(\n        \"thumbnail docs\",\n        len(linked_thumbnail_docs),\n        \"non-thumbnail docs\",\n        len(non_thumbnail_docs),\n        \"raw-thumbnail docs\",\n        len(raw_thumbnail_docs),\n    )\n    additional_docs = []\n\n    for thumbnail_doc in linked_thumbnail_docs:\n        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n        doc_dict = thumbnail_doc.to_dict()\n        doc_dict[\"_id\"] = text_doc.doc_id\n        doc_dict[\"content\"] = text_doc.content\n        doc_dict[\"metadata\"][\"type\"] = \"image\"\n        for key in text_doc.metadata:\n            if key not in doc_dict[\"metadata\"]:\n                doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n    result = additional_docs + non_thumbnail_docs\n\n    if not result:\n        # return output from raw retrieved thumbnails\n        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n    return result\n
    "},{"location":"reference/indices/extractors/","title":"Extractors","text":""},{"location":"reference/indices/extractors/doc_parsers/","title":"Doc Parsers","text":""},{"location":"reference/indices/ingests/","title":"Ingests","text":""},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor","title":"DocumentIngestor","text":"

    Bases: BaseComponent

    Ingest common office document types into Document for indexing

    Document types: pdf, xlsx, xls, docx, doc

    Parameters:

    | Name | Description | Default |
    | ---- | ----------- | ------- |
    | pdf_mode | mode for pdf extraction, one of "normal", "mathpix", "ocr": normal parses pdf text; mathpix parses pdf text using Mathpix; ocr parses pdf images using flax | required |
    | doc_parsers | list of document parsers to parse the document | required |
    | text_splitter | splitter to split the document into text nodes | required |
    | override_file_extractors | override file extractors for specific file extensions; the default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS | required |

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    class DocumentIngestor(BaseComponent):\n    \"\"\"Ingest common office document types into Document for indexing\n\n    Document types:\n        - pdf\n        - xlsx, xls\n        - docx, doc\n\n    Args:\n        pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n            - normal: parse pdf text\n            - mathpix: parse pdf text using mathpix\n            - ocr: parse pdf image using flax\n        doc_parsers: list of document parsers to parse the document\n        text_splitter: splitter to split the document into text nodes\n        override_file_extractors: override file extractors for specific file extensions\n            The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n    \"\"\"\n\n    pdf_mode: str = \"normal\"  # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n    text_splitter: BaseSplitter = TokenSplitter.withx(\n        chunk_size=1024,\n        chunk_overlap=256,\n        separator=\"\\n\\n\",\n        backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n    )\n    override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n    def _get_reader(self, input_files: list[str | Path]):\n        \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n        file_extractors: dict[str, BaseReader] = {\n            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n        }\n        for ext, cls in self.override_file_extractors.items():\n            file_extractors[ext] = cls()\n\n        if self.pdf_mode == \"normal\":\n            file_extractors[\".pdf\"] = PDFReader()\n        elif self.pdf_mode == \"ocr\":\n            file_extractors[\".pdf\"] = OCRReader()\n        elif self.pdf_mode == \"multimodal\":\n            file_extractors[\".pdf\"] = AdobeReader()\n        else:\n            file_extractors[\".pdf\"] = MathpixPDFReader()\n\n        main_reader = DirectoryReader(\n            input_files=input_files,\n            file_extractor=file_extractors,\n        )\n\n        return main_reader\n\n    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n        \"\"\"Ingest the file paths into Document\n\n        Args:\n            file_paths: list of file paths or a single file path\n\n        Returns:\n            list of parsed Documents\n        \"\"\"\n        if not isinstance(file_paths, list):\n            file_paths = [file_paths]\n\n        documents = self._get_reader(input_files=file_paths)()\n        print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n        nodes = self.text_splitter(documents)\n        print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n        self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n        # document parsers call\n        if self.doc_parsers:\n            for parser in self.doc_parsers:\n                nodes = parser(nodes)\n\n        return nodes\n
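    A minimal ingestion sketch, assuming DocumentIngestor is importable from kotaemon.indices.ingests as this page's location suggests:

        from kotaemon.indices.ingests import DocumentIngestor

        ingestor = DocumentIngestor(pdf_mode="ocr")  # "normal", "mathpix", "ocr" or "multimodal"
        nodes = ingestor.run(["report.pdf", "tables.xlsx"])
        print(f"{len(nodes)} text nodes ready for indexing")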
    "},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor.run","title":"run","text":"
    run(file_paths)\n

    Ingest the file paths into Document

    Parameters:

    | Name | Type | Description | Default |
    | ---- | ---- | ----------- | ------- |
    | file_paths | list[str \| Path] \| str \| Path | list of file paths or a single file path | required |

    Returns:

    | Type | Description |
    | ---- | ----------- |
    | list[Document] | list of parsed Documents |

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n    \"\"\"Ingest the file paths into Document\n\n    Args:\n        file_paths: list of file paths or a single file path\n\n    Returns:\n        list of parsed Documents\n    \"\"\"\n    if not isinstance(file_paths, list):\n        file_paths = [file_paths]\n\n    documents = self._get_reader(input_files=file_paths)()\n    print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n    nodes = self.text_splitter(documents)\n    print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n    self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n    # document parsers call\n    if self.doc_parsers:\n        for parser in self.doc_parsers:\n            nodes = parser(nodes)\n\n    return nodes\n
    "},{"location":"reference/indices/ingests/files/","title":"Files","text":""},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor","title":"DocumentIngestor","text":"

    Bases: BaseComponent

    Ingest common office document types into Document for indexing

    Document types: pdf, xlsx, xls, docx, doc

    Parameters:

    | Name | Description | Default |
    | ---- | ----------- | ------- |
    | pdf_mode | mode for pdf extraction, one of "normal", "mathpix", "ocr": normal parses pdf text; mathpix parses pdf text using Mathpix; ocr parses pdf images using flax | required |
    | doc_parsers | list of document parsers to parse the document | required |
    | text_splitter | splitter to split the document into text nodes | required |
    | override_file_extractors | override file extractors for specific file extensions; the default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS | required |

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    class DocumentIngestor(BaseComponent):\n    \"\"\"Ingest common office document types into Document for indexing\n\n    Document types:\n        - pdf\n        - xlsx, xls\n        - docx, doc\n\n    Args:\n        pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n            - normal: parse pdf text\n            - mathpix: parse pdf text using mathpix\n            - ocr: parse pdf image using flax\n        doc_parsers: list of document parsers to parse the document\n        text_splitter: splitter to split the document into text nodes\n        override_file_extractors: override file extractors for specific file extensions\n            The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n    \"\"\"\n\n    pdf_mode: str = \"normal\"  # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n    text_splitter: BaseSplitter = TokenSplitter.withx(\n        chunk_size=1024,\n        chunk_overlap=256,\n        separator=\"\\n\\n\",\n        backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n    )\n    override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n    def _get_reader(self, input_files: list[str | Path]):\n        \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n        file_extractors: dict[str, BaseReader] = {\n            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n        }\n        for ext, cls in self.override_file_extractors.items():\n            file_extractors[ext] = cls()\n\n        if self.pdf_mode == \"normal\":\n            file_extractors[\".pdf\"] = PDFReader()\n        elif self.pdf_mode == \"ocr\":\n            file_extractors[\".pdf\"] = OCRReader()\n        elif self.pdf_mode == \"multimodal\":\n            file_extractors[\".pdf\"] = AdobeReader()\n        else:\n            file_extractors[\".pdf\"] = MathpixPDFReader()\n\n        main_reader = DirectoryReader(\n            input_files=input_files,\n            file_extractor=file_extractors,\n        )\n\n        return main_reader\n\n    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n        \"\"\"Ingest the file paths into Document\n\n        Args:\n            file_paths: list of file paths or a single file path\n\n        Returns:\n            list of parsed Documents\n        \"\"\"\n        if not isinstance(file_paths, list):\n            file_paths = [file_paths]\n\n        documents = self._get_reader(input_files=file_paths)()\n        print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n        nodes = self.text_splitter(documents)\n        print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n        self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n        # document parsers call\n        if self.doc_parsers:\n            for parser in self.doc_parsers:\n                nodes = parser(nodes)\n\n        return nodes\n
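    The defaults can be overridden per instance; this sketch swaps in a smaller splitter, assuming TokenSplitter is importable from kotaemon.indices.splitters:

        from kotaemon.indices.ingests.files import DocumentIngestor
        from kotaemon.indices.splitters import TokenSplitter

        ingestor = DocumentIngestor(
            pdf_mode="multimodal",  # routes .pdf files to AdobeReader
            text_splitter=TokenSplitter(chunk_size=512, chunk_overlap=64),
        )
        nodes = ingestor.run("slides.pdf")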
    "},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor.run","title":"run","text":"
    run(file_paths)\n

    Ingest the file paths into Document

    Parameters:

    | Name | Type | Description | Default |
    | ---- | ---- | ----------- | ------- |
    | file_paths | list[str \| Path] \| str \| Path | list of file paths or a single file path | required |

    Returns:

    | Type | Description |
    | ---- | ----------- |
    | list[Document] | list of parsed Documents |

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n    \"\"\"Ingest the file paths into Document\n\n    Args:\n        file_paths: list of file paths or a single file path\n\n    Returns:\n        list of parsed Documents\n    \"\"\"\n    if not isinstance(file_paths, list):\n        file_paths = [file_paths]\n\n    documents = self._get_reader(input_files=file_paths)()\n    print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n    nodes = self.text_splitter(documents)\n    print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n    self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n    # document parsers call\n    if self.doc_parsers:\n        for parser in self.doc_parsers:\n            nodes = parser(nodes)\n\n    return nodes\n
    "},{"location":"reference/indices/qa/","title":"Qa","text":""},{"location":"reference/indices/qa/#indices.qa.CitationPipeline","title":"CitationPipeline","text":"

    Bases: BaseComponent

    Citation pipeline to extract cited evidences from source (based on input question)

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CitationPipeline(BaseComponent):\n    \"\"\"Citation pipeline to extract cited evidences from source\n    (based on input question)\"\"\"\n\n    llm: BaseLLM\n\n    def run(self, context: str, question: str):\n        return self.invoke(context, question)\n\n    def prepare_llm(self, context: str, question: str):\n        schema = CiteEvidence.schema()\n        function = {\n            \"name\": schema[\"title\"],\n            \"description\": schema[\"description\"],\n            \"parameters\": schema,\n        }\n        llm_kwargs = {\n            \"tools\": [{\"type\": \"function\", \"function\": function}],\n            \"tool_choice\": \"required\",\n            \"tools_pydantic\": [CiteEvidence],\n        }\n        messages = [\n            SystemMessage(\n                content=(\n                    \"You are a world class algorithm to answer \"\n                    \"questions with correct and exact citations.\"\n                )\n            ),\n            HumanMessage(\n                content=(\n                    \"Answer question using the following context. \"\n                    \"Use the provided function CiteEvidence() to cite your sources.\"\n                )\n            ),\n            HumanMessage(content=context),\n            HumanMessage(content=f\"Question: {question}\"),\n            HumanMessage(\n                content=(\n                    \"Tips: Make sure to cite your sources, \"\n                    \"and use the exact words from the context.\"\n                )\n            ),\n        ]\n        return messages, llm_kwargs\n\n    def invoke(self, context: str, question: str):\n        messages, llm_kwargs = self.prepare_llm(context, question)\n        try:\n            print(\"CitationPipeline: invoking LLM\")\n            llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n            print(\"CitationPipeline: finish invoking LLM\")\n            if not llm_output.additional_kwargs.get(\"tool_calls\"):\n                return None\n\n            first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n            if \"function\" in first_func:\n                # openai and cohere format\n                function_output = first_func[\"function\"][\"arguments\"]\n            else:\n                # anthropic format\n                function_output = first_func[\"args\"]\n\n            print(\"CitationPipeline:\", function_output)\n\n            if isinstance(function_output, str):\n                output = CiteEvidence.parse_raw(function_output)\n            else:\n                output = CiteEvidence.parse_obj(function_output)\n        except Exception as e:\n            print(e)\n            return None\n\n        return output\n\n    async def ainvoke(self, context: str, question: str):\n        raise NotImplementedError()\n
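    An invocation sketch; ChatOpenAI stands in for any BaseLLM that supports tool calls and is assumed to be available from kotaemon.llms:

        from kotaemon.indices.qa.citation import CitationPipeline
        from kotaemon.llms import ChatOpenAI

        citation = CitationPipeline(llm=ChatOpenAI(model="gpt-4o-mini"))
        evidence = citation.run(
            context="Kotaemon ingests documents and stores embeddings in a vector store.",
            question="Where are embeddings stored?",
        )
        if evidence is not None:       # invoke() returns None if the LLM makes no tool call
            print(evidence.evidences)  # direct quotes lifted from the context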
    "},{"location":"reference/indices/qa/#indices.qa.CitationQAPipeline","title":"CitationQAPipeline","text":"

    Bases: BaseComponent

    Answering question from a text corpus with citation

    Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
    class CitationQAPipeline(BaseComponent):\n    \"\"\"Answering question from a text corpus with citation\"\"\"\n\n    qa_prompt_template: PromptTemplate = PromptTemplate(\n        'Answer the following question: \"{question}\". '\n        \"The context is: \\n{context}\\nAnswer: \"\n    )\n    llm: BaseLLM = LCAzureChatOpenAI.withx(\n        azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n        openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n        openai_api_version=\"2023-07-01-preview\",\n        deployment_name=\"dummy-q2-16k\",\n        temperature=0,\n        request_timeout=60,\n    )\n    citation_pipeline: CitationPipeline = Node(\n        default_callback=lambda self: CitationPipeline(llm=self.llm)\n    )\n\n    def _format_doc_text(self, text: str) -> str:\n        \"\"\"Format the text of each document\"\"\"\n        return text.replace(\"\\n\", \" \")\n\n    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n        \"\"\"Format the texts between all documents\"\"\"\n        matched_texts: list[str] = [\n            self._format_doc_text(doc.text) for doc in documents\n        ]\n        return \"\\n\\n\".join(matched_texts)\n\n    def run(\n        self,\n        question: str,\n        documents: list[RetrievedDocument],\n        use_citation: bool = False,\n        **kwargs\n    ) -> Document:\n        # retrieve relevant documents as context\n        context = self._format_retrieved_context(documents)\n        self.log_progress(\".context\", context=context)\n\n        # generate the answer\n        prompt = self.qa_prompt_template.populate(\n            context=context,\n            question=question,\n        )\n        self.log_progress(\".prompt\", prompt=prompt)\n        answer_text = self.llm(prompt).text\n        if use_citation:\n            citation = self.citation_pipeline(context=context, question=question)\n        else:\n            citation = None\n\n        answer = Document(text=answer_text, metadata={\"citation\": citation})\n        return answer\n
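    A usage sketch; the default Azure LLM above is swapped for an assumed ChatOpenAI, and a hand-built RetrievedDocument stands in for real retrieval output:

        from kotaemon.base import RetrievedDocument
        from kotaemon.indices.qa.text_based import CitationQAPipeline
        from kotaemon.llms import ChatOpenAI

        qa = CitationQAPipeline(llm=ChatOpenAI(model="gpt-4o-mini"))
        answer = qa.run(
            question="Where are embeddings stored?",
            documents=[
                RetrievedDocument(text="Embeddings are kept in a vector store.", score=0.9)
            ],
            use_citation=True,  # also run CitationPipeline over the same context
        )
        print(answer.text)
        print(answer.metadata["citation"])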
    "},{"location":"reference/indices/qa/citation/","title":"Citation","text":""},{"location":"reference/indices/qa/citation/#indices.qa.citation.CiteEvidence","title":"CiteEvidence","text":"

    Bases: BaseModel

    List of evidences (maximum 5) to support the answer.

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CiteEvidence(BaseModel):\n    \"\"\"List of evidences (maximum 5) to support the answer.\"\"\"\n\n    evidences: List[str] = Field(\n        ...,\n        description=(\n            \"Each source should be a direct quote from the context, \"\n            \"as a substring of the original content (max 15 words).\"\n        ),\n    )\n
    "},{"location":"reference/indices/qa/citation/#indices.qa.citation.CitationPipeline","title":"CitationPipeline","text":"

    Bases: BaseComponent

    Citation pipeline to extract cited evidences from source (based on input question)

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CitationPipeline(BaseComponent):\n    \"\"\"Citation pipeline to extract cited evidences from source\n    (based on input question)\"\"\"\n\n    llm: BaseLLM\n\n    def run(self, context: str, question: str):\n        return self.invoke(context, question)\n\n    def prepare_llm(self, context: str, question: str):\n        schema = CiteEvidence.schema()\n        function = {\n            \"name\": schema[\"title\"],\n            \"description\": schema[\"description\"],\n            \"parameters\": schema,\n        }\n        llm_kwargs = {\n            \"tools\": [{\"type\": \"function\", \"function\": function}],\n            \"tool_choice\": \"required\",\n            \"tools_pydantic\": [CiteEvidence],\n        }\n        messages = [\n            SystemMessage(\n                content=(\n                    \"You are a world class algorithm to answer \"\n                    \"questions with correct and exact citations.\"\n                )\n            ),\n            HumanMessage(\n                content=(\n                    \"Answer question using the following context. \"\n                    \"Use the provided function CiteEvidence() to cite your sources.\"\n                )\n            ),\n            HumanMessage(content=context),\n            HumanMessage(content=f\"Question: {question}\"),\n            HumanMessage(\n                content=(\n                    \"Tips: Make sure to cite your sources, \"\n                    \"and use the exact words from the context.\"\n                )\n            ),\n        ]\n        return messages, llm_kwargs\n\n    def invoke(self, context: str, question: str):\n        messages, llm_kwargs = self.prepare_llm(context, question)\n        try:\n            print(\"CitationPipeline: invoking LLM\")\n            llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n            print(\"CitationPipeline: finish invoking LLM\")\n            if not llm_output.additional_kwargs.get(\"tool_calls\"):\n                return None\n\n            first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n            if \"function\" in first_func:\n                # openai and cohere format\n                function_output = first_func[\"function\"][\"arguments\"]\n            else:\n                # anthropic format\n                function_output = first_func[\"args\"]\n\n            print(\"CitationPipeline:\", function_output)\n\n            if isinstance(function_output, str):\n                output = CiteEvidence.parse_raw(function_output)\n            else:\n                output = CiteEvidence.parse_obj(function_output)\n        except Exception as e:\n            print(e)\n            return None\n\n        return output\n\n    async def ainvoke(self, context: str, question: str):\n        raise NotImplementedError()\n
    "},{"location":"reference/indices/qa/text_based/","title":"Text Based","text":""},{"location":"reference/indices/qa/text_based/#indices.qa.text_based.CitationQAPipeline","title":"CitationQAPipeline","text":"

    Bases: BaseComponent

    Answering question from a text corpus with citation

    Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
    class CitationQAPipeline(BaseComponent):\n    \"\"\"Answering question from a text corpus with citation\"\"\"\n\n    qa_prompt_template: PromptTemplate = PromptTemplate(\n        'Answer the following question: \"{question}\". '\n        \"The context is: \\n{context}\\nAnswer: \"\n    )\n    llm: BaseLLM = LCAzureChatOpenAI.withx(\n        azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n        openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n        openai_api_version=\"2023-07-01-preview\",\n        deployment_name=\"dummy-q2-16k\",\n        temperature=0,\n        request_timeout=60,\n    )\n    citation_pipeline: CitationPipeline = Node(\n        default_callback=lambda self: CitationPipeline(llm=self.llm)\n    )\n\n    def _format_doc_text(self, text: str) -> str:\n        \"\"\"Format the text of each document\"\"\"\n        return text.replace(\"\\n\", \" \")\n\n    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n        \"\"\"Format the texts between all documents\"\"\"\n        matched_texts: list[str] = [\n            self._format_doc_text(doc.text) for doc in documents\n        ]\n        return \"\\n\\n\".join(matched_texts)\n\n    def run(\n        self,\n        question: str,\n        documents: list[RetrievedDocument],\n        use_citation: bool = False,\n        **kwargs\n    ) -> Document:\n        # retrieve relevant documents as context\n        context = self._format_retrieved_context(documents)\n        self.log_progress(\".context\", context=context)\n\n        # generate the answer\n        prompt = self.qa_prompt_template.populate(\n            context=context,\n            question=question,\n        )\n        self.log_progress(\".prompt\", prompt=prompt)\n        answer_text = self.llm(prompt).text\n        if use_citation:\n            citation = self.citation_pipeline(context=context, question=question)\n        else:\n            citation = None\n\n        answer = Document(text=answer_text, metadata={\"citation\": citation})\n        return answer\n
    "},{"location":"reference/indices/rankings/","title":"Rankings","text":""},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking","title":"BaseReranking","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    class BaseReranking(BaseComponent):\n    @abstractmethod\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Main method to transform list of documents\n        (re-ranking, filtering, etc)\"\"\"\n        ...\n
    "},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking.run","title":"run abstractmethod","text":"
    run(documents, query)\n

    Main method to transform list of documents (re-ranking, filtering, etc)

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    @abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Main method to transform list of documents\n    (re-ranking, filtering, etc)\"\"\"\n    ...\n
    "},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking","title":"CohereReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    class CohereReranking(BaseReranking):\n    model_name: str = \"rerank-multilingual-v2.0\"\n    cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n    use_key_from_ktem: bool = False\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Use Cohere Reranker model to re-order documents\n        with their relevance score\"\"\"\n        try:\n            import cohere\n        except ImportError:\n            raise ImportError(\n                \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n            )\n\n        # try to get COHERE_API_KEY from embeddings\n        if not self.cohere_api_key and self.use_key_from_ktem:\n            try:\n                from ktem.embeddings.manager import (\n                    embedding_models_manager as embeddings,\n                )\n\n                cohere_model = embeddings.get(\"cohere\")\n                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                    \"cohere_api_key\"\n                )\n                if ktem_cohere_api_key != \"your-key\":\n                    self.cohere_api_key = ktem_cohere_api_key\n            except Exception as e:\n                print(\"Cannot get Cohere API key from `ktem`\", e)\n\n        if not self.cohere_api_key:\n            print(\"Cohere API key not found. Skipping reranking.\")\n            return documents\n\n        cohere_client = cohere.Client(self.cohere_api_key)\n        compressed_docs: list[Document] = []\n\n        if not documents:  # to avoid empty api call\n            return compressed_docs\n\n        _docs = [d.content for d in documents]\n        response = cohere_client.rerank(\n            model=self.model_name, query=query, documents=_docs\n        )\n        # print(\"Cohere score\", [r.relevance_score for r in response.results])\n        for r in response.results:\n            doc = documents[r.index]\n            doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n            compressed_docs.append(doc)\n\n        return compressed_docs\n
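    A standalone sketch; the API key is read from the COHERE_API_KEY environment variable when not set explicitly, and reranking is skipped (documents returned unchanged) when no key is found:

        from kotaemon.base import Document
        from kotaemon.indices.rankings import CohereReranking

        reranker = CohereReranking()  # picks up COHERE_API_KEY from the environment
        ranked = reranker.run(
            documents=[
                Document(text="embeddings live in the vector store"),
                Document(text="an unrelated passage about pancakes"),
            ],
            query="where are embeddings stored?",
        )
        for doc in ranked:
            print(doc.metadata.get("cohere_reranking_score"), doc.text)

    In practice this component is usually attached to VectorRetrieval via its rerankers field rather than called directly.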
    "},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking.run","title":"run","text":"
    run(documents, query)\n

    Use Cohere Reranker model to re-order documents with their relevance score

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    def run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Use Cohere Reranker model to re-order documents\n    with their relevance score\"\"\"\n    try:\n        import cohere\n    except ImportError:\n        raise ImportError(\n            \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n        )\n\n    # try to get COHERE_API_KEY from embeddings\n    if not self.cohere_api_key and self.use_key_from_ktem:\n        try:\n            from ktem.embeddings.manager import (\n                embedding_models_manager as embeddings,\n            )\n\n            cohere_model = embeddings.get(\"cohere\")\n            ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                \"cohere_api_key\"\n            )\n            if ktem_cohere_api_key != \"your-key\":\n                self.cohere_api_key = ktem_cohere_api_key\n        except Exception as e:\n            print(\"Cannot get Cohere API key from `ktem`\", e)\n\n    if not self.cohere_api_key:\n        print(\"Cohere API key not found. Skipping reranking.\")\n        return documents\n\n    cohere_client = cohere.Client(self.cohere_api_key)\n    compressed_docs: list[Document] = []\n\n    if not documents:  # to avoid empty api call\n        return compressed_docs\n\n    _docs = [d.content for d in documents]\n    response = cohere_client.rerank(\n        model=self.model_name, query=query, documents=_docs\n    )\n    # print(\"Cohere score\", [r.relevance_score for r in response.results])\n    for r in response.results:\n        doc = documents[r.index]\n        doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n        compressed_docs.append(doc)\n\n    return compressed_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking","title":"LLMReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    class LLMReranking(BaseReranking):\n    llm: BaseLLM\n    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n    top_k: int = 3\n    concurrent: bool = True\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [output_parser.parse(result) for result in results]\n        for include_doc, doc in zip(results, documents):\n            if include_doc:\n                filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
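    A sketch of the yes/no relevance filter; ChatOpenAI is again an assumed BaseLLM implementation:

        from kotaemon.base import Document
        from kotaemon.indices.rankings import LLMReranking
        from kotaemon.llms import ChatOpenAI

        reranker = LLMReranking(llm=ChatOpenAI(model="gpt-4o-mini"), top_k=3)
        kept = reranker.run(
            documents=[
                Document(text="the doc store supports full-text queries"),
                Document(text="a recipe for pancakes"),
            ],
            query="how do full-text queries work?",
        )
        # documents judged irrelevant are dropped; if everything is dropped,
        # the first top_k documents are returned as a fallback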
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [output_parser.parse(result) for result in results]\n    for include_doc, doc in zip(results, documents):\n        if include_doc:\n            filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring","title":"LLMScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    class LLMScoring(LLMReranking):\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs: list[Document] = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt))\n\n        for result, doc in zip(results, documents):\n            score = np.exp(np.average(result.logprobs))\n            include_doc = output_parser.parse(result.text)\n            if include_doc:\n                doc.metadata[\"llm_reranking_score\"] = score\n            else:\n                doc.metadata[\"llm_reranking_score\"] = 1 - score\n            filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs: list[Document] = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt))\n\n    for result, doc in zip(results, documents):\n        score = np.exp(np.average(result.logprobs))\n        include_doc = output_parser.parse(result.text)\n        if include_doc:\n            doc.metadata[\"llm_reranking_score\"] = score\n        else:\n            doc.metadata[\"llm_reranking_score\"] = 1 - score\n        filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring","title":"LLMTrulensScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    class LLMTrulensScoring(LLMReranking):\n    llm: BaseLLM\n    system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n    user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n    concurrent: bool = True\n    normalize: float = 10\n    trim_func: TokenSplitter = TokenSplitter.withx(\n        chunk_size=MAX_CONTEXT_LEN,\n        chunk_overlap=0,\n        separator=\" \",\n        tokenizer=partial(\n            tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n            allowed_special=set(),\n            disallowed_special=\"all\",\n        ),\n    )\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n\n        documents = sorted(documents, key=lambda doc: doc.get_content())\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    chunked_doc_content = self.trim_func(\n                        [\n                            Document(content=doc.get_content())\n                            # skip metadata which cause troubles\n                        ]\n                    )[0].text\n\n                    messages = []\n                    messages.append(\n                        SystemMessage(self.system_prompt_template.populate())\n                    )\n                    messages.append(\n                        HumanMessage(\n                            self.user_prompt_template.populate(\n                                question=query, context=chunked_doc_content\n                            )\n                        )\n                    )\n\n                    def llm_call():\n                        return self.llm(messages).text\n\n                    futures.append(executor.submit(llm_call))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                messages = []\n                messages.append(SystemMessage(self.system_prompt_template.populate()))\n                messages.append(\n                    SystemMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=doc.get_content()\n                        )\n                    )\n                )\n                results.append(self.llm(messages).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [\n            (r_idx, float(re_0_10_rating(result)) / self.normalize)\n            for r_idx, result in enumerate(results)\n        ]\n        results.sort(key=lambda x: x[1], reverse=True)\n\n        for r_idx, score in results:\n            doc = documents[r_idx]\n            doc.metadata[\"llm_trulens_score\"] = score\n            filtered_docs.append(doc)\n\n        print(\n            \"LLM rerank scores\",\n            [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n        )\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n\n    documents = sorted(documents, key=lambda doc: doc.get_content())\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                chunked_doc_content = self.trim_func(\n                    [\n                        Document(content=doc.get_content())\n                        # skip metadata which cause troubles\n                    ]\n                )[0].text\n\n                messages = []\n                messages.append(\n                    SystemMessage(self.system_prompt_template.populate())\n                )\n                messages.append(\n                    HumanMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=chunked_doc_content\n                        )\n                    )\n                )\n\n                def llm_call():\n                    return self.llm(messages).text\n\n                futures.append(executor.submit(llm_call))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            messages = []\n            messages.append(SystemMessage(self.system_prompt_template.populate()))\n            messages.append(\n                SystemMessage(\n                    self.user_prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                )\n            )\n            results.append(self.llm(messages).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [\n        (r_idx, float(re_0_10_rating(result)) / self.normalize)\n        for r_idx, result in enumerate(results)\n    ]\n    results.sort(key=lambda x: x[1], reverse=True)\n\n    for r_idx, score in results:\n        doc = documents[r_idx]\n        doc.metadata[\"llm_trulens_score\"] = score\n        filtered_docs.append(doc)\n\n    print(\n        \"LLM rerank scores\",\n        [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n    )\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/base/","title":"Base","text":""},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking","title":"BaseReranking","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    class BaseReranking(BaseComponent):\n    @abstractmethod\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Main method to transform list of documents\n        (re-ranking, filtering, etc)\"\"\"\n        ...\n
    "},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking.run","title":"run abstractmethod","text":"
    run(documents, query)\n

Main method to transform a list of documents (re-ranking, filtering, etc.)

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    @abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Main method to transform list of documents\n    (re-ranking, filtering, etc)\"\"\"\n    ...\n
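To implement a custom reranker, subclass BaseReranking and implement run. A minimal sketch, assuming BaseReranking is exported from kotaemon.indices.rankings and Document from kotaemon.base (LengthReranking itself is hypothetical, not part of the library):

from kotaemon.base import Document\nfrom kotaemon.indices.rankings import BaseReranking\n\n\nclass LengthReranking(BaseReranking):\n    \"\"\"Hypothetical reranker that prefers shorter documents.\"\"\"\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        # order by content length, shortest first; the query is ignored here\n        return sorted(documents, key=lambda doc: len(doc.get_content()))\n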
    "},{"location":"reference/indices/rankings/cohere/","title":"Cohere","text":""},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking","title":"CohereReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    class CohereReranking(BaseReranking):\n    model_name: str = \"rerank-multilingual-v2.0\"\n    cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n    use_key_from_ktem: bool = False\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Use Cohere Reranker model to re-order documents\n        with their relevance score\"\"\"\n        try:\n            import cohere\n        except ImportError:\n            raise ImportError(\n                \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n            )\n\n        # try to get COHERE_API_KEY from embeddings\n        if not self.cohere_api_key and self.use_key_from_ktem:\n            try:\n                from ktem.embeddings.manager import (\n                    embedding_models_manager as embeddings,\n                )\n\n                cohere_model = embeddings.get(\"cohere\")\n                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                    \"cohere_api_key\"\n                )\n                if ktem_cohere_api_key != \"your-key\":\n                    self.cohere_api_key = ktem_cohere_api_key\n            except Exception as e:\n                print(\"Cannot get Cohere API key from `ktem`\", e)\n\n        if not self.cohere_api_key:\n            print(\"Cohere API key not found. Skipping reranking.\")\n            return documents\n\n        cohere_client = cohere.Client(self.cohere_api_key)\n        compressed_docs: list[Document] = []\n\n        if not documents:  # to avoid empty api call\n            return compressed_docs\n\n        _docs = [d.content for d in documents]\n        response = cohere_client.rerank(\n            model=self.model_name, query=query, documents=_docs\n        )\n        # print(\"Cohere score\", [r.relevance_score for r in response.results])\n        for r in response.results:\n            doc = documents[r.index]\n            doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n            compressed_docs.append(doc)\n\n        return compressed_docs\n
    "},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking.run","title":"run","text":"
    run(documents, query)\n

Use the Cohere Reranker model to re-order documents by their relevance score

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    def run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Use Cohere Reranker model to re-order documents\n    with their relevance score\"\"\"\n    try:\n        import cohere\n    except ImportError:\n        raise ImportError(\n            \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n        )\n\n    # try to get COHERE_API_KEY from embeddings\n    if not self.cohere_api_key and self.use_key_from_ktem:\n        try:\n            from ktem.embeddings.manager import (\n                embedding_models_manager as embeddings,\n            )\n\n            cohere_model = embeddings.get(\"cohere\")\n            ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                \"cohere_api_key\"\n            )\n            if ktem_cohere_api_key != \"your-key\":\n                self.cohere_api_key = ktem_cohere_api_key\n        except Exception as e:\n            print(\"Cannot get Cohere API key from `ktem`\", e)\n\n    if not self.cohere_api_key:\n        print(\"Cohere API key not found. Skipping reranking.\")\n        return documents\n\n    cohere_client = cohere.Client(self.cohere_api_key)\n    compressed_docs: list[Document] = []\n\n    if not documents:  # to avoid empty api call\n        return compressed_docs\n\n    _docs = [d.content for d in documents]\n    response = cohere_client.rerank(\n        model=self.model_name, query=query, documents=_docs\n    )\n    # print(\"Cohere score\", [r.relevance_score for r in response.results])\n    for r in response.results:\n        doc = documents[r.index]\n        doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n        compressed_docs.append(doc)\n\n    return compressed_docs\n
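A minimal usage sketch, assuming CohereReranking is exported from kotaemon.indices.rankings and Document from kotaemon.base; the API key is a placeholder (when no key is found, run returns the documents unchanged):

from kotaemon.base import Document\nfrom kotaemon.indices.rankings import CohereReranking\n\nreranker = CohereReranking(\n    model_name=\"rerank-multilingual-v2.0\",\n    cohere_api_key=\"...\",  # placeholder; defaults to the COHERE_API_KEY env var\n)\ndocs = [\n    Document(content=\"Paris is the capital of France.\"),\n    Document(content=\"Bananas are rich in potassium.\"),\n]\nranked = reranker.run(docs, query=\"What is the capital of France?\")\nfor doc in ranked:\n    print(doc.metadata[\"cohere_reranking_score\"], doc.content)\n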
    "},{"location":"reference/indices/rankings/llm/","title":"Llm","text":""},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking","title":"LLMReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    class LLMReranking(BaseReranking):\n    llm: BaseLLM\n    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n    top_k: int = 3\n    concurrent: bool = True\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [output_parser.parse(result) for result in results]\n        for include_doc, doc in zip(results, documents):\n            if include_doc:\n                filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [output_parser.parse(result) for result in results]\n    for include_doc, doc in zip(results, documents):\n        if include_doc:\n            filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
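A usage sketch, assuming ChatOpenAI from kotaemon.llms as the judge model (any BaseLLM should work) and a placeholder API key:

from kotaemon.base import Document\nfrom kotaemon.indices.rankings import LLMReranking\nfrom kotaemon.llms import ChatOpenAI\n\nllm = ChatOpenAI(model=\"gpt-4o-mini\", api_key=\"...\")  # placeholder key\nreranker = LLMReranking(llm=llm, top_k=3, concurrent=True)\ndocs = [\n    Document(content=\"Notes on capital gains tax rates.\"),\n    Document(content=\"Cats sleep a lot.\"),\n]\n# keeps only documents the LLM judges relevant; if none pass, the first top_k are returned\nfiltered = reranker.run(docs, query=\"How are capital gains taxed?\")\n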
    "},{"location":"reference/indices/rankings/llm_scoring/","title":"Llm Scoring","text":""},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring","title":"LLMScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    class LLMScoring(LLMReranking):\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs: list[Document] = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt))\n\n        for result, doc in zip(results, documents):\n            score = np.exp(np.average(result.logprobs))\n            include_doc = output_parser.parse(result.text)\n            if include_doc:\n                doc.metadata[\"llm_reranking_score\"] = score\n            else:\n                doc.metadata[\"llm_reranking_score\"] = 1 - score\n            filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs: list[Document] = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt))\n\n    for result, doc in zip(results, documents):\n        score = np.exp(np.average(result.logprobs))\n        include_doc = output_parser.parse(result.text)\n        if include_doc:\n            doc.metadata[\"llm_reranking_score\"] = score\n        else:\n            doc.metadata[\"llm_reranking_score\"] = 1 - score\n        filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
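The stored score is the exponential of the mean token log-probability of the LLM's yes/no answer, i.e. the geometric mean of the answer's token probabilities (and 1 - score when the answer parses as "no"). The arithmetic in isolation:

import numpy as np\n\nlogprobs = [-0.05, -0.20, -0.10]      # per-token logprobs of the LLM's answer\nscore = np.exp(np.average(logprobs))  # geometric mean of token probabilities\nprint(round(float(score), 2))         # 0.89\n# a document judged irrelevant would get 1 - score, here 0.11\n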
    "},{"location":"reference/indices/rankings/llm_trulens/","title":"Llm Trulens","text":""},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.PATTERN_INTEGER","title":"PATTERN_INTEGER module-attribute","text":"
    PATTERN_INTEGER = compile('([+-]?[1-9][0-9]*|0)')\n

    Regex that matches integers.

    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring","title":"LLMTrulensScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    class LLMTrulensScoring(LLMReranking):\n    llm: BaseLLM\n    system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n    user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n    concurrent: bool = True\n    normalize: float = 10\n    trim_func: TokenSplitter = TokenSplitter.withx(\n        chunk_size=MAX_CONTEXT_LEN,\n        chunk_overlap=0,\n        separator=\" \",\n        tokenizer=partial(\n            tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n            allowed_special=set(),\n            disallowed_special=\"all\",\n        ),\n    )\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n\n        documents = sorted(documents, key=lambda doc: doc.get_content())\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    chunked_doc_content = self.trim_func(\n                        [\n                            Document(content=doc.get_content())\n                            # skip metadata which cause troubles\n                        ]\n                    )[0].text\n\n                    messages = []\n                    messages.append(\n                        SystemMessage(self.system_prompt_template.populate())\n                    )\n                    messages.append(\n                        HumanMessage(\n                            self.user_prompt_template.populate(\n                                question=query, context=chunked_doc_content\n                            )\n                        )\n                    )\n\n                    def llm_call():\n                        return self.llm(messages).text\n\n                    futures.append(executor.submit(llm_call))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                messages = []\n                messages.append(SystemMessage(self.system_prompt_template.populate()))\n                messages.append(\n                    SystemMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=doc.get_content()\n                        )\n                    )\n                )\n                results.append(self.llm(messages).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [\n            (r_idx, float(re_0_10_rating(result)) / self.normalize)\n            for r_idx, result in enumerate(results)\n        ]\n        results.sort(key=lambda x: x[1], reverse=True)\n\n        for r_idx, score in results:\n            doc = documents[r_idx]\n            doc.metadata[\"llm_trulens_score\"] = score\n            filtered_docs.append(doc)\n\n        print(\n            \"LLM rerank scores\",\n            [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n        )\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n\n    documents = sorted(documents, key=lambda doc: doc.get_content())\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                chunked_doc_content = self.trim_func(\n                    [\n                        Document(content=doc.get_content())\n                        # skip metadata which cause troubles\n                    ]\n                )[0].text\n\n                messages = []\n                messages.append(\n                    SystemMessage(self.system_prompt_template.populate())\n                )\n                messages.append(\n                    HumanMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=chunked_doc_content\n                        )\n                    )\n                )\n\n                def llm_call():\n                    return self.llm(messages).text\n\n                futures.append(executor.submit(llm_call))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            messages = []\n            messages.append(SystemMessage(self.system_prompt_template.populate()))\n            messages.append(\n                SystemMessage(\n                    self.user_prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                )\n            )\n            results.append(self.llm(messages).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [\n        (r_idx, float(re_0_10_rating(result)) / self.normalize)\n        for r_idx, result in enumerate(results)\n    ]\n    results.sort(key=lambda x: x[1], reverse=True)\n\n    for r_idx, score in results:\n        doc = documents[r_idx]\n        doc.metadata[\"llm_trulens_score\"] = score\n        filtered_docs.append(doc)\n\n    print(\n        \"LLM rerank scores\",\n        [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n    )\n\n    return filtered_docs\n
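Each LLM reply is parsed into a 0-10 rating with re_0_10_rating (documented below) and divided by normalize, so scores land in [0, 1] before documents are sorted. A small sketch, assuming re_0_10_rating is importable from the module shown in the source path:

from kotaemon.indices.rankings.llm_trulens import re_0_10_rating\n\nreplies = [\"Relevance: 8\", \"2 / 10\", \"Score is 10\"]\nnormalize = 10\nscores = [float(re_0_10_rating(r)) / normalize for r in replies]\nprint(scores)  # [0.8, 0.2, 1.0] -- \"2 / 10\" keeps the smallest in-range match\n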
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.validate_rating","title":"validate_rating","text":"
    validate_rating(rating)\n

Validate that a rating is between 0 and 10.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def validate_rating(rating) -> int:\n    \"\"\"Validate a rating is between 0 and 10.\"\"\"\n\n    if not 0 <= rating <= 10:\n        raise ValueError(\"Rating must be between 0 and 10\")\n\n    return rating\n
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.re_0_10_rating","title":"re_0_10_rating","text":"
    re_0_10_rating(s)\n

    Extract a 0-10 rating from a string.

    If the string does not match an integer or matches an integer outside the 0-10 range, raises an error instead. If multiple numbers are found within the expected 0-10 range, the smallest is returned.

Parameters:

    s (str, required): String to extract rating from.

Returns:

    int: Extracted rating.

Raises:

    ParseError: If no integers between 0 and 10 are found in the string.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def re_0_10_rating(s: str) -> int:\n    \"\"\"Extract a 0-10 rating from a string.\n\n    If the string does not match an integer or matches an integer outside the\n    0-10 range, raises an error instead. If multiple numbers are found within\n    the expected 0-10 range, the smallest is returned.\n\n    Args:\n        s: String to extract rating from.\n\n    Returns:\n        int: Extracted rating.\n\n    Raises:\n        ParseError: If no integers between 0 and 10 are found in the string.\n    \"\"\"\n\n    matches = PATTERN_INTEGER.findall(s)\n    if not matches:\n        raise AssertionError\n\n    vals = set()\n    for match in matches:\n        try:\n            vals.add(validate_rating(int(match)))\n        except ValueError:\n            pass\n\n    if not vals:\n        raise AssertionError\n\n    # Min to handle cases like \"The rating is 8 out of 10.\"\n    return min(vals)\n
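For example (covering the "8 out of 10" case the docstring mentions; the import path is assumed from the source location above):

from kotaemon.indices.rankings.llm_trulens import re_0_10_rating\n\nprint(re_0_10_rating(\"I would rate this 8 out of 10.\"))  # 8, the min of {8, 10}\nprint(re_0_10_rating(\"Rating: 25, or maybe 7\"))          # 7; 25 falls outside 0-10\n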
    "},{"location":"reference/indices/splitters/","title":"Splitters","text":""},{"location":"reference/indices/splitters/#indices.splitters.BaseSplitter","title":"BaseSplitter","text":"

    Bases: DocTransformer

Represents the base splitter class

    Source code in libs/kotaemon/kotaemon/indices/splitters/__init__.py
    class BaseSplitter(DocTransformer):\n    \"\"\"Represent base splitter class\"\"\"\n\n    ...\n
    "},{"location":"reference/llms/","title":"LLMs","text":""},{"location":"reference/llms/#llms.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":"

    Bases: SimpleBranchingPipeline

    A simple gated branching pipeline for executing multiple branches based on a condition.

    This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.

Attributes:

    branches (List[BaseComponent]): The list of branches to be executed.

Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class GatedBranchingPipeline(SimpleBranchingPipeline):\n    \"\"\"\n    A simple gated branching pipeline for executing multiple branches based on a\n        condition.\n\n    This class extends the SimpleBranchingPipeline class and adds the ability to execute\n        the branches until a branch returns a non-empty output based on a condition.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = GatedBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        ```\n    \"\"\"\n\n    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the output of the first\n            branch that returns a non-empty output based on the provided condition.\n\n        Args:\n            condition_text (str): The condition text to evaluate for each branch.\n                Default to None.\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            Union[OutputType, None]: The output of the first branch that satisfies the\n            condition, or None if no branch satisfies the condition.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided.\")\n\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output = branch(condition_text=condition_text, **prompt_kwargs)\n            if output:\n                return output\n\n        return Document(None)\n
    "},{"location":"reference/llms/#llms.GatedBranchingPipeline.run","title":"run","text":"
    run(*, condition_text=None, **prompt_kwargs)\n

    Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.

Parameters:

    condition_text (str): The condition text to evaluate for each branch. Defaults to None.
    **prompt_kwargs: Keyword arguments for the branches.

Returns:

    Union[OutputType, None]: The output of the first branch that satisfies the condition, or None if no branch satisfies the condition.

Raises:

    ValueError: If condition_text is None.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the output of the first\n        branch that returns a non-empty output based on the provided condition.\n\n    Args:\n        condition_text (str): The condition text to evaluate for each branch.\n            Default to None.\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        Union[OutputType, None]: The output of the first branch that satisfies the\n        condition, or None if no branch satisfies the condition.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided.\")\n\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output = branch(condition_text=condition_text, **prompt_kwargs)\n        if output:\n            return output\n\n    return Document(None)\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":"

    Bases: BaseComponent

    A simple branching pipeline for executing multiple branches.

Attributes:

    branches (List[BaseComponent]): The list of branches to be executed.

Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class SimpleBranchingPipeline(BaseComponent):\n    \"\"\"\n    A simple branching pipeline for executing multiple branches.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = SimpleBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        print(pipeline(condition_text=\"12\"))\n        ```\n    \"\"\"\n\n    branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n    def add_branch(self, component: BaseComponent):\n        \"\"\"\n        Add a new branch to the pipeline.\n\n        Args:\n            component (BaseComponent): The branch component to be added.\n        \"\"\"\n        self.branches.append(component)\n\n    def run(self, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the outputs as a list.\n\n        Args:\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            List: The outputs of each branch as a list.\n        \"\"\"\n        output = []\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output.append(branch(**prompt_kwargs))\n\n        return output\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"
    add_branch(component)\n

    Add a new branch to the pipeline.

Parameters:

    component (BaseComponent, required): The branch component to be added.

Source code in libs/kotaemon/kotaemon/llms/branching.py
    def add_branch(self, component: BaseComponent):\n    \"\"\"\n    Add a new branch to the pipeline.\n\n    Args:\n        component (BaseComponent): The branch component to be added.\n    \"\"\"\n    self.branches.append(component)\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline.run","title":"run","text":"
    run(**prompt_kwargs)\n

    Execute the pipeline by running each branch and return the outputs as a list.

Parameters:

    **prompt_kwargs: Keyword arguments for the branches.

Returns:

    List: The outputs of each branch as a list.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the outputs as a list.\n\n    Args:\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        List: The outputs of each branch as a list.\n    \"\"\"\n    output = []\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output.append(branch(**prompt_kwargs))\n\n    return output\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

Parameters:

    async_version (bool): Whether to get the async version of the client. Defaults to False.

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
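A minimal instantiation sketch; the endpoint, deployment name, API version, and key are placeholders to substitute with your Azure resource's values:

from kotaemon.llms import AzureChatOpenAI\n\nllm = AzureChatOpenAI(\n    azure_endpoint=\"https://<resource>.openai.azure.com/\",  # placeholder\n    azure_deployment=\"<deployment-name>\",                   # placeholder\n    api_version=\"2024-02-01\",                               # placeholder version\n    api_key=\"...\",  # or supply azure_ad_token / azure_ad_token_provider instead\n)\nprint(llm(\"Hello!\").text)  # calling the component runs the model and returns an LLMInterface\n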
    "},{"location":"reference/llms/#llms.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/#llms.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

Parameters:

    async_version (bool): Whether to get the async version of the client. Defaults to False.

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/#llms.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
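The non-Azure variant is configured the same way; base_url can point at any OpenAI-compatible server (the values below are placeholders):

from kotaemon.llms import ChatOpenAI\n\nllm = ChatOpenAI(\n    model=\"gpt-4o-mini\",  # required\n    api_key=\"...\",        # placeholder\n    base_url=None,        # or e.g. \"http://localhost:8000/v1\" for a local server\n)\nprint(llm(\"Say hi in one word.\").text)\n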
    "},{"location":"reference/llms/#llms.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.

Attributes:

    endpoint_url (str): The URL of an OpenAI API compatible endpoint.

Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
    "},{"location":"reference/llms/#llms.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

Generate a response from messages.

Parameters:

    messages (str | BaseMessage | list[BaseMessage]): History of messages to generate a response from.
    **kwargs: Additional arguments to pass to the OpenAI API.

Returns:

    LLMInterface: The generated response.

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/#llms.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/#llms.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

    Wrapper around llama-cpp-python's Llama model

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
    "},{"location":"reference/llms/#llms.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/#llms.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
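    A minimal construction sketch (the endpoint, key, and deployment name are placeholders for your own Azure resource):

    from kotaemon.llms import AzureOpenAI\n\nllm = AzureOpenAI(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",  # placeholder\n    openai_api_key=\"<your-api-key>\",  # placeholder\n    openai_api_version=\"2023-05-15\",\n    deployment_name=\"<your-deployment>\",  # placeholder\n)\nprint(llm(\"Write a haiku about the sea\").text)\n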
    "},{"location":"reference/llms/#llms.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
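    A minimal usage sketch (the model path is a placeholder for a local GGUF model file; llama-cpp-python must be installed):

    from kotaemon.llms import LlamaCpp\n\n# model_path is a placeholder\nllm = LlamaCpp(model_path=\"path/to/model.gguf\", n_ctx=2048)\nprint(llm(\"The capital of France is\").text)\n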
    "},{"location":"reference/llms/#llms.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
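    A minimal construction sketch (the API key is a placeholder; the model name is only an example of a completions-style model):

    from kotaemon.llms import OpenAI\n\nllm = OpenAI(\n    openai_api_key=\"<your-api-key>\",  # placeholder\n    model_name=\"gpt-3.5-turbo-instruct\",  # example completions model\n    temperature=0.2,\n)\nprint(llm(\"List three uses of text embeddings\").text)\n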
    "},{"location":"reference/llms/#llms.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":"

    Bases: BaseComponent

    Perform sequential chain-of-thought with manual pre-defined prompts

    This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought. Please refer to that section for details on Thought; this section is about chaining thoughts together.

    Usage:

    Create and run a chain of thought without \"+\" operator:

    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>>    prompt=\"Word {word} in {language} is \",\n>>>    post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>>     prompt=\"Translate {translated} to Japanese\",\n>>>     post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Create and run a chain of thought with the \"+\" operator: please refer to the kotaemon.pipelines.cot.Thought section for examples.

    This chain-of-thought optionally takes a termination-check callback function. The callback is called after each thought is executed; it takes in a dictionary of all thought outputs so far and returns True or False. If it returns True, the chain-of-thought terminates early. If unset, the default callback always returns False.
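    For example, a minimal termination-callback sketch, reusing thought1, thought2, and llm from the example above (the callback name is illustrative):

    >>> def stop_when_translated(outputs: dict) -> bool:\n>>>     # stop as soon as the first thought has produced a translation\n>>>     return \"translated\" in outputs\n>>> chain = ManualSequentialChainOfThought(\n>>>     thoughts=[thought1, thought2], llm=llm, terminate=stop_when_translated\n>>> )\n>>> chain(word=\"hello\", language=\"French\")  # thought2 is skipped\n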

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class ManualSequentialChainOfThought(BaseComponent):\n    \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n    This method supports variable number of steps. Each step corresponds to a\n    `kotaemon.pipelines.cot.Thought`. Please refer that section for\n    Thought's detail. This section is about chaining thought together.\n\n    _**Usage:**_\n\n    **Create and run a chain of thought without \"+\" operator:**\n\n    ```pycon\n    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n    >>> llm = LCAzureChatOpenAI(...)\n    >>> thought1 = Thought(\n    >>>    prompt=\"Word {word} in {language} is \",\n    >>>    post_process=lambda string: {\"translated\": string},\n    >>> )\n    >>> thought2 = Thought(\n    >>>     prompt=\"Translate {translated} to Japanese\",\n    >>>     post_process=lambda string: {\"output\": string},\n    >>> )\n    >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n    >>> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    **Create and run a chain of thought without \"+\" operator:** Please refer the\n    `kotaemon.pipelines.cot.Thought` section for examples.\n\n    This chain-of-thought optionally takes a termination check callback function.\n    This function will be called after each thought is executed. It takes in a\n    dictionary of all thought outputs so far, and it returns True or False. If\n    True, the chain-of-thought will terminate. If unset, the default callback always\n    returns False.\n    \"\"\"\n\n    thoughts: List[Thought] = Param(\n        default_callback=lambda *_: [], help=\"List of Thought\"\n    )\n    llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n    terminate: Callable = Param(\n        default=lambda _: False,\n        help=\"Callback on terminate condition. Default to always return False\",\n    )\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the manual chain of thought\"\"\"\n\n        inputs = deepcopy(kwargs)\n        for idx, thought in enumerate(self.thoughts):\n            if self.llm:\n                thought.llm = self.llm\n            self._prepare_child(thought, f\"thought{idx}\")\n\n            output = thought(**inputs)\n            inputs.update(output.content)\n            if self.terminate(inputs):\n                break\n\n        return Document(inputs)\n\n    def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=self.thoughts + [next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/#llms.ManualSequentialChainOfThought.run","title":"run","text":"
    run(**kwargs)\n

    Run the manual chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the manual chain of thought\"\"\"\n\n    inputs = deepcopy(kwargs)\n    for idx, thought in enumerate(self.thoughts):\n        if self.llm:\n            thought.llm = self.llm\n        self._prepare_child(thought, f\"thought{idx}\")\n\n        output = thought(**inputs)\n        inputs.update(output.content)\n        if self.terminate(inputs):\n            break\n\n    return Document(inputs)\n
    "},{"location":"reference/llms/#llms.Thought","title":"Thought","text":"

    Bases: BaseComponent

    A thought in the chain of thought

    Usage:

    Create and run a thought:

    >> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n     prompt=\"How to {action} {object}?\",\n     llm=LCAzureChatOpenAI(...),\n     post_process=lambda string: {\"tutorial\": string},\n   )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n

    Basically, when a thought is run, it will:

    1. Populate the prompt template with the input **kwargs.
    2. Run the LLM model with the populated prompt.
    3. Post-process the LLM output with the post-processor.

    This Thought allows chaining sequentially with the + operator. For example:

    >> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n       prompt=\"Word {word} in {language} is \",\n       llm=llm,\n       post_process=lambda string: {\"translated\": string},\n   )\n>> thought2 = Thought(\n        prompt=\"Translate {translated} to Japanese\",\n        llm=llm,\n        post_process=lambda string: {\"output\": string},\n   )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Under the hood, when the + operator is used, a ManualSequentialChainOfThought is created.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class Thought(BaseComponent):\n    \"\"\"A thought in the chain of thought\n\n    - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n    value is the value.\n    - Output: an output dictionary\n\n    _**Usage:**_\n\n    Create and run a thought:\n\n    ```python\n    >> from kotaemon.pipelines.cot import Thought\n    >> thought = Thought(\n         prompt=\"How to {action} {object}?\",\n         llm=LCAzureChatOpenAI(...),\n         post_process=lambda string: {\"tutorial\": string},\n       )\n    >> output = thought(action=\"install\", object=\"python\")\n    >> print(output)\n    {'tutorial': 'As an AI language model,...'}\n    ```\n\n    Basically, when a thought is run, it will:\n\n    1. Populate the prompt template with the input `**kwargs`.\n    2. Run the LLM model with the populated prompt.\n    3. Post-process the LLM output with the post-processor.\n\n    This `Thought` allows chaining sequentially with the + operator. For example:\n\n    ```python\n    >> llm = LCAzureChatOpenAI(...)\n    >> thought1 = Thought(\n           prompt=\"Word {word} in {language} is \",\n           llm=llm,\n           post_process=lambda string: {\"translated\": string},\n       )\n    >> thought2 = Thought(\n            prompt=\"Translate {translated} to Japanese\",\n            llm=llm,\n            post_process=lambda string: {\"output\": string},\n       )\n\n    >> thought = thought1 + thought2\n    >> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n    is created.\n    \"\"\"\n\n    prompt: str = Param(\n        help=(\n            \"The prompt template string. This prompt template has Python-like variable\"\n            \" placeholders, that then will be substituted with real values when this\"\n            \" component is executed\"\n        )\n    )\n    llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n    post_process: Function = Node(\n        help=(\n            \"The function post-processor that post-processes LLM output prediction .\"\n            \"It should take a string as input (this is the LLM output text) and return \"\n            \"a dictionary, where the key should\"\n        )\n    )\n\n    @Node.auto(depends_on=\"prompt\")\n    def prompt_template(self):\n        \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n        return BasePromptComponent(template=self.prompt)\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the chain of thought\"\"\"\n        prompt = self.prompt_template(**kwargs).text\n        response = self.llm(prompt).text\n        response = self.post_process(response)\n\n        return Document(response)\n\n    def get_variables(self) -> List[str]:\n        return []\n\n    def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=[self, next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/#llms.Thought.prompt_template","title":"prompt_template","text":"
    prompt_template()\n

    Automatically wraps the prompt param in a BasePromptComponent. Can be ignored.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    @Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n    \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n    return BasePromptComponent(template=self.prompt)\n
    "},{"location":"reference/llms/#llms.Thought.run","title":"run","text":"
    run(**kwargs)\n

    Run the chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the chain of thought\"\"\"\n    prompt = self.prompt_template(**kwargs).text\n    response = self.llm(prompt).text\n    response = self.post_process(response)\n\n    return Document(response)\n
    "},{"location":"reference/llms/#llms.GatedLinearPipeline","title":"GatedLinearPipeline","text":"

    Bases: SimpleLinearPipeline

    A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.

    Attributes:

    condition (Callable[[IO_Type], Any]): A callable function that represents the condition.

    Usage:

    Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class GatedLinearPipeline(SimpleLinearPipeline):\n    \"\"\"\n    A pipeline that extends the SimpleLinearPipeline class and adds a condition\n        attribute.\n\n    Attributes:\n        condition (Callable[[IO_Type], Any]): A callable function that represents the\n            condition.\n\n    Usage:\n        ```{.py3 title=\"Example Usage\"}\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = GatedLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            condition=RegexExtractor(pattern=\"some pattern\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n        print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n        ```\n    \"\"\"\n\n    condition: Callable[[IO_Type], Any]\n\n    def run(\n        self,\n        *,\n        condition_text: Optional[str] = None,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ) -> Document:\n        \"\"\"\n        Run the pipeline with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            condition_text (str): The condition text to evaluate. Default to None.\n            llm_kwargs (dict): Additional keyword arguments for the language model call.\n            post_processor_kwargs (dict): Additional keyword arguments for the\n                post-processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the pipeline as a Document object.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided\")\n\n        if self.condition(condition_text)[0]:\n            return super().run(\n                llm_kwargs=llm_kwargs,\n                post_processor_kwargs=post_processor_kwargs,\n                **prompt_kwargs,\n            )\n\n        return Document(None)\n
    "},{"location":"reference/llms/#llms.GatedLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    condition_text=None,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the pipeline with the given arguments and return the final output as a Document object.

    Parameters:

    condition_text (str, default: None): The condition text to evaluate.

    llm_kwargs (dict, default: {}): Additional keyword arguments for the language model call.

    post_processor_kwargs (dict, default: {}): Additional keyword arguments for the post-processor.

    **prompt_kwargs (default: {}): Keyword arguments for populating the prompt.

    Returns:

    Document: The final output of the pipeline as a Document object.

    Raises:

    ValueError: If condition_text is None.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    condition_text: Optional[str] = None,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n) -> Document:\n    \"\"\"\n    Run the pipeline with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        condition_text (str): The condition text to evaluate. Default to None.\n        llm_kwargs (dict): Additional keyword arguments for the language model call.\n        post_processor_kwargs (dict): Additional keyword arguments for the\n            post-processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the pipeline as a Document object.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided\")\n\n    if self.condition(condition_text)[0]:\n        return super().run(\n            llm_kwargs=llm_kwargs,\n            post_processor_kwargs=post_processor_kwargs,\n            **prompt_kwargs,\n        )\n\n    return Document(None)\n
    "},{"location":"reference/llms/#llms.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":"

    Bases: BaseComponent

    A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.

    Attributes:

    prompt (BasePromptComponent): The prompt component used to generate the initial input.

    llm (Union[ChatLLM, LLM]): The language model component used to generate the output.

    post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional post-processor component or function.

    Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class SimpleLinearPipeline(BaseComponent):\n    \"\"\"\n    A simple pipeline for running a function with a prompt, a language model, and an\n        optional post-processor.\n\n    Attributes:\n        prompt (BasePromptComponent): The prompt component used to generate the initial\n            input.\n        llm (Union[ChatLLM, LLM]): The language model component used to generate the\n            output.\n        post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n            post-processor component or function.\n\n    Example Usage:\n        ```python\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = SimpleLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(word=\"lone\"))\n        ```\n    \"\"\"\n\n    prompt: BasePromptComponent\n    llm: Union[ChatLLM, LLM]\n    post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n    def run(\n        self,\n        *,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ):\n        \"\"\"\n        Run the function with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            llm_kwargs (dict): Keyword arguments for the llm call.\n            post_processor_kwargs (dict): Keyword arguments for the post_processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the function as a Document object.\n        \"\"\"\n        prompt = self.prompt(**prompt_kwargs)\n        llm_output = self.llm(prompt.text, **llm_kwargs)\n        if self.post_processor is not None:\n            final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n        else:\n            final_output = llm_output\n\n        return Document(final_output)\n
    "},{"location":"reference/llms/#llms.SimpleLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the function with the given arguments and return the final output as a Document object.

    Parameters:

    llm_kwargs (dict, default: {}): Keyword arguments for the llm call.

    post_processor_kwargs (dict, default: {}): Keyword arguments for the post_processor.

    **prompt_kwargs (default: {}): Keyword arguments for populating the prompt.

    Returns:

    Document: The final output of the function as a Document object.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n):\n    \"\"\"\n    Run the function with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        llm_kwargs (dict): Keyword arguments for the llm call.\n        post_processor_kwargs (dict): Keyword arguments for the post_processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the function as a Document object.\n    \"\"\"\n    prompt = self.prompt(**prompt_kwargs)\n    llm_output = self.llm(prompt.text, **llm_kwargs)\n    if self.post_processor is not None:\n        final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n    else:\n        final_output = llm_output\n\n    return Document(final_output)\n
    "},{"location":"reference/llms/#llms.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

    Parameters:

    template (PromptTemplate, required): The prompt template.

    **kwargs (default: {}): Any additional keyword arguments that will be used to populate the given template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, (int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
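    A short usage sketch (the template and values are illustrative):

    from kotaemon.llms import BasePromptComponent\n\nprompt = BasePromptComponent(template=\"What is {word} in {language}?\")\ndoc = prompt(word=\"hello\", language=\"Japanese\")\nprint(doc.text)  # \"What is hello in Japanese?\"\n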
    "},{"location":"reference/llms/#llms.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

    Parameters:

    kwargs (dict, default: {}): A dictionary with the attribute names as keys and the new values as values.

    Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
    "},{"location":"reference/llms/#llms.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

    Run the function with the given keyword arguments.

    Parameters:

    **kwargs (default: {}): The keyword arguments to pass to the function.

    Returns:

    The result of calling the populate method of the template object with the given keyword arguments.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
    "},{"location":"reference/llms/#llms.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
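    A small sketch of populate, partial_populate, and template concatenation (strings are illustrative):

    from kotaemon.llms import PromptTemplate\n\ntmpl = PromptTemplate(\"Translate {word} to {language}\")\nprint(tmpl.populate(word=\"hello\", language=\"French\"))  # Translate hello to French\nprint(tmpl.partial_populate(word=\"hello\"))  # unfilled placeholders are kept\n# the + operator concatenates two templates with a newline in between\nboth = tmpl + PromptTemplate(\"Keep the register formal.\")\n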
    "},{"location":"reference/llms/#llms.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

    This function checks whether every placeholder in the template is provided among the keyword arguments. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

    Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
    "},{"location":"reference/llms/#llms.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

    Check for redundant keyword arguments.

    This function compares the provided keyword arguments against the template's placeholders. If any provided keys do not appear in the template, a UserWarning is issued listing the redundant keys.

    Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
    "},{"location":"reference/llms/#llms.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

    Parameters:

    **kwargs (default: {}): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.

    Returns:

    str: The populated template.

    Raises:

    ValueError: If a required placeholder is missing.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
    "},{"location":"reference/llms/#llms.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

    Parameters:

    **kwargs (default: {}): The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.

    Returns:

    str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
    "},{"location":"reference/llms/base/","title":"Base","text":""},{"location":"reference/llms/branching/","title":"Branching","text":""},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":"

    Bases: BaseComponent

    A simple branching pipeline for executing multiple branches.

    Attributes:

    branches (List[BaseComponent]): The list of branches to be executed.

    Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class SimpleBranchingPipeline(BaseComponent):\n    \"\"\"\n    A simple branching pipeline for executing multiple branches.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = SimpleBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        print(pipeline(condition_text=\"12\"))\n        ```\n    \"\"\"\n\n    branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n    def add_branch(self, component: BaseComponent):\n        \"\"\"\n        Add a new branch to the pipeline.\n\n        Args:\n            component (BaseComponent): The branch component to be added.\n        \"\"\"\n        self.branches.append(component)\n\n    def run(self, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the outputs as a list.\n\n        Args:\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            List: The outputs of each branch as a list.\n        \"\"\"\n        output = []\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output.append(branch(**prompt_kwargs))\n\n        return output\n
    "},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"
    add_branch(component)\n

    Add a new branch to the pipeline.

    Parameters:

    component (BaseComponent, required): The branch component to be added.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def add_branch(self, component: BaseComponent):\n    \"\"\"\n    Add a new branch to the pipeline.\n\n    Args:\n        component (BaseComponent): The branch component to be added.\n    \"\"\"\n    self.branches.append(component)\n
    "},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.run","title":"run","text":"
    run(**prompt_kwargs)\n

    Execute the pipeline by running each branch and return the outputs as a list.

    Parameters:

    **prompt_kwargs (default: {}): Keyword arguments for the branches.

    Returns:

    List: The outputs of each branch as a list.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the outputs as a list.\n\n    Args:\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        List: The outputs of each branch as a list.\n    \"\"\"\n    output = []\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output.append(branch(**prompt_kwargs))\n\n    return output\n
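As a usage sketch (reusing the pipeline built in the class-level example above, so every name here is assumed from that snippet), run returns one entry per branch, in the order the branches were added:

```python
# Continuing the class-level example: the call returns a list aligned with
# the registered branches (branch-0, branch-1, branch-2).
outputs = pipeline(condition_text="1")
for i, out in enumerate(outputs):
    # GatedLinearPipeline branches whose condition does not match return an
    # empty Document(None), so only branch-1 should carry a real answer here.
    print(f"branch-{i}:", out)
```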
    "},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":"

    Bases: SimpleBranchingPipeline

    A simple gated branching pipeline for executing multiple branches based on a condition.

    This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| branches | List[BaseComponent] | The list of branches to be executed. |

    Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class GatedBranchingPipeline(SimpleBranchingPipeline):\n    \"\"\"\n    A simple gated branching pipeline for executing multiple branches based on a\n        condition.\n\n    This class extends the SimpleBranchingPipeline class and adds the ability to execute\n        the branches until a branch returns a non-empty output based on a condition.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = GatedBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        ```\n    \"\"\"\n\n    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the output of the first\n            branch that returns a non-empty output based on the provided condition.\n\n        Args:\n            condition_text (str): The condition text to evaluate for each branch.\n                Default to None.\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            Union[OutputType, None]: The output of the first branch that satisfies the\n            condition, or None if no branch satisfies the condition.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided.\")\n\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output = branch(condition_text=condition_text, **prompt_kwargs)\n            if output:\n                return output\n\n        return Document(None)\n
    "},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline.run","title":"run","text":"
    run(*, condition_text=None, **prompt_kwargs)\n

    Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| condition_text | str | The condition text to evaluate for each branch. Defaults to None. | None |
| **prompt_kwargs | | Keyword arguments for the branches. | {} |

Returns:

| Type | Description |
| --- | --- |
| Union[OutputType, None] | The output of the first branch that satisfies the condition, or None if no branch satisfies the condition. |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If condition_text is None. |

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the output of the first\n        branch that returns a non-empty output based on the provided condition.\n\n    Args:\n        condition_text (str): The condition text to evaluate for each branch.\n            Default to None.\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        Union[OutputType, None]: The output of the first branch that satisfies the\n        condition, or None if no branch satisfies the condition.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided.\")\n\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output = branch(condition_text=condition_text, **prompt_kwargs)\n        if output:\n            return output\n\n    return Document(None)\n
    "},{"location":"reference/llms/cot/","title":"Cot","text":""},{"location":"reference/llms/cot/#llms.cot.Thought","title":"Thought","text":"

    Bases: BaseComponent

    A thought in the chain of thought

    Usage:

    Create and run a thought:

    >> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n     prompt=\"How to {action} {object}?\",\n     llm=LCAzureChatOpenAI(...),\n     post_process=lambda string: {\"tutorial\": string},\n   )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n

    Basically, when a thought is run, it will:

    1. Populate the prompt template with the input **kwargs.
    2. Run the LLM model with the populated prompt.
    3. Post-process the LLM output with the post-processor.

    This Thought allows chaining sequentially with the + operator. For example:

    >> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n       prompt=\"Word {word} in {language} is \",\n       llm=llm,\n       post_process=lambda string: {\"translated\": string},\n   )\n>> thought2 = Thought(\n        prompt=\"Translate {translated} to Japanese\",\n        llm=llm,\n        post_process=lambda string: {\"output\": string},\n   )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Under the hood, when the + operator is used, a ManualSequentialChainOfThought is created.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class Thought(BaseComponent):\n    \"\"\"A thought in the chain of thought\n\n    - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n    value is the value.\n    - Output: an output dictionary\n\n    _**Usage:**_\n\n    Create and run a thought:\n\n    ```python\n    >> from kotaemon.pipelines.cot import Thought\n    >> thought = Thought(\n         prompt=\"How to {action} {object}?\",\n         llm=LCAzureChatOpenAI(...),\n         post_process=lambda string: {\"tutorial\": string},\n       )\n    >> output = thought(action=\"install\", object=\"python\")\n    >> print(output)\n    {'tutorial': 'As an AI language model,...'}\n    ```\n\n    Basically, when a thought is run, it will:\n\n    1. Populate the prompt template with the input `**kwargs`.\n    2. Run the LLM model with the populated prompt.\n    3. Post-process the LLM output with the post-processor.\n\n    This `Thought` allows chaining sequentially with the + operator. For example:\n\n    ```python\n    >> llm = LCAzureChatOpenAI(...)\n    >> thought1 = Thought(\n           prompt=\"Word {word} in {language} is \",\n           llm=llm,\n           post_process=lambda string: {\"translated\": string},\n       )\n    >> thought2 = Thought(\n            prompt=\"Translate {translated} to Japanese\",\n            llm=llm,\n            post_process=lambda string: {\"output\": string},\n       )\n\n    >> thought = thought1 + thought2\n    >> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n    is created.\n    \"\"\"\n\n    prompt: str = Param(\n        help=(\n            \"The prompt template string. This prompt template has Python-like variable\"\n            \" placeholders, that then will be substituted with real values when this\"\n            \" component is executed\"\n        )\n    )\n    llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n    post_process: Function = Node(\n        help=(\n            \"The function post-processor that post-processes LLM output prediction .\"\n            \"It should take a string as input (this is the LLM output text) and return \"\n            \"a dictionary, where the key should\"\n        )\n    )\n\n    @Node.auto(depends_on=\"prompt\")\n    def prompt_template(self):\n        \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n        return BasePromptComponent(template=self.prompt)\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the chain of thought\"\"\"\n        prompt = self.prompt_template(**kwargs).text\n        response = self.llm(prompt).text\n        response = self.post_process(response)\n\n        return Document(response)\n\n    def get_variables(self) -> List[str]:\n        return []\n\n    def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=[self, next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/cot/#llms.cot.Thought.prompt_template","title":"prompt_template","text":"
    prompt_template()\n

    Automatically wrap around param prompt. Can ignore

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    @Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n    \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n    return BasePromptComponent(template=self.prompt)\n
    "},{"location":"reference/llms/cot/#llms.cot.Thought.run","title":"run","text":"
    run(**kwargs)\n

    Run the chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the chain of thought\"\"\"\n    prompt = self.prompt_template(**kwargs).text\n    response = self.llm(prompt).text\n    response = self.post_process(response)\n\n    return Document(response)\n
    "},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":"

    Bases: BaseComponent

    Perform sequential chain-of-thought with manual pre-defined prompts

This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought; please refer to that section for details on Thought. This section is about chaining thoughts together.

    Usage:

Create and run a chain of thought without the \"+\" operator:

    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>>    prompt=\"Word {word} in {language} is \",\n>>>    post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>>     prompt=\"Translate {translated} to Japanese\",\n>>>     post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

Create and run a chain of thought with the \"+\" operator: please refer to the kotaemon.pipelines.cot.Thought section for examples.

This chain-of-thought optionally takes a termination-check callback function. The callback is invoked after each thought executes; it receives a dictionary of all thought outputs so far and returns True or False. If it returns True, the chain-of-thought terminates. If unset, the default callback always returns False.
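A minimal sketch of that termination hook, reusing thought1, thought2, and llm from the example above; the callback name and stopping rule are illustrative assumptions:

```python
# Hypothetical terminate callback: stop the chain as soon as an "output"
# key shows up in the accumulated dictionary of thought outputs.
def stop_when_output_ready(outputs: dict) -> bool:
    return "output" in outputs

chain = ManualSequentialChainOfThought(
    thoughts=[thought1, thought2],  # assumed to be defined as above
    llm=llm,
    terminate=stop_when_output_ready,
)
result = chain(word="hello", language="French")  # stops once "output" is set
```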

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class ManualSequentialChainOfThought(BaseComponent):\n    \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n    This method supports variable number of steps. Each step corresponds to a\n    `kotaemon.pipelines.cot.Thought`. Please refer that section for\n    Thought's detail. This section is about chaining thought together.\n\n    _**Usage:**_\n\n    **Create and run a chain of thought without \"+\" operator:**\n\n    ```pycon\n    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n    >>> llm = LCAzureChatOpenAI(...)\n    >>> thought1 = Thought(\n    >>>    prompt=\"Word {word} in {language} is \",\n    >>>    post_process=lambda string: {\"translated\": string},\n    >>> )\n    >>> thought2 = Thought(\n    >>>     prompt=\"Translate {translated} to Japanese\",\n    >>>     post_process=lambda string: {\"output\": string},\n    >>> )\n    >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n    >>> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    **Create and run a chain of thought without \"+\" operator:** Please refer the\n    `kotaemon.pipelines.cot.Thought` section for examples.\n\n    This chain-of-thought optionally takes a termination check callback function.\n    This function will be called after each thought is executed. It takes in a\n    dictionary of all thought outputs so far, and it returns True or False. If\n    True, the chain-of-thought will terminate. If unset, the default callback always\n    returns False.\n    \"\"\"\n\n    thoughts: List[Thought] = Param(\n        default_callback=lambda *_: [], help=\"List of Thought\"\n    )\n    llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n    terminate: Callable = Param(\n        default=lambda _: False,\n        help=\"Callback on terminate condition. Default to always return False\",\n    )\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the manual chain of thought\"\"\"\n\n        inputs = deepcopy(kwargs)\n        for idx, thought in enumerate(self.thoughts):\n            if self.llm:\n                thought.llm = self.llm\n            self._prepare_child(thought, f\"thought{idx}\")\n\n            output = thought(**inputs)\n            inputs.update(output.content)\n            if self.terminate(inputs):\n                break\n\n        return Document(inputs)\n\n    def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=self.thoughts + [next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought.run","title":"run","text":"
    run(**kwargs)\n

    Run the manual chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the manual chain of thought\"\"\"\n\n    inputs = deepcopy(kwargs)\n    for idx, thought in enumerate(self.thoughts):\n        if self.llm:\n            thought.llm = self.llm\n        self._prepare_child(thought, f\"thought{idx}\")\n\n        output = thought(**inputs)\n        inputs.update(output.content)\n        if self.terminate(inputs):\n            break\n\n    return Document(inputs)\n
    "},{"location":"reference/llms/linear/","title":"Linear","text":""},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":"

    Bases: BaseComponent

    A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| prompt | BasePromptComponent | The prompt component used to generate the initial input. |
| llm | Union[ChatLLM, LLM] | The language model component used to generate the output. |
| post_processor | Union[BaseComponent, Callable[[IO_Type], IO_Type]] | An optional post-processor component or function. |

    Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class SimpleLinearPipeline(BaseComponent):\n    \"\"\"\n    A simple pipeline for running a function with a prompt, a language model, and an\n        optional post-processor.\n\n    Attributes:\n        prompt (BasePromptComponent): The prompt component used to generate the initial\n            input.\n        llm (Union[ChatLLM, LLM]): The language model component used to generate the\n            output.\n        post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n            post-processor component or function.\n\n    Example Usage:\n        ```python\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = SimpleLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(word=\"lone\"))\n        ```\n    \"\"\"\n\n    prompt: BasePromptComponent\n    llm: Union[ChatLLM, LLM]\n    post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n    def run(\n        self,\n        *,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ):\n        \"\"\"\n        Run the function with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            llm_kwargs (dict): Keyword arguments for the llm call.\n            post_processor_kwargs (dict): Keyword arguments for the post_processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the function as a Document object.\n        \"\"\"\n        prompt = self.prompt(**prompt_kwargs)\n        llm_output = self.llm(prompt.text, **llm_kwargs)\n        if self.post_processor is not None:\n            final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n        else:\n            final_output = llm_output\n\n        return Document(final_output)\n
    "},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the function with the given arguments and return the final output as a Document object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| llm_kwargs | dict | Keyword arguments for the llm call. | {} |
| post_processor_kwargs | dict | Keyword arguments for the post_processor. | {} |
| **prompt_kwargs | | Keyword arguments for populating the prompt. | {} |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Document | | The final output of the function as a Document object. |

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n):\n    \"\"\"\n    Run the function with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        llm_kwargs (dict): Keyword arguments for the llm call.\n        post_processor_kwargs (dict): Keyword arguments for the post_processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the function as a Document object.\n    \"\"\"\n    prompt = self.prompt(**prompt_kwargs)\n    llm_output = self.llm(prompt.text, **llm_kwargs)\n    if self.post_processor is not None:\n        final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n    else:\n        final_output = llm_output\n\n    return Document(final_output)\n
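To make the three keyword groups concrete, here is a hedged sketch reusing the pipeline from the example above; the specific llm_kwargs shown are an assumption about what the underlying LLM call accepts:

```python
# prompt_kwargs fill the template, llm_kwargs are forwarded to the LLM call,
# and post_processor_kwargs are forwarded to the post-processor.
doc = pipeline(
    word="cat",                    # substituted into "what is {word} in Japanese ?"
    llm_kwargs={"stop": ["\n"]},   # assumed to be accepted by the LLM call
    post_processor_kwargs={},      # identity takes no extra keyword arguments
)
print(doc)
```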
    "},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline","title":"GatedLinearPipeline","text":"

    Bases: SimpleLinearPipeline

    A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| condition | Callable[[IO_Type], Any] | A callable function that represents the condition. |

Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class GatedLinearPipeline(SimpleLinearPipeline):\n    \"\"\"\n    A pipeline that extends the SimpleLinearPipeline class and adds a condition\n        attribute.\n\n    Attributes:\n        condition (Callable[[IO_Type], Any]): A callable function that represents the\n            condition.\n\n    Usage:\n        ```{.py3 title=\"Example Usage\"}\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = GatedLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            condition=RegexExtractor(pattern=\"some pattern\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n        print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n        ```\n    \"\"\"\n\n    condition: Callable[[IO_Type], Any]\n\n    def run(\n        self,\n        *,\n        condition_text: Optional[str] = None,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ) -> Document:\n        \"\"\"\n        Run the pipeline with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            condition_text (str): The condition text to evaluate. Default to None.\n            llm_kwargs (dict): Additional keyword arguments for the language model call.\n            post_processor_kwargs (dict): Additional keyword arguments for the\n                post-processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the pipeline as a Document object.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided\")\n\n        if self.condition(condition_text)[0]:\n            return super().run(\n                llm_kwargs=llm_kwargs,\n                post_processor_kwargs=post_processor_kwargs,\n                **prompt_kwargs,\n            )\n\n        return Document(None)\n
    "},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    condition_text=None,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the pipeline with the given arguments and return the final output as a Document object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| condition_text | str | The condition text to evaluate. Defaults to None. | None |
| llm_kwargs | dict | Additional keyword arguments for the language model call. | {} |
| post_processor_kwargs | dict | Additional keyword arguments for the post-processor. | {} |
| **prompt_kwargs | | Keyword arguments for populating the prompt. | {} |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| Document | Document | The final output of the pipeline as a Document object. |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If condition_text is None. |

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    condition_text: Optional[str] = None,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n) -> Document:\n    \"\"\"\n    Run the pipeline with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        condition_text (str): The condition text to evaluate. Default to None.\n        llm_kwargs (dict): Additional keyword arguments for the language model call.\n        post_processor_kwargs (dict): Additional keyword arguments for the\n            post-processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the pipeline as a Document object.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided\")\n\n    if self.condition(condition_text)[0]:\n        return super().run(\n            llm_kwargs=llm_kwargs,\n            post_processor_kwargs=post_processor_kwargs,\n            **prompt_kwargs,\n        )\n\n    return Document(None)\n
    "},{"location":"reference/llms/chats/","title":"Chats","text":""},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

A ChatLLM that uses an endpoint to generate responses. It expects an OpenAI-API-compatible endpoint.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| endpoint_url | str | The URL of an OpenAI-API-compatible endpoint. |
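A minimal usage sketch, assuming EndpointChatLLM is importable from kotaemon.llms like the other chat models and that an OpenAI-compatible server is already listening at the (hypothetical) URL below:

```python
from kotaemon.llms import EndpointChatLLM  # assumed export path

# Hypothetical local endpoint implementing the OpenAI /chat/completions shape.
llm = EndpointChatLLM(endpoint_url="http://localhost:8000/v1/chat/completions")

response = llm("What is the capital of France?")  # __call__ dispatches to run()
print(response.content)
```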

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

Generate a response from messages.

Parameters:

| Name | Type | Description |
| --- | --- | --- |
| messages | str \| BaseMessage \| list[BaseMessage] | History of messages to generate a response from. |
| **kwargs | | Additional arguments to pass to the OpenAI API. |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| LLMInterface | | The generated response. |

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/#llms.chats.LCChatMixin","title":"LCChatMixin","text":"

Mixin for Langchain-based chat models
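A minimal sketch of how a concrete wrapper sits on top of this mixin: a subclass only has to return the relevant Langchain class from _get_lc_class. The module paths and the class name here are assumptions for illustration:

```python
from kotaemon.llms import ChatLLM  # assumed export path
from kotaemon.llms.chats.langchain_based import LCChatMixin  # assumed module path

class MyLCChatOpenAI(LCChatMixin, ChatLLM):
    def _get_lc_class(self):
        # Any Langchain chat-model class can be returned here; the mixin
        # instantiates it with the constructor kwargs and proxies attributes.
        from langchain_openai import ChatOpenAI

        return ChatOpenAI

# Constructor kwargs are passed straight through to the Langchain class
# (which reads OPENAI_API_KEY from the environment if no key is given).
llm = MyLCChatOpenAI(model="gpt-4o-mini", temperature=0)
```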

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n    \"\"\"Mixin for langchain based chat models\"\"\"\n\n    def _get_lc_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant Langchain class in _get_lc_class\"\n        )\n\n    def _get_tool_call_kwargs(self):\n        return {}\n\n    def __init__(self, stream: bool = False, **params):\n        self._lc_class = self._get_lc_class()\n        self._obj = self._lc_class(**params)\n        self._kwargs: dict = params\n        self._stream = stream\n\n        super().__init__()\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        if self._stream:\n            return self.stream(messages, **kwargs)  # type: ignore\n        return self.invoke(messages, **kwargs)\n\n    def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        return input_\n\n    def prepare_response(self, pred):\n        all_text = [each.text for each in pred.generations[0]]\n        all_messages = [each.message for each in pred.generations[0]]\n\n        completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n        try:\n            if pred.llm_output is not None:\n                completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n                total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n                prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n        except Exception:\n            pass\n\n        return LLMInterface(\n            text=all_text[0] if len(all_text) > 0 else \"\",\n            candidates=all_text,\n            completion_tokens=completion_tokens,\n            total_tokens=total_tokens,\n            prompt_tokens=prompt_tokens,\n            messages=all_messages,\n            logits=[],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Generate response from messages\n\n        Args:\n            messages: history of messages to generate response from\n            **kwargs: additional arguments to pass to the langchain chat model\n\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        input_ = self.prepare_message(messages)\n\n        if \"tools_pydantic\" in kwargs:\n            tools = kwargs.pop(\n                \"tools_pydantic\",\n            )\n            lc_tool_call = self._obj.bind_tools(tools)\n            pred = lc_tool_call.invoke(\n                input_,\n                **self._get_tool_call_kwargs(),\n            )\n            if pred.tool_calls:\n                tool_calls = pred.tool_calls\n            else:\n                tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n            output = LLMInterface(\n                content=\"\",\n                additional_kwargs={\"tool_calls\": tool_calls},\n            )\n        else:\n            pred = self._obj.generate(messages=[input_], **kwargs)\n            output = self.prepare_response(pred)\n\n        return output\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        input_ = self.prepare_message(messages)\n        pred = await self._obj.agenerate(messages=[input_], **kwargs)\n        return self.prepare_response(pred)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        for response in self._obj.stream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        async for response in self._obj.astream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    def to_langchain_format(self):\n        return self._obj\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name, value):\n        if name == \"_lc_class\":\n            return super().__setattr__(name, value)\n\n        if name in self._lc_class.__fields__:\n            self._kwargs[name] = value\n            self._obj = self._lc_class(**self._kwargs)\n        else:\n            super().__setattr__(name, value)\n\n    def __getattr__(self, name):\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def specs(self, path: str):\n        path = path.strip(\".\")\n        if \".\" in path:\n            raise ValueError(\"path should not contain '.'\")\n\n        if path in self._lc_class.__fields__:\n            return {\n                \"__type__\": \"theflow.base.ParamAttr\",\n                \"refresh_on_set\": True,\n                \"strict_type\": True,\n            }\n\n        raise ValueError(f\"Invalid param {path}\")\n
    "},{"location":"reference/llms/chats/#llms.chats.LCChatMixin.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

Generate a response from messages.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| messages | str \| BaseMessage \| list[BaseMessage] | History of messages to generate a response from. | required |
| **kwargs | | Additional arguments to pass to the langchain chat model. | {} |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| LLMInterface | LLMInterface | The generated response. |

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Generate response from messages\n\n    Args:\n        messages: history of messages to generate response from\n        **kwargs: additional arguments to pass to the langchain chat model\n\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    input_ = self.prepare_message(messages)\n\n    if \"tools_pydantic\" in kwargs:\n        tools = kwargs.pop(\n            \"tools_pydantic\",\n        )\n        lc_tool_call = self._obj.bind_tools(tools)\n        pred = lc_tool_call.invoke(\n            input_,\n            **self._get_tool_call_kwargs(),\n        )\n        if pred.tool_calls:\n            tool_calls = pred.tool_calls\n        else:\n            tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n        output = LLMInterface(\n            content=\"\",\n            additional_kwargs={\"tool_calls\": tool_calls},\n        )\n    else:\n        pred = self._obj.generate(messages=[input_], **kwargs)\n        output = self.prepare_response(pred)\n\n    return output\n
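As the source above shows, passing a tools_pydantic keyword switches invoke into tool-calling mode via Langchain's bind_tools. A hedged sketch; the Weather schema and the llm instance are assumptions:

```python
from pydantic import BaseModel

class Weather(BaseModel):
    """Get the current weather for a city."""

    city: str

# llm is assumed to be any LCChatMixin-based chat model, as sketched earlier.
result = llm.invoke("What's the weather in Paris?", tools_pydantic=[Weather])
print(result.additional_kwargs["tool_calls"])  # the parsed tool-call requests
```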
    "},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

Wrapper around llama-cpp-python's Llama model
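A minimal construction sketch; the model path and chat format below are placeholders to be replaced with real values:

```python
from kotaemon.llms import LlamaCppChat  # assumed export path

llm = LlamaCppChat(
    model_path="/path/to/model.gguf",  # or pass repo_id= and filename= instead
    chat_format="llama-2",             # must match the model's chat template
    n_ctx=2048,                        # text context size (0 = read from model)
    n_gpu_layers=0,                    # -1 offloads all layers to the GPU
)
print(llm("Hello!").content)
```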

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
    "},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure
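A minimal construction sketch; the endpoint, deployment name, API version, and key are placeholders:

```python
from kotaemon.llms import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_endpoint="https://<your-resource>.openai.azure.com/",
    azure_deployment="<your-deployment-name>",
    api_version="<your-api-version>",
    api_key="<your-azure-openai-key>",
    temperature=0,
)
print(llm("Hello!").text)
```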

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| async_version | bool | Whether to get the async version of the client | False |

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model
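A minimal construction sketch; the model name and key are placeholders, and base_url only needs to be set when talking to an OpenAI-compatible proxy:

```python
from kotaemon.llms import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",          # required; any available OpenAI model id
    api_key="<your-openai-key>",
    temperature=0,
)
print(llm("Hello!").text)
```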

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/base/","title":"Base","text":""},{"location":"reference/llms/chats/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.

    Attributes:

    Name Type Description endpoint_url str

    The URL of an OpenAI API compatible endpoint.

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
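    A sketch of pointing `EndpointChatLLM` at a locally served OpenAI-compatible endpoint; the URL and import path are illustrative assumptions:

    from kotaemon.llms.chats.endpoint_based import EndpointChatLLM  # path assumed from the module location

    llm = EndpointChatLLM(
        endpoint_url="http://localhost:8000/v1/chat/completions"  # hypothetical local server
    )
    # run() POSTs the messages and expects an OpenAI-style JSON body
    # with "choices" and "usage" keys in the response
    reply = llm.run("What does this endpoint expect?")
    print(reply.content)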
    "},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

    Generate response from messages

    Parameters:

    Name Type Description Default messages str | BaseMessage | list[BaseMessage]

    history of messages to generate response from

    required **kwargs

    additional arguments to pass to the OpenAI API

    {}

    Returns:

    Name Type Description LLMInterface LLMInterface

    generated response

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin","title":"LCChatMixin","text":"

    Mixin for Langchain-based chat models

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    class LCChatMixin:\n    \"\"\"Mixin for Langchain-based chat models\"\"\"\n\n    def _get_lc_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant Langchain class in _get_lc_class\"\n        )\n\n    def _get_tool_call_kwargs(self):\n        return {}\n\n    def __init__(self, stream: bool = False, **params):\n        self._lc_class = self._get_lc_class()\n        self._obj = self._lc_class(**params)\n        self._kwargs: dict = params\n        self._stream = stream\n\n        super().__init__()\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        if self._stream:\n            return self.stream(messages, **kwargs)  # type: ignore\n        return self.invoke(messages, **kwargs)\n\n    def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        return input_\n\n    def prepare_response(self, pred):\n        all_text = [each.text for each in pred.generations[0]]\n        all_messages = [each.message for each in pred.generations[0]]\n\n        completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n        try:\n            if pred.llm_output is not None:\n                completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n                total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n                prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n        except Exception:\n            pass\n\n        return LLMInterface(\n            text=all_text[0] if len(all_text) > 0 else \"\",\n            candidates=all_text,\n            completion_tokens=completion_tokens,\n            total_tokens=total_tokens,\n            prompt_tokens=prompt_tokens,\n            messages=all_messages,\n            logits=[],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Generate response from messages\n\n        Args:\n            messages: history of messages to generate response from\n            **kwargs: additional arguments to pass to the langchain chat model\n\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        input_ = self.prepare_message(messages)\n\n        if \"tools_pydantic\" in kwargs:\n            tools = kwargs.pop(\n                \"tools_pydantic\",\n            )\n            lc_tool_call = self._obj.bind_tools(tools)\n            pred = lc_tool_call.invoke(\n                input_,\n                **self._get_tool_call_kwargs(),\n            )\n            if pred.tool_calls:\n                tool_calls = pred.tool_calls\n            else:\n                tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n            output = LLMInterface(\n                content=\"\",\n                additional_kwargs={\"tool_calls\": tool_calls},\n            )\n        else:\n            pred = self._obj.generate(messages=[input_], **kwargs)\n            output = self.prepare_response(pred)\n\n        return output\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        input_ = self.prepare_message(messages)\n        pred = await self._obj.agenerate(messages=[input_], **kwargs)\n        return self.prepare_response(pred)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        for response in self._obj.stream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        async for response in self._obj.astream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    def to_langchain_format(self):\n        return self._obj\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name, value):\n        if name == \"_lc_class\":\n            return super().__setattr__(name, value)\n\n        if name in self._lc_class.__fields__:\n            self._kwargs[name] = value\n            self._obj = self._lc_class(**self._kwargs)\n        else:\n            super().__setattr__(name, value)\n\n    def __getattr__(self, name):\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def specs(self, path: str):\n        path = path.strip(\".\")\n        if \".\" in path:\n            raise ValueError(\"path should not contain '.'\")\n\n        if path in self._lc_class.__fields__:\n            return {\n                \"__type__\": \"theflow.base.ParamAttr\",\n                \"refresh_on_set\": True,\n                \"strict_type\": True,\n            }\n\n        raise ValueError(f\"Invalid param {path}\")\n
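    A subclassing sketch: the mixin only needs `_get_lc_class` to return the Langchain chat class; run/invoke/stream are inherited. The Anthropic wrapper below is hypothetical and assumes `langchain-anthropic` is installed and that `ChatLLM` combines with the mixin the same way the shipped wrappers do:

    from kotaemon.llms import ChatLLM                          # assumed export
    from kotaemon.llms.chats.langchain_based import LCChatMixin

    class LCAnthropicChat(LCChatMixin, ChatLLM):               # hypothetical wrapper
        def _get_lc_class(self):
            from langchain_anthropic import ChatAnthropic      # requires langchain-anthropic
            return ChatAnthropic

    llm = LCAnthropicChat(model="claude-3-haiku-20240307", api_key="...")
    # prepare_response fills the `text` field of LLMInterface
    print(llm.invoke("Hello").text)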
    "},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Generate response from messages

    Parameters:

    Name Type Description Default messages str | BaseMessage | list[BaseMessage]

    history of messages to generate response from

    required **kwargs

    additional arguments to pass to the langchain chat model

    {}

    Returns:

    Name Type Description LLMInterface LLMInterface

    generated response

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Generate response from messages\n\n    Args:\n        messages: history of messages to generate response from\n        **kwargs: additional arguments to pass to the langchain chat model\n\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    input_ = self.prepare_message(messages)\n\n    if \"tools_pydantic\" in kwargs:\n        tools = kwargs.pop(\n            \"tools_pydantic\",\n        )\n        lc_tool_call = self._obj.bind_tools(tools)\n        pred = lc_tool_call.invoke(\n            input_,\n            **self._get_tool_call_kwargs(),\n        )\n        if pred.tool_calls:\n            tool_calls = pred.tool_calls\n        else:\n            tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n        output = LLMInterface(\n            content=\"\",\n            additional_kwargs={\"tool_calls\": tool_calls},\n        )\n    else:\n        pred = self._obj.generate(messages=[input_], **kwargs)\n        output = self.prepare_response(pred)\n\n    return output\n
    "},{"location":"reference/llms/chats/llamacpp/","title":"Llamacpp","text":""},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

    Wrapper around llama-cpp-python's Llama model

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
    "},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/chats/openai/","title":"Openai","text":""},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI","title":"BaseChatOpenAI","text":"

    Bases: ChatLLM

    Base interface for OpenAI chat model, using the openai library

    This class exposes the parameters in resources.Chat. To subclass this class:

    - Implement the `prepare_client` method to return the OpenAI client\n- Implement the `openai_response` method to return the OpenAI response\n- Implement the params related to the OpenAI client\n
    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class BaseChatOpenAI(ChatLLM):\n    \"\"\"Base interface for OpenAI chat model, using the openai library\n\n    This class exposes the parameters in resources.Chat. To subclass this class:\n\n        - Implement the `prepare_client` method to return the OpenAI client\n        - Implement the `openai_response` method to return the OpenAI response\n        - Implement the params related to the OpenAI client\n    \"\"\"\n\n    _dependencies = [\"openai\"]\n    _capabilities = [\"chat\", \"text\"]  # consider as mixin\n\n    api_key: str = Param(help=\"API key\", required=True)\n    timeout: Optional[float] = Param(None, help=\"Timeout for the API request\")\n    max_retries: Optional[int] = Param(\n        None, help=\"Maximum number of retries for the API request\"\n    )\n\n    temperature: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between 0 and 2 that controls the randomness of the generated \"\n            \"tokens. Lower values make the model more deterministic, while higher \"\n            \"values make the model more random.\"\n        ),\n    )\n    max_tokens: Optional[int] = Param(\n        None,\n        help=(\n            \"Maximum number of tokens to generate. The total length of input tokens \"\n            \"and generated tokens is limited by the model's context length.\"\n        ),\n    )\n    n: int = Param(\n        1,\n        help=(\n            \"Number of completions to generate. The API will generate n completions \"\n            \"for each prompt.\"\n        ),\n    )\n    stop: Optional[str | list[str]] = Param(\n        None,\n        help=(\n            \"Stop sequence. If a stop sequence is detected, generation will stop \"\n            \"at that point. If not specified, generation will continue until the \"\n            \"maximum token length is reached.\"\n        ),\n    )\n    frequency_penalty: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n            \"based on their existing frequency in the text so far, decreasing the \"\n            \"model's likelihood of repeating the same text.\"\n        ),\n    )\n    presence_penalty: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n            \"based on their existing presence in the text so far, decreasing the \"\n            \"model's likelihood of repeating the same text.\"\n        ),\n    )\n    tool_choice: Optional[str] = Param(\n        None,\n        help=(\n            \"Choice of tool to use for the completion. Available choices are: \"\n            \"auto, default.\"\n        ),\n    )\n    tools: Optional[list[str]] = Param(\n        None,\n        help=\"List of tools to use for the completion.\",\n    )\n    logprobs: Optional[bool] = Param(\n        None,\n        help=(\n            \"Include log probabilities on the logprobs most likely tokens, \"\n            \"as well as the chosen token.\"\n        ),\n    )\n    logit_bias: Optional[dict] = Param(\n        None,\n        help=(\n            \"Dictionary of logit bias values to add to the logits of the tokens \"\n            \"in the vocabulary.\"\n        ),\n    )\n    top_logprobs: Optional[int] = Param(\n        None,\n        help=(\n            \"An integer between 0 and 5 specifying the number of most likely tokens \"\n            \"to return at each token position, each with an associated log \"\n            \"probability. `logprobs` must also be set to `true` if this parameter \"\n            \"is used.\"\n        ),\n    )\n    top_p: Optional[float] = Param(\n        None,\n        help=(\n            \"An alternative to sampling with temperature, called nucleus sampling, \"\n            \"where the model considers the results of the token with top_p \"\n            \"probability mass. So 0.1 means that only the tokens comprising the \"\n            \"top 10% probability mass are considered.\"\n        ),\n    )\n\n    @Param.auto(depends_on=[\"max_retries\"])\n    def max_retries_(self):\n        if self.max_retries is None:\n            from openai._constants import DEFAULT_MAX_RETRIES\n\n            return DEFAULT_MAX_RETRIES\n        return self.max_retries\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[\"ChatCompletionMessageParam\"]:\n        \"\"\"Prepare the message into OpenAI format\n\n        Returns:\n            list[dict]: List of messages in OpenAI format\n        \"\"\"\n        input_: list[BaseMessage] = []\n        output_: list[\"ChatCompletionMessageParam\"] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        for message in input_:\n            output_.append(message.to_openai_format())\n\n        return output_\n\n    def prepare_output(self, resp: dict) -> LLMInterface:\n        \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n        additional_kwargs = {}\n        if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n            additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n                \"tool_calls\"\n            ]\n\n        if resp[\"choices\"][0].get(\"logprobs\") is None:\n            logprobs = []\n        else:\n            all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n            logprobs = (\n                [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n            )\n\n        output = LLMInterface(\n            candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n            content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n            total_tokens=resp[\"usage\"][\"total_tokens\"],\n            prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n            completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n            additional_kwargs=additional_kwargs,\n            messages=[\n                AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n                for _ in resp[\"choices\"]\n            ],\n            logprobs=logprobs,\n        )\n\n        return output\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        raise NotImplementedError\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        raise NotImplementedError\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> LLMInterface:\n        client = self.prepare_client(async_version=False)\n        input_messages = self.prepare_message(messages)\n        resp = self.openai_response(\n            client, messages=input_messages, stream=False, **kwargs\n        ).dict()\n        return self.prepare_output(resp)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> LLMInterface:\n        client = self.prepare_client(async_version=True)\n        input_messages = self.prepare_message(messages)\n        # await the async client's coroutine first, then convert to dict\n        resp = (\n            await self.openai_response(\n                client, messages=input_messages, stream=False, **kwargs\n            )\n        ).dict()\n\n        return self.prepare_output(resp)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> Iterator[LLMInterface]:\n        client = self.prepare_client(async_version=False)\n        input_messages = self.prepare_message(messages)\n        resp = self.openai_response(\n            client, messages=input_messages, stream=True, **kwargs\n        )\n\n        for c in resp:\n            chunk = c.dict()\n            if not chunk[\"choices\"]:\n                continue\n            if chunk[\"choices\"][0][\"delta\"][\"content\"] is not None:\n                if chunk[\"choices\"][0].get(\"logprobs\") is None:\n                    logprobs = []\n                else:\n                    logprobs = [\n                        logprob[\"logprob\"]\n                        for logprob in chunk[\"choices\"][0][\"logprobs\"].get(\n                            \"content\", []\n                        )\n                    ]\n\n                yield LLMInterface(\n                    content=chunk[\"choices\"][0][\"delta\"][\"content\"], logprobs=logprobs\n                )\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        client = self.prepare_client(async_version=True)\n        input_messages = self.prepare_message(messages)\n        # the async client returns a coroutine resolving to an async stream\n        resp = await self.openai_response(\n            client, messages=input_messages, stream=True, **kwargs\n        )\n\n        async for chunk in resp:\n            if not chunk.choices:\n                continue\n            if chunk.choices[0].delta.content is not None:\n                yield LLMInterface(content=chunk.choices[0].delta.content)\n
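    Concretely, a subclass only supplies the client and the request call. A hypothetical wrapper for any OpenAI-compatible server might look like the following sketch (the `Param` import path and the server URL are assumptions):

    from kotaemon.base import Param                      # assumed import path for Param
    from kotaemon.llms.chats.openai import BaseChatOpenAI

    class CompatServerChat(BaseChatOpenAI):              # hypothetical subclass
        base_url: str = Param("http://localhost:8000/v1", help="OpenAI-compatible base URL")
        model: str = Param(help="Model name", required=True)

        def prepare_client(self, async_version: bool = False):
            # build the sync or async openai client from the declared params
            params = {
                "api_key": self.api_key,
                "base_url": self.base_url,
                "timeout": self.timeout,
                "max_retries": self.max_retries_,
            }
            if async_version:
                from openai import AsyncOpenAI
                return AsyncOpenAI(**params)
            from openai import OpenAI
            return OpenAI(**params)

        def openai_response(self, client, **kwargs):
            # forward only the params that are set, then issue the request
            params = {"model": self.model, "temperature": self.temperature}
            params = {k: v for k, v in params.items() if v is not None}
            params.update(kwargs)
            return client.chat.completions.create(**params)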
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_message","title":"prepare_message","text":"
    prepare_message(messages)\n

    Prepare the message into OpenAI format

    Returns:

    Type Description list[ChatCompletionMessageParam]

    list[dict]: List of messages in OpenAI format

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_message(\n    self, messages: str | BaseMessage | list[BaseMessage]\n) -> list[\"ChatCompletionMessageParam\"]:\n    \"\"\"Prepare the message into OpenAI format\n\n    Returns:\n        list[dict]: List of messages in OpenAI format\n    \"\"\"\n    input_: list[BaseMessage] = []\n    output_: list[\"ChatCompletionMessageParam\"] = []\n\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    for message in input_:\n        output_.append(message.to_openai_format())\n\n    return output_\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_output","title":"prepare_output","text":"
    prepare_output(resp)\n

    Convert the OpenAI response into LLMInterface

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_output(self, resp: dict) -> LLMInterface:\n    \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n    additional_kwargs = {}\n    if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n        additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n            \"tool_calls\"\n        ]\n\n    if resp[\"choices\"][0].get(\"logprobs\") is None:\n        logprobs = []\n    else:\n        all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n        logprobs = (\n            [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n        )\n\n    output = LLMInterface(\n        candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n        content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n        total_tokens=resp[\"usage\"][\"total_tokens\"],\n        prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n        completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n        additional_kwargs=additional_kwargs,\n        messages=[\n            AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n            for _ in resp[\"choices\"]\n        ],\n        logprobs=logprobs,\n    )\n\n    return output\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
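    A usage sketch; the endpoint, deployment, and version are placeholders, and `api_version` must match a version your Azure resource supports:

    from kotaemon.llms import AzureChatOpenAI  # assumed export, mirroring ChatOpenAI

    llm = AzureChatOpenAI(
        azure_endpoint="https://<your-resource>.openai.azure.com/",
        azure_deployment="my-gpt-4o",   # the *deployment* name, not the base model name
        api_version="2024-02-01",
        api_key="...",
    )
    print(llm.invoke("Hello from Azure!").content)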
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/completions/","title":"Completions","text":""},{"location":"reference/llms/completions/#llms.completions.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
    "},{"location":"reference/llms/completions/#llms.completions.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
    "},{"location":"reference/llms/completions/#llms.completions.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
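    Note that these completion wrappers default `model_name` to "text-davinci-003", which OpenAI has since retired, so passing a currently served completions model is advisable. A usage sketch under that assumption (import path and callable interface assumed from kotaemon's component conventions):

    from kotaemon.llms import OpenAI  # assumed export of the completion wrapper

    llm = OpenAI(
        openai_api_key="...",
        model_name="gpt-3.5-turbo-instruct",  # a model that still serves the completions API
        temperature=0.0,
        max_tokens=64,
    )
    # calling the component runs the model (assumption: components are
    # callable like the chat wrappers; otherwise use an explicit run method)
    output = llm("Q: What is 2 + 2?\nA:")
    print(output.text)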
    "},{"location":"reference/llms/completions/base/","title":"Base","text":""},{"location":"reference/llms/completions/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
    "},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
    "},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
    "},{"location":"reference/llms/prompts/","title":"Prompts","text":""},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

    Parameters:

    Name Type Description Default template PromptTemplate

    The prompt template.

    required **kwargs

    Any additional keyword arguments that will be used to populate the given template.

    {} Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, 
(int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
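A minimal usage sketch (the import path kotaemon.llms.prompts is an assumption based on this page's module layout): construct the component from a plain template string and let run populate it.

from kotaemon.llms.prompts import BasePromptComponent  # import path assumed

# A plain string is promoted to a PromptTemplate by the template__ Param above.
prompt = BasePromptComponent(template="Hello, {name}!")

# run() stores the placeholder values, checks none are missing, populates the
# template, and wraps the result in a Document.
doc = prompt.run(name="World")
print(doc.text)                # Hello, World!
print(doc.metadata["origin"])  # PromptComponent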
    "},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

Parameters:

kwargs (dict): A dictionary with the attribute names as keys and the new values as values. Default: {}

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
    "},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

Run the prompt component with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments used to populate the template. Default: {}

Returns:

Document: the populated template text wrapped in a Document, with metadata {"origin": "PromptComponent"}.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. 
If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

This function checks that every placeholder in the template has been provided among the given keyword arguments. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

Check for redundant keyword arguments.

This function warns with a UserWarning if any provided keyword arguments do not correspond to placeholders in the template; no error is raised.

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

str: The populated template.

Raises:

ValueError: If any placeholder in the template is left without a value.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
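A short sketch of the strict behaviour (the import path kotaemon.llms.prompts is assumed): every placeholder must receive a value, otherwise check_missing_kwargs raises.

from kotaemon.llms.prompts import PromptTemplate  # import path assumed

tpl = PromptTemplate("Summarize {doc} in {n} sentences.")
print(tpl.populate(doc="the quarterly report", n=2))
# Summarize the quarterly report in 2 sentences.

# Omitting a placeholder raises:
# tpl.populate(doc="the quarterly report")
# ValueError: Missing keys in template: n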
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
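A sketch of the partial behaviour, continuing the example above: unset placeholders survive in the output, so the result can be populated again later.

tpl = PromptTemplate("{role}: answer {question}")
print(tpl.partial_populate(role="system"))
# system: answer {question:}
# The unset placeholder is kept; the empty format spec is preserved verbatim,
# which is why a trailing ":" appears inside the braces.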
    "},{"location":"reference/llms/prompts/base/","title":"Base","text":""},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

Parameters:

template (str | PromptTemplate): The prompt template. Required.

**kwargs: Any additional keyword arguments that will be used to populate the given template. Default: {}

Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, 
(int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
    "},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

Parameters:

kwargs (dict): A dictionary with the attribute names as keys and the new values as values. Default: {}

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
    "},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

Run the prompt component with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments used to populate the template. Default: {}

Returns:

Document: the populated template text wrapped in a Document, with metadata {"origin": "PromptComponent"}.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
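A short sketch of splitting the work between set_value and run (import path assumed): values fixed with set_value persist as attributes, so run only needs the remaining placeholders.

from kotaemon.llms.prompts import BasePromptComponent  # import path assumed

prompt = BasePromptComponent(template="Translate to {lang}: {text}")
prompt.set_value(lang="French")        # stored on the component, reused across calls
doc = prompt.run(text="good morning")  # only the still-missing placeholder is supplied
print(doc.text)                        # Translate to French: good morning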
    "},{"location":"reference/llms/prompts/template/","title":"Template","text":""},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. 
If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
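Templates can also be concatenated with +, which (as the source above shows) joins the two template strings with a newline. A quick sketch, import path assumed:

from kotaemon.llms.prompts import PromptTemplate  # import path assumed

system = PromptTemplate("You are a {persona}.")
task = PromptTemplate("Answer the question: {question}")

combined = system + task              # "You are a {persona}.\nAnswer the question: {question}"
print(sorted(combined.placeholders))  # ['persona', 'question']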
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

This function checks that every placeholder in the template has been provided among the given keyword arguments. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

Check for redundant keyword arguments.

This function warns with a UserWarning if any provided keyword arguments do not correspond to placeholders in the template; no error is raised.

Returns:

None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

str: The populated template.

Raises:

ValueError: If any placeholder in the template is left without a value.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
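Because population goes through string.Formatter, format specs and conversions written in the template are honoured. A sketch:

tpl = PromptTemplate("Score: {score:.2f} for {name!r}")
print(tpl.populate(score=3.14159, name="model-a"))
# Score: 3.14 for 'model-a'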
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

Parameters:

**kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
    "},{"location":"reference/loaders/","title":"Loaders","text":""},{"location":"reference/loaders/#loaders.AdobeReader","title":"AdobeReader","text":"

    Bases: BaseReader

Read PDFs using Adobe's PDF Services. Able to extract text, tables, and figures with high accuracy.

    Example
>> from kotaemon.loaders import AdobeReader
>> reader = AdobeReader()
>> documents = reader.load_data("path/to/pdf")

Args:

endpoint: URL to the Vision Language Model endpoint. If not provided, the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT is used.

max_figures_to_caption: an int that decides how many figures will be captioned. The rest are ignored (indexed without captions).
    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
    class AdobeReader(BaseReader):\n    \"\"\"Read PDF using the Adobe's PDF Services.\n    Be able to extract text, table, and figure with high accuracy\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import AdobeReader\n        >> reader = AdobeReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n    Args:\n        endpoint: URL to the Vision Language Model endpoint. If not provided,\n        will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n        max_figures_to_caption: an int decides how many figured will be captioned.\n        The rest will be ignored (are indexed without captions).\n    \"\"\"\n\n    def __init__(\n        self,\n        vlm_endpoint: Optional[str] = None,\n        max_figures_to_caption: int = 100,\n        *args: Any,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params\"\"\"\n        super().__init__(*args)\n        self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n        self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n        self.max_figures_to_caption = max_figures_to_caption\n\n    def load_data(\n        self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data by calling to the Adobe's API\n\n        Args:\n            file (Path): Path to the PDF file\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file,\n                includes 3 types: text, table, and image\n\n        \"\"\"\n        from .utils.adobe import (\n            generate_figure_captions,\n            load_json,\n            parse_figure_paths,\n            parse_table_paths,\n            request_adobe_service,\n        )\n\n        filename = file.name\n        filepath = str(Path(file).resolve())\n        output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n        results_path = os.path.join(output_path, \"structuredData.json\")\n\n        if not os.path.exists(results_path):\n            logger.exception(\"Fail to parse the document.\")\n            return []\n\n        data = load_json(results_path)\n\n        texts = defaultdict(list)\n        tables = []\n        figures = []\n\n        elements = data[\"elements\"]\n        for item_id, item in enumerate(elements):\n            page_number = item.get(\"Page\", -1) + 1\n            item_path = item[\"Path\"]\n            item_text = item.get(\"Text\", \"\")\n\n            file_paths = [\n                Path(output_path) / path for path in item.get(\"filePaths\", [])\n            ]\n            prev_item = elements[item_id - 1]\n            title = prev_item.get(\"Text\", \"\")\n\n            if re.search(self.table_regex, item_path):\n                table_content = parse_table_paths(file_paths)\n                if not table_content:\n                    continue\n                table_caption = (\n                    table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                    + f\"\\n(Table in Page {page_number}. {title})\"\n                )\n                tables.append((page_number, table_content, table_caption))\n\n            elif re.search(self.figure_regex, item_path):\n                figure_caption = (\n                    item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n                )\n                figure_content = parse_figure_paths(file_paths)\n                if not figure_content:\n                    continue\n                figures.append([page_number, figure_content, figure_caption])\n\n            else:\n                if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                    texts[page_number].append(item_text)\n\n        # get figure caption using GPT-4V\n        figure_captions = generate_figure_captions(\n            self.vlm_endpoint,\n            [item[1] for item in figures],\n            self.max_figures_to_caption,\n        )\n        for item, caption in zip(figures, figure_captions):\n            # update figure caption\n            item[2] += \" \" + caption\n\n        # Wrap elements with Document\n        documents = []\n\n        # join plain text elements\n        for page_number, txts in texts.items():\n            documents.append(\n                Document(\n                    text=\"\\n\".join(txts),\n                    metadata={\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                )\n            )\n\n        # table elements\n        for page_number, table_content, table_caption in tables:\n            documents.append(\n                Document(\n                    text=table_content,\n                    metadata={\n                        \"table_origin\": table_content,\n                        \"type\": \"table\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        # figure elements\n        for page_number, figure_content, figure_caption in figures:\n            documents.append(\n                Document(\n                    text=figure_caption,\n                    metadata={\n                        \"image_origin\": figure_content,\n                        \"type\": \"image\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n        return documents\n
    "},{"location":"reference/loaders/#loaders.AdobeReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, **kwargs)\n

Load data by calling Adobe's API.

Parameters:

file (Path): Path to the PDF file. Required.

Returns:

List[Document]: list of documents extracted from the PDF file, including three types: text, table, and image.

    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
    def load_data(\n    self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data by calling to the Adobe's API\n\n    Args:\n        file (Path): Path to the PDF file\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file,\n            includes 3 types: text, table, and image\n\n    \"\"\"\n    from .utils.adobe import (\n        generate_figure_captions,\n        load_json,\n        parse_figure_paths,\n        parse_table_paths,\n        request_adobe_service,\n    )\n\n    filename = file.name\n    filepath = str(Path(file).resolve())\n    output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n    results_path = os.path.join(output_path, \"structuredData.json\")\n\n    if not os.path.exists(results_path):\n        logger.exception(\"Fail to parse the document.\")\n        return []\n\n    data = load_json(results_path)\n\n    texts = defaultdict(list)\n    tables = []\n    figures = []\n\n    elements = data[\"elements\"]\n    for item_id, item in enumerate(elements):\n        page_number = item.get(\"Page\", -1) + 1\n        item_path = item[\"Path\"]\n        item_text = item.get(\"Text\", \"\")\n\n        file_paths = [\n            Path(output_path) / path for path in item.get(\"filePaths\", [])\n        ]\n        prev_item = elements[item_id - 1]\n        title = prev_item.get(\"Text\", \"\")\n\n        if re.search(self.table_regex, item_path):\n            table_content = parse_table_paths(file_paths)\n            if not table_content:\n                continue\n            table_caption = (\n                table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                + f\"\\n(Table in Page {page_number}. {title})\"\n            )\n            tables.append((page_number, table_content, table_caption))\n\n        elif re.search(self.figure_regex, item_path):\n            figure_caption = (\n                item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n            )\n            figure_content = parse_figure_paths(file_paths)\n            if not figure_content:\n                continue\n            figures.append([page_number, figure_content, figure_caption])\n\n        else:\n            if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                texts[page_number].append(item_text)\n\n    # get figure caption using GPT-4V\n    figure_captions = generate_figure_captions(\n        self.vlm_endpoint,\n        [item[1] for item in figures],\n        self.max_figures_to_caption,\n    )\n    for item, caption in zip(figures, figure_captions):\n        # update figure caption\n        item[2] += \" \" + caption\n\n    # Wrap elements with Document\n    documents = []\n\n    # join plain text elements\n    for page_number, txts in texts.items():\n        documents.append(\n            Document(\n                text=\"\\n\".join(txts),\n                metadata={\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n            )\n        )\n\n    # table elements\n    for page_number, table_content, table_caption in tables:\n        documents.append(\n            Document(\n                text=table_content,\n                metadata={\n                    \"table_origin\": table_content,\n                    \"type\": \"table\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n\n    # figure elements\n    for page_number, figure_content, figure_caption in figures:\n        documents.append(\n            Document(\n                text=figure_caption,\n                metadata={\n                    \"image_origin\": figure_content,\n                    \"type\": \"image\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n    return documents\n
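A usage sketch ("report.pdf" is a placeholder path, and Adobe PDF Services credentials must already be configured for the request to succeed):

from pathlib import Path
from kotaemon.loaders import AdobeReader

reader = AdobeReader()
docs = reader.load_data(Path("report.pdf"))  # placeholder file

# Split the result by the `type` metadata the reader attaches.
tables = [d for d in docs if d.metadata.get("type") == "table"]
images = [d for d in docs if d.metadata.get("type") == "image"]
texts = [d for d in docs if d.metadata.get("type") not in ("table", "image")]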
    "},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":"

    Bases: BaseReader

Utilize Azure AI Document Intelligence to parse documents

As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx and html.

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    class AzureAIDocumentIntelligenceLoader(BaseReader):\n    \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n    heif, docx, xlsx, pptx and html.\n    \"\"\"\n\n    _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n    endpoint: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n        help=\"Endpoint of Azure AI Document Intelligence\",\n    )\n    credential: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n        help=\"Credential of Azure AI Document Intelligence\",\n    )\n    model: str = Param(\n        \"prebuilt-layout\",\n        help=(\n            \"Model to use for document analysis. Default is prebuilt-layout. \"\n            \"As of April 24, you can view the supported models [here]\"\n            \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n            \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n            \"#model-analysis-features)\"\n        ),\n    )\n    output_content_format: str = Param(\n        \"markdown\",\n        help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n    )\n    vlm_endpoint: str = Param(\n        help=(\n            \"Default VLM endpoint for figure captioning. If not provided, will not \"\n            \"caption the figures\"\n        )\n    )\n    figure_friendly_filetypes: list[str] = Param(\n        [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n        help=(\n            \"File types that we can reliably open and extract figures. \"\n            \"For files like .docx or .html, the visual layout may be different \"\n            \"when viewed from different tools, hence we cannot use Azure DI \"\n            \"location to extract figures.\"\n        ),\n    )\n    cache_dir: str = Param(\n        None,\n        help=\"Directory to cache the downloaded files. 
Default is None\",\n    )\n\n    @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n    def client_(self):\n        try:\n            from azure.ai.documentintelligence import DocumentIntelligenceClient\n            from azure.core.credentials import AzureKeyCredential\n        except ImportError:\n            raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n        return DocumentIntelligenceClient(\n            self.endpoint, AzureKeyCredential(self.credential)\n        )\n\n    def run(\n        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n        metadata = extra_info or {}\n        file_name = Path(file_path)\n        with open(file_path, \"rb\") as fi:\n            poller = self.client_.begin_analyze_document(\n                self.model,\n                analyze_request=fi,\n                content_type=\"application/octet-stream\",\n                output_content_format=self.output_content_format,\n            )\n            result = poller.result()\n\n        # the total text content of the document in `output_content_format` format\n        text_content = result.content\n        removed_spans: list[dict] = []\n\n        # extract the figures\n        figures = []\n        for figure_desc in result.get(\"figures\", []):\n            if not self.vlm_endpoint:\n                continue\n            if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n                continue\n\n            # read & crop the image\n            page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n            page_width = result.pages[page_number - 1][\"width\"]\n            page_height = result.pages[page_number - 1][\"height\"]\n            polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n            xs = [polygon[i] for i in range(0, len(polygon), 2)]\n            ys = [polygon[i] for i in range(1, len(polygon), 2)]\n            bbox = [\n                min(xs) / page_width,\n                min(ys) / page_height,\n                max(xs) / page_width,\n                max(ys) / page_height,\n            ]\n            img = crop_image(file_path, bbox, page_number - 1)\n\n            # convert the image into base64\n            img_bytes = BytesIO()\n            img.save(img_bytes, format=\"PNG\")\n            img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n            img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n            # caption the image\n            caption = generate_single_figure_caption(\n                figure=img_base64, vlm_endpoint=self.vlm_endpoint\n            )\n\n            # store the image into document\n            figure_metadata = {\n                \"image_origin\": img_base64,\n                \"type\": \"image\",\n                \"page_label\": page_number,\n            }\n            figure_metadata.update(metadata)\n\n            figures.append(\n                Document(\n                    text=caption,\n                    metadata=figure_metadata,\n                )\n            )\n            removed_spans += figure_desc[\"spans\"]\n\n        # extract the tables\n        tables = []\n        for table_desc in result.get(\"tables\", 
[]):\n            if not table_desc[\"spans\"]:\n                continue\n\n            # convert the tables into markdown format\n            boundingRegions = table_desc[\"boundingRegions\"]\n            if boundingRegions:\n                page_number = boundingRegions[0][\"pageNumber\"]\n            else:\n                page_number = 1\n\n            # store the tables into document\n            offset = table_desc[\"spans\"][0][\"offset\"]\n            length = table_desc[\"spans\"][0][\"length\"]\n            table_metadata = {\n                \"type\": \"table\",\n                \"page_label\": page_number,\n                \"table_origin\": text_content[offset : offset + length],\n            }\n            table_metadata.update(metadata)\n\n            tables.append(\n                Document(\n                    text=text_content[offset : offset + length],\n                    metadata=table_metadata,\n                )\n            )\n            removed_spans += table_desc[\"spans\"]\n        # save the text content into markdown format\n        if self.cache_dir is not None:\n            with open(\n                Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n            ) as f:\n                f.write(text_content)\n\n        removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n        for span in removed_spans:\n            text_content = (\n                text_content[: span[\"offset\"]]\n                + text_content[span[\"offset\"] + span[\"length\"] :]\n            )\n\n        return [Document(content=text_content, metadata=metadata)] + figures + tables\n
    "},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Extract the input file, allowing multi-modal extraction

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n    metadata = extra_info or {}\n    file_name = Path(file_path)\n    with open(file_path, \"rb\") as fi:\n        poller = self.client_.begin_analyze_document(\n            self.model,\n            analyze_request=fi,\n            content_type=\"application/octet-stream\",\n            output_content_format=self.output_content_format,\n        )\n        result = poller.result()\n\n    # the total text content of the document in `output_content_format` format\n    text_content = result.content\n    removed_spans: list[dict] = []\n\n    # extract the figures\n    figures = []\n    for figure_desc in result.get(\"figures\", []):\n        if not self.vlm_endpoint:\n            continue\n        if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n            continue\n\n        # read & crop the image\n        page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n        page_width = result.pages[page_number - 1][\"width\"]\n        page_height = result.pages[page_number - 1][\"height\"]\n        polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n        xs = [polygon[i] for i in range(0, len(polygon), 2)]\n        ys = [polygon[i] for i in range(1, len(polygon), 2)]\n        bbox = [\n            min(xs) / page_width,\n            min(ys) / page_height,\n            max(xs) / page_width,\n            max(ys) / page_height,\n        ]\n        img = crop_image(file_path, bbox, page_number - 1)\n\n        # convert the image into base64\n        img_bytes = BytesIO()\n        img.save(img_bytes, format=\"PNG\")\n        img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n        img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n        # caption the image\n        caption = generate_single_figure_caption(\n            figure=img_base64, vlm_endpoint=self.vlm_endpoint\n        )\n\n        # store the image into document\n        figure_metadata = {\n            \"image_origin\": img_base64,\n            \"type\": \"image\",\n            \"page_label\": page_number,\n        }\n        figure_metadata.update(metadata)\n\n        figures.append(\n            Document(\n                text=caption,\n                metadata=figure_metadata,\n            )\n        )\n        removed_spans += figure_desc[\"spans\"]\n\n    # extract the tables\n    tables = []\n    for table_desc in result.get(\"tables\", []):\n        if not table_desc[\"spans\"]:\n            continue\n\n        # convert the tables into markdown format\n        boundingRegions = table_desc[\"boundingRegions\"]\n        if boundingRegions:\n            page_number = boundingRegions[0][\"pageNumber\"]\n        else:\n            page_number = 1\n\n        # store the tables into document\n        offset = table_desc[\"spans\"][0][\"offset\"]\n        length = table_desc[\"spans\"][0][\"length\"]\n        table_metadata = {\n            \"type\": \"table\",\n            \"page_label\": page_number,\n            \"table_origin\": text_content[offset : offset + length],\n        }\n        table_metadata.update(metadata)\n\n        tables.append(\n            Document(\n                text=text_content[offset : offset + length],\n                metadata=table_metadata,\n            )\n        )\n        removed_spans += table_desc[\"spans\"]\n    # save the text content into markdown 
format\n    if self.cache_dir is not None:\n        with open(\n            Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n        ) as f:\n            f.write(text_content)\n\n    removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n    for span in removed_spans:\n        text_content = (\n            text_content[: span[\"offset\"]]\n            + text_content[span[\"offset\"] + span[\"length\"] :]\n        )\n\n    return [Document(content=text_content, metadata=metadata)] + figures + tables\n
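A configuration sketch (the endpoint, key, and file name are placeholders; as the Params above show, the endpoint and credential can also come from the AZUREAI_DOCUMENT_INTELLIGENT_* environment variables):

from kotaemon.loaders import AzureAIDocumentIntelligenceLoader

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<your-resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<your-api-key>",                                      # placeholder
    output_content_format="markdown",
)
# run() delegates to load_data(); returns the main text Document plus any
# figure and table Documents extracted along the way.
docs = loader.run("contract.pdf")  # placeholder file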
    "},{"location":"reference/loaders/#loaders.AutoReader","title":"AutoReader","text":"

    Bases: BaseReader

    General auto reader for a variety of files. (based on llama-hub)

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class AutoReader(BaseReader):\n    \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n    def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n        \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n        if isinstance(reader_type, str):\n            from llama_index.core import download_loader\n\n            self._reader = download_loader(reader_type)()\n        else:\n            self._reader = reader_type()\n        super().__init__()\n\n    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(file=file, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        return self.load_data(file=file, **kwargs)\n
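A usage sketch; "UnstructuredReader" is an illustrative llama-hub identifier, and any name accepted by llama_index's download_loader (or a reader class) should work the same way:

from kotaemon.loaders import AutoReader

reader = AutoReader("UnstructuredReader")  # illustrative reader name
docs = reader.run("notes.docx")            # placeholder file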
    "},{"location":"reference/loaders/#loaders.BaseReader","title":"BaseReader","text":"

    Bases: BaseComponent

    The base class for all readers

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class BaseReader(BaseComponent):\n    \"\"\"The base class for all readers\"\"\"\n\n    ...\n
    "},{"location":"reference/loaders/#loaders.DirectoryReader","title":"DirectoryReader","text":"

    Bases: LIReaderMixin, BaseReader

    Wrap around llama-index SimpleDirectoryReader

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_dir | str | Path to the directory. | required |
| input_files | List | List of file paths to read (Optional; overrides input_dir, exclude) | required |
| exclude | List | glob of python file paths to exclude (Optional) | required |
| exclude_hidden | bool | Whether to exclude hidden files (dotfiles). | required |
| encoding | str | Encoding of the files. Default is utf-8. | required |
| errors | str | how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open | required |
| recursive | bool | Whether to recursively search in subdirectories. False by default. | required |
| filename_as_id | bool | Whether to use the filename as the document id. False by default. | required |
| required_exts | Optional[List[str]] | List of required extensions. Default is None. | required |
| file_extractor | Optional[Dict[str, BaseReader]] | A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS. | required |
| num_files_limit | Optional[int] | Maximum number of files to read. Default is None. | required |
| file_metadata | Optional[Callable[str, Dict]] | A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. | required |

Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
    class DirectoryReader(LIReaderMixin, BaseReader):\n    \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n    Args:\n        input_dir (str): Path to the directory.\n        input_files (List): List of file paths to read\n            (Optional; overrides input_dir, exclude)\n        exclude (List): glob of python file paths to exclude (Optional)\n        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n        encoding (str): Encoding of the files.\n            Default is utf-8.\n        errors (str): how encoding and decoding errors are to be handled,\n              see https://docs.python.org/3/library/functions.html#open\n        recursive (bool): Whether to recursively search in subdirectories.\n            False by default.\n        filename_as_id (bool): Whether to use the filename as the document id.\n            False by default.\n        required_exts (Optional[List[str]]): List of required extensions.\n            Default is None.\n        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n            extension to a BaseReader class that specifies how to convert that file\n            to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n        num_files_limit (Optional[int]): Maximum number of files to read.\n            Default is None.\n        file_metadata (Optional[Callable[str, Dict]]): A function that takes\n            in a filename and returns a Dict of metadata for the Document.\n            Default is None.\n    \"\"\"\n\n    input_dir: Optional[str] = None\n    input_files: Optional[List] = None\n    exclude: Optional[List] = None\n    exclude_hidden: bool = True\n    errors: str = \"ignore\"\n    recursive: bool = False\n    encoding: str = \"utf-8\"\n    filename_as_id: bool = False\n    required_exts: Optional[list[str]] = None\n    file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n    num_files_limit: Optional[int] = None\n    file_metadata: Optional[Callable[[str], dict]] = None\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        from llama_index.core import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n
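A sketch of scanning a folder, assuming the LIReaderMixin base forwards run() to the wrapped SimpleDirectoryReader (the mixin's forwarding behavior is not shown on this page; the directory and extensions are illustrative):

```python
from kotaemon.loaders import DirectoryReader

# Recursively gather .txt and .md files under ./data (hypothetical path).
reader = DirectoryReader(
    input_dir="data",
    recursive=True,
    required_exts=[".txt", ".md"],
)
docs = reader.run()
```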
    "},{"location":"reference/loaders/#loaders.DocxReader","title":"DocxReader","text":"

    Bases: BaseReader

Read Docx files, respecting tables, using the python-docx library

Reader behavior:

- All paragraphs are extracted as a Document.
- Each table is extracted as a Document, rendered as a CSV string.
- The output is a list of Documents, concatenating the above (tables + paragraphs).

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    class DocxReader(BaseReader):\n    \"\"\"Read Docx files that respect table, using python-docx library\n\n    Reader behavior:\n        - All paragraphs are extracted as a Document\n        - Each table is extracted as a Document, rendered as a CSV string\n        - The output is a list of Documents, concatenating the above\n        (tables + paragraphs)\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        try:\n            import docx  # noqa\n        except ImportError:\n            raise ImportError(\n                \"docx is not installed. \"\n                \"Please install it using `pip install python-docx`\"\n            )\n\n    def _load_single_table(self, table) -> List[List[str]]:\n        \"\"\"Extract content from tables. Return a list of columns: list[str]\n        Some merged cells will share duplicated content.\n        \"\"\"\n        n_row = len(table.rows)\n        n_col = len(table.columns)\n\n        arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n        for i, row in enumerate(table.rows):\n            for j, cell in enumerate(row.cells):\n                arrays[j][i] = cell.text\n\n        return arrays\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using Docx reader\n\n        Args:\n            file_path (Path): Path to .docx file\n\n        Returns:\n            List[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import docx\n\n        file_path = Path(file_path).resolve()\n\n        doc = docx.Document(str(file_path))\n        all_text = \"\\n\".join(\n            [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n        )\n        pages = [all_text]  # 1 page only\n\n        tables = []\n        for t in doc.tables:\n            # return list of columns: list of string\n            arrays = self._load_single_table(t)\n\n            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=table.to_csv(\n                    index=False\n                ).strip(),  # strip_special_chars_markdown()\n                metadata={\n                    \"table_origin\": table.to_csv(index=False),\n                    \"type\": \"table\",\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for table in tables  # page_id\n        ]\n\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text.strip(),\n                    metadata={\"page_label\": 1, **extra_info},\n                )\n                for _, non_table_text in enumerate(pages)\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/#loaders.DocxReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Docx reader

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Path | Path to .docx file | required |

    Returns:

| Type | Description |
| --- | --- |
| List[Document] | List of documents extracted from the Docx file. |

    Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using Docx reader\n\n    Args:\n        file_path (Path): Path to .docx file\n\n    Returns:\n        List[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import docx\n\n    file_path = Path(file_path).resolve()\n\n    doc = docx.Document(str(file_path))\n    all_text = \"\\n\".join(\n        [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n    )\n    pages = [all_text]  # 1 page only\n\n    tables = []\n    for t in doc.tables:\n        # return list of columns: list of string\n        arrays = self._load_single_table(t)\n\n        tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=table.to_csv(\n                index=False\n            ).strip(),  # strip_special_chars_markdown()\n            metadata={\n                \"table_origin\": table.to_csv(index=False),\n                \"type\": \"table\",\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for table in tables  # page_id\n    ]\n\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text.strip(),\n                metadata={\"page_label\": 1, **extra_info},\n            )\n            for _, non_table_text in enumerate(pages)\n        ]\n    )\n\n    return documents\n
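A short usage sketch (file name is hypothetical); table Documents can be told apart from the paragraph Document by the "type" metadata set in the source above:

```python
from pathlib import Path

from kotaemon.loaders import DocxReader

reader = DocxReader()
docs = reader.load_data(Path("report.docx"))  # hypothetical file

# Tables arrive as CSV-rendered Documents with metadata["type"] == "table";
# the remaining paragraphs are joined into a single text Document.
tables = [d for d in docs if d.metadata.get("type") == "table"]
```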
    "},{"location":"reference/loaders/#loaders.ExcelReader","title":"ExcelReader","text":"

    Bases: BaseReader

Spreadsheet reader respecting multiple worksheets

Parses Excel workbooks with pandas' read_excel function. If special parameters are required, use the pandas_config dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class ExcelReader(BaseReader):\n    r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = True,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        # clean up input\n        file = Path(file)\n        extra_info = extra_info or {}\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        output = []\n\n        for idx, key in enumerate(sheet_names):\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].astype(\"object\")\n            dfs[key].fillna(\"\", inplace=True)\n\n            rows = dfs[key].values.astype(str).tolist()\n            content = self._row_joiner.join(\n                self._col_joiner.join(row).strip() for row in rows\n            ).strip()\n            if include_sheetname:\n                content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n            metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n            output.append(Document(text=content, metadata=metadata))\n\n        return output\n
    "},{"location":"reference/loaders/#loaders.ExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=True,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the Excel file and extract values from each worksheet.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file | Path | The path to the Excel file to read. | required |
| include_sheetname | bool | Whether to include the sheet name in the output. | True |
| sheet_name | Union[str, int, None] | The specific sheet to read from, default is None which reads all sheets. | None |

    Returns:

| Type | Description |
| --- | --- |
| List[Document] | A list of `Document` objects containing the values from the specified column in the Excel file. |

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = True,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    # clean up input\n    file = Path(file)\n    extra_info = extra_info or {}\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    output = []\n\n    for idx, key in enumerate(sheet_names):\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].astype(\"object\")\n        dfs[key].fillna(\"\", inplace=True)\n\n        rows = dfs[key].values.astype(str).tolist()\n        content = self._row_joiner.join(\n            self._col_joiner.join(row).strip() for row in rows\n        ).strip()\n        if include_sheetname:\n            content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n        metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n        output.append(Document(text=content, metadata=metadata))\n\n    return output\n
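A usage sketch (workbook name is hypothetical); with include_sheetname=True each per-sheet Document is prefixed with its sheet name:

```python
from pathlib import Path

from kotaemon.loaders import ExcelReader

reader = ExcelReader()  # row_joiner="\n" and col_joiner=" " by default
docs = reader.load_data(Path("metrics.xlsx"), include_sheetname=True)

# One Document per worksheet, with sheet name and 1-based position recorded.
for doc in docs:
    print(doc.metadata["sheet_name"], doc.metadata["page_label"])
```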
    "},{"location":"reference/loaders/#loaders.PandasExcelReader","title":"PandasExcelReader","text":"

    Bases: BaseReader

Pandas-based Excel parser.

Parses Excel workbooks with pandas' read_excel function. If special parameters are required, use the pandas_config dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class PandasExcelReader(BaseReader):\n    r\"\"\"Pandas-based CSV parser.\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = False,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n        import itertools\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        df_sheets = []\n\n        for key in sheet_names:\n            sheet = []\n            if include_sheetname:\n                sheet.append([key])\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key].fillna(\"\", inplace=True)\n            sheet.extend(dfs[key].values.astype(str).tolist())\n            df_sheets.append(sheet)\n\n        text_list = list(\n            itertools.chain.from_iterable(df_sheets)\n        )  # flatten list of lists\n\n        output = [\n            Document(\n                text=self._row_joiner.join(\n                    self._col_joiner.join(sublist) for sublist in text_list\n                ),\n                metadata=extra_info or {},\n            )\n        ]\n\n        return output\n
    "},{"location":"reference/loaders/#loaders.PandasExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=False,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the Excel file and extract values from each worksheet.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file | Path | The path to the Excel file to read. | required |
| include_sheetname | bool | Whether to include the sheet name in the output. | False |
| sheet_name | Union[str, int, None] | The specific sheet to read from, default is None which reads all sheets. | None |

    Returns:

| Type | Description |
| --- | --- |
| List[Document] | A list of `Document` objects containing the values from the specified column in the Excel file. |

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = False,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n    import itertools\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    df_sheets = []\n\n    for key in sheet_names:\n        sheet = []\n        if include_sheetname:\n            sheet.append([key])\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key].fillna(\"\", inplace=True)\n        sheet.extend(dfs[key].values.astype(str).tolist())\n        df_sheets.append(sheet)\n\n    text_list = list(\n        itertools.chain.from_iterable(df_sheets)\n    )  # flatten list of lists\n\n    output = [\n        Document(\n            text=self._row_joiner.join(\n                self._col_joiner.join(sublist) for sublist in text_list\n            ),\n            metadata=extra_info or {},\n        )\n    ]\n\n    return output\n
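For contrast with ExcelReader above, a sketch showing that this reader flattens all requested sheets into a single Document (file name is hypothetical):

```python
from kotaemon.loaders import PandasExcelReader

reader = PandasExcelReader()
docs = reader.load_data("metrics.xlsx", include_sheetname=True)
assert len(docs) == 1  # all sheets are joined into one Document
```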
    "},{"location":"reference/loaders/#loaders.HtmlReader","title":"HtmlReader","text":"

    Bases: BaseReader

Read HTML using html2text

Reader behavior:

- HTML is read with html2text.
- All of the text is split by page_break_pattern.
- Each page is extracted as a Document.
- The output is a list of Documents.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| page_break_pattern | str | Pattern to split the HTML into pages | None |

Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class HtmlReader(BaseReader):\n    \"\"\"Reader HTML usimg html2text\n\n    Reader behavior:\n        - HTML is read with html2text.\n        - All of the texts will be split by `page_break_pattern`\n        - Each page is extracted as a Document\n        - The output is a list of Documents\n\n    Args:\n        page_break_pattern (str): Pattern to split the HTML into pages\n    \"\"\"\n\n    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n        try:\n            import html2text  # noqa\n        except ImportError:\n            raise ImportError(\n                \"html2text is not installed. \"\n                \"Please install it using `pip install html2text`\"\n            )\n\n        self._page_break_pattern: Optional[str] = page_break_pattern\n        super().__init__()\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load data using Html reader\n\n        Args:\n            file_path: path to HTML file\n            extra_info: extra information passed to this reader during extracting data\n\n        Returns:\n            list[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import html2text\n\n        file_path = Path(file_path).resolve()\n\n        with file_path.open(\"r\") as f:\n            html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n        # read HTML\n        all_text = html2text.html2text(html_text)\n        pages = (\n            all_text.split(self._page_break_pattern)\n            if self._page_break_pattern\n            else [all_text]\n        )\n\n        extra_info = extra_info or {}\n\n        # create Document from non-table text\n        documents = [\n            Document(\n                text=page.strip(),\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, page in enumerate(pages)\n        ]\n\n        return documents\n
    "},{"location":"reference/loaders/#loaders.HtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Html reader

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Path \| str | path to HTML file | required |
| extra_info | Optional[dict] | extra information passed to this reader during extracting data | None |

    Returns:

| Type | Description |
| --- | --- |
| list[Document] | List of documents extracted from the HTML file. |

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load data using Html reader\n\n    Args:\n        file_path: path to HTML file\n        extra_info: extra information passed to this reader during extracting data\n\n    Returns:\n        list[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import html2text\n\n    file_path = Path(file_path).resolve()\n\n    with file_path.open(\"r\") as f:\n        html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n    # read HTML\n    all_text = html2text.html2text(html_text)\n    pages = (\n        all_text.split(self._page_break_pattern)\n        if self._page_break_pattern\n        else [all_text]\n    )\n\n    extra_info = extra_info or {}\n\n    # create Document from non-table text\n    documents = [\n        Document(\n            text=page.strip(),\n            metadata={\"page_label\": page_id + 1, **extra_info},\n        )\n        for page_id, page in enumerate(pages)\n    ]\n\n    return documents\n
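A usage sketch (the file name and page-break marker are illustrative; use whatever marker actually appears in your converted text):

```python
from kotaemon.loaders import HtmlReader

# Split the html2text output into per-page Documents on a custom marker.
reader = HtmlReader(page_break_pattern="PAGE_BREAK")  # hypothetical marker
docs = reader.load_data("page.html")  # hypothetical file
print([d.metadata["page_label"] for d in docs])
```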
    "},{"location":"reference/loaders/#loaders.MhtmlReader","title":"MhtmlReader","text":"

    Bases: BaseReader

    Parse MHTML files with BeautifulSoup.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class MhtmlReader(BaseReader):\n    \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n    def __init__(\n        self,\n        cache_dir: Optional[str] = getattr(\n            flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n        ),\n        open_encoding: Optional[str] = None,\n        bs_kwargs: Optional[dict] = None,\n        get_text_separator: str = \"\",\n    ) -> None:\n        \"\"\"initialize with path, and optionally, file encoding to use, and any kwargs\n        to pass to the BeautifulSoup object.\n\n        Args:\n            cache_dir: Path for markdwon format.\n            file_path: Path to file to load.\n            open_encoding: The encoding to use when opening the file.\n            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n            get_text_separator: The separator to use when getting the text\n                from the soup.\n        \"\"\"\n        try:\n            import bs4  # noqa:F401\n        except ImportError:\n            raise ImportError(\n                \"beautifulsoup4 package not found, please install it with \"\n                \"`pip install beautifulsoup4`\"\n            )\n\n        self.cache_dir = cache_dir\n        self.open_encoding = open_encoding\n        if bs_kwargs is None:\n            bs_kwargs = {\"features\": \"lxml\"}\n        self.bs_kwargs = bs_kwargs\n        self.get_text_separator = get_text_separator\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load MHTML document into document objects.\"\"\"\n\n        from bs4 import BeautifulSoup\n\n        extra_info = extra_info or {}\n        metadata: dict = extra_info\n        page = []\n        file_name = Path(file_path)\n        with open(file_path, \"r\", encoding=self.open_encoding) as f:\n            message = email.message_from_string(f.read())\n            parts = message.get_payload()\n\n            if not isinstance(parts, list):\n                parts = [message]\n\n            for part in parts:\n                if part.get_content_type() == \"text/html\":\n                    html = part.get_payload(decode=True).decode()\n\n                    soup = BeautifulSoup(html, **self.bs_kwargs)\n                    text = soup.get_text(self.get_text_separator)\n\n                    if soup.title:\n                        title = str(soup.title.string)\n                    else:\n                        title = \"\"\n\n                    metadata = {\n                        \"source\": str(file_path),\n                        \"title\": title,\n                        **extra_info,\n                    }\n                    lines = [line for line in text.split(\"\\n\") if line.strip()]\n                    text = \"\\n\\n\".join(lines)\n                    if text:\n                        page.append(text)\n        # save the page into markdown format\n        print(self.cache_dir)\n        if self.cache_dir is not None:\n            print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n            with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n                f.write(page[0])\n\n        return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
    "},{"location":"reference/loaders/#loaders.MhtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load MHTML document into document objects.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load MHTML document into document objects.\"\"\"\n\n    from bs4 import BeautifulSoup\n\n    extra_info = extra_info or {}\n    metadata: dict = extra_info\n    page = []\n    file_name = Path(file_path)\n    with open(file_path, \"r\", encoding=self.open_encoding) as f:\n        message = email.message_from_string(f.read())\n        parts = message.get_payload()\n\n        if not isinstance(parts, list):\n            parts = [message]\n\n        for part in parts:\n            if part.get_content_type() == \"text/html\":\n                html = part.get_payload(decode=True).decode()\n\n                soup = BeautifulSoup(html, **self.bs_kwargs)\n                text = soup.get_text(self.get_text_separator)\n\n                if soup.title:\n                    title = str(soup.title.string)\n                else:\n                    title = \"\"\n\n                metadata = {\n                    \"source\": str(file_path),\n                    \"title\": title,\n                    **extra_info,\n                }\n                lines = [line for line in text.split(\"\\n\") if line.strip()]\n                text = \"\\n\\n\".join(lines)\n                if text:\n                    page.append(text)\n    # save the page into markdown format\n    print(self.cache_dir)\n    if self.cache_dir is not None:\n        print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n        with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n            f.write(page[0])\n\n    return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
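A usage sketch (paths are hypothetical); when cache_dir is set, the extracted text is also written there as a markdown file:

```python
from kotaemon.loaders import MhtmlReader

reader = MhtmlReader(cache_dir="/tmp/markdown_cache")  # hypothetical cache dir
docs = reader.load_data("saved_page.mhtml")  # hypothetical file
print(docs[0].metadata.get("title"))
```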
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader","title":"MathpixPDFReader","text":"

    Bases: BaseReader

    Load PDF files using Mathpix service.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n    \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n    def __init__(\n        self,\n        processed_file_format: str = \"md\",\n        max_wait_time_seconds: int = 500,\n        should_clean_pdf: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize with a file path.\n\n        Args:\n            processed_file_format: a format of the processed file. Default is   \"mmd\".\n            max_wait_time_seconds: a maximum time to wait for the response from\n                the server. Default is 500.\n            should_clean_pdf: a flag to clean the PDF file. Default is False.\n            **kwargs: additional keyword arguments.\n        \"\"\"\n        self.mathpix_api_key = get_from_dict_or_env(\n            kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n        )\n        self.mathpix_api_id = get_from_dict_or_env(\n            kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n        )\n        self.processed_file_format = processed_file_format\n        self.max_wait_time_seconds = max_wait_time_seconds\n        self.should_clean_pdf = should_clean_pdf\n        super().__init__()\n\n    @property\n    def _mathpix_headers(self) -> Dict[str, str]:\n        return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n    @property\n    def url(self) -> str:\n        return \"https://api.mathpix.com/v3/pdf\"\n\n    @property\n    def data(self) -> dict:\n        options = {\n            \"conversion_formats\": {self.processed_file_format: True},\n            \"enable_tables_fallback\": True,\n        }\n        return {\"options_json\": json.dumps(options)}\n\n    def send_pdf(self, file_path) -> str:\n        with open(file_path, \"rb\") as f:\n            files = {\"file\": f}\n            response = requests.post(\n                self.url, headers=self._mathpix_headers, files=files, data=self.data\n            )\n        response_data = response.json()\n        if \"pdf_id\" in response_data:\n            pdf_id = response_data[\"pdf_id\"]\n            return pdf_id\n        else:\n            raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n    def wait_for_processing(self, pdf_id: str) -> None:\n        \"\"\"Wait for processing to complete.\n\n        Args:\n            pdf_id: a PDF id.\n\n        Returns: None\n        \"\"\"\n        url = self.url + \"/\" + pdf_id\n        for _ in range(0, self.max_wait_time_seconds, 5):\n            response = requests.get(url, headers=self._mathpix_headers)\n            response_data = response.json()\n            status = response_data.get(\"status\", None)\n\n            if status == \"completed\":\n                return\n            elif status == \"error\":\n                raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n            else:\n                print(response_data)\n                print(url)\n                time.sleep(5)\n        raise TimeoutError\n\n    def get_processed_pdf(self, pdf_id: str) -> str:\n        self.wait_for_processing(pdf_id)\n        url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n        response = requests.get(url, headers=self._mathpix_headers)\n        return response.content.decode(\"utf-8\")\n\n    def clean_pdf(self, contents: str) -> str:\n        \"\"\"Clean the PDF file.\n\n        Args:\n            contents: a PDF file contents.\n\n        Returns:\n\n        \"\"\"\n        contents = \"\\n\".join(\n            [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n        )\n        # replace \\section{Title} with # Title\n        contents = contents.replace(\"\\\\section{\", \"# \")\n        # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n        # http:// or https:// followed by anything but a closing paren\n        url_regex = \"http[s]?://[^)]+\"\n        markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n        contents = (\n            contents.replace(r\"\\$\", \"$\")\n            .replace(r\"\\%\", \"%\")\n            .replace(r\"\\(\", \"(\")\n            .replace(r\"\\)\", \")\")\n            .replace(\"$\\\\begin{array}\", \"\")\n            .replace(\"\\\\end{array}$\", \"\")\n            .replace(\"\\\\\\\\\", \"\")\n            .replace(\"\\\\text\", \"\")\n            .replace(\"}\", \"\")\n            .replace(\"{\", \"\")\n            .replace(\"\\\\mathrm\", \"\")\n        )\n        contents = re.sub(markup_regex, \"\", contents)\n        return contents\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            content = kwargs[\"response_content\"]\n        else:\n            # call original API\n            pdf_id = self.send_pdf(file_path)\n            content = self.get_processed_pdf(pdf_id)\n\n        if self.should_clean_pdf:\n            content = self.clean_pdf(content)\n        tables, texts = parse_markdown_text_to_tables(content)\n        documents = []\n        for table in tables:\n            text = strip_special_chars_markdown(table)\n            metadata = {\n                \"table_origin\": table,\n                \"type\": \"table\",\n            }\n            if extra_info:\n                metadata.update(extra_info)\n            documents.append(\n                Document(\n                    text=text,\n                    metadata=metadata,\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        for text in texts:\n            metadata = {\"source\": file_path.name, \"type\": \"text\"}\n            documents.append(Document(text=text, metadata=metadata))\n\n        return documents\n
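A usage sketch (credentials and file name are placeholders); the API id/key are read from keyword arguments or the MATHPIX_API_ID / MATHPIX_API_KEY environment variables:

```python
from pathlib import Path

from kotaemon.loaders import MathpixPDFReader

reader = MathpixPDFReader(
    mathpix_api_id="...",   # placeholder credential
    mathpix_api_key="...",  # placeholder credential
    should_clean_pdf=True,
)
docs = reader.load_data(Path("paper.pdf"))  # hypothetical file
```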
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"
    wait_for_processing(pdf_id)\n

    Wait for processing to complete.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pdf_id | str | a PDF id. | required |

    Returns: None

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def wait_for_processing(self, pdf_id: str) -> None:\n    \"\"\"Wait for processing to complete.\n\n    Args:\n        pdf_id: a PDF id.\n\n    Returns: None\n    \"\"\"\n    url = self.url + \"/\" + pdf_id\n    for _ in range(0, self.max_wait_time_seconds, 5):\n        response = requests.get(url, headers=self._mathpix_headers)\n        response_data = response.json()\n        status = response_data.get(\"status\", None)\n\n        if status == \"completed\":\n            return\n        elif status == \"error\":\n            raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n        else:\n            print(response_data)\n            print(url)\n            time.sleep(5)\n    raise TimeoutError\n
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"
    clean_pdf(contents)\n

    Clean the PDF file.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| contents | str | the PDF file contents. | required |

Returns: The cleaned contents string.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def clean_pdf(self, contents: str) -> str:\n    \"\"\"Clean the PDF file.\n\n    Args:\n        contents: a PDF file contents.\n\n    Returns:\n\n    \"\"\"\n    contents = \"\\n\".join(\n        [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n    )\n    # replace \\section{Title} with # Title\n    contents = contents.replace(\"\\\\section{\", \"# \")\n    # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n    # http:// or https:// followed by anything but a closing paren\n    url_regex = \"http[s]?://[^)]+\"\n    markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n    contents = (\n        contents.replace(r\"\\$\", \"$\")\n        .replace(r\"\\%\", \"%\")\n        .replace(r\"\\(\", \"(\")\n        .replace(r\"\\)\", \")\")\n        .replace(\"$\\\\begin{array}\", \"\")\n        .replace(\"\\\\end{array}$\", \"\")\n        .replace(\"\\\\\\\\\", \"\")\n        .replace(\"\\\\text\", \"\")\n        .replace(\"}\", \"\")\n        .replace(\"{\", \"\")\n        .replace(\"\\\\mathrm\", \"\")\n    )\n    contents = re.sub(markup_regex, \"\", contents)\n    return contents\n
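A tiny demonstration of what clean_pdf does to typical Mathpix output (the input string is made up):

```python
from kotaemon.loaders import MathpixPDFReader

raw = "![](https://example.com/fig.png)\n\\section{Intro}\nCost: \\$5 and 10\\%"
print(MathpixPDFReader().clean_pdf(raw))
# Image-only lines are dropped; "\section{...}" becomes "# ...";
# escaped $, %, ( and ) are unescaped.
```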
    "},{"location":"reference/loaders/#loaders.ImageReader","title":"ImageReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with high focus on table extraction

    Example
    >> from knowledgehub.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| endpoint | Optional[str] | URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT or use the default knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/) | None |
| use_ocr | | whether to use OCR to read text (e.g., from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. | required |

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class ImageReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from knowledgehub.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=False\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        extra_info = extra_info or {}\n        result = []\n        for ocr_result in ocr_results:\n            result.append(\n                Document(\n                    content=ocr_result[\"csv_string\"],\n                    metadata=extra_info,\n                )\n            )\n\n        return result\n
    "},{"location":"reference/loaders/#loaders.ImageReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Path | Path to PDF file | required |
| debug_path | Path | Path to store debug image output | required |
| artifact_path | Path | Path to OCR endpoints artifacts directory | required |

    Returns:

| Type | Description |
| --- | --- |
| List[Document] | List of documents extracted from the PDF file. |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=False\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    extra_info = extra_info or {}\n    result = []\n    for ocr_result in ocr_results:\n        result.append(\n            Document(\n                content=ocr_result[\"csv_string\"],\n                metadata=extra_info,\n            )\n        )\n\n    return result\n
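A usage sketch (endpoint and file are illustrative); the reader posts the file to the FullOCR endpoint and wraps each returned CSV string in a Document:

```python
from pathlib import Path

from kotaemon.loaders import ImageReader

reader = ImageReader(endpoint="http://127.0.0.1:8000/v2/ai/infer/")
docs = reader.load_data(Path("scan.png"))  # hypothetical image file
```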
    "},{"location":"reference/loaders/#loaders.OCRReader","title":"OCRReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with high focus on table extraction

    Example
    >> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| endpoint | Optional[str] | URL to FullOCR endpoint. If not provided, will look for environment variable OCR_READER_ENDPOINT or use the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/) | None |
| use_ocr | | whether to use OCR to read text (e.g., from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. | True |

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class OCRReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n        self.use_ocr = use_ocr\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        debug_path = kwargs.pop(\"debug_path\", None)\n        artifact_path = kwargs.pop(\"artifact_path\", None)\n\n        # read PDF through normal reader (unstructured)\n        pdf_page_items = read_pdf_unstructured(file_path)\n        # merge PDF text output with OCR output\n        tables, texts = parse_ocr_output(\n            ocr_results,\n            pdf_page_items,\n            debug_path=debug_path,\n            artifact_path=artifact_path,\n        )\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=strip_special_chars_markdown(table_text),\n                metadata={\n                    \"table_origin\": table_text,\n                    \"type\": \"table\",\n                    \"page_label\": page_id + 1,\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for page_id, table_text in tables\n        ]\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text,\n                    metadata={\"page_label\": page_id + 1, **extra_info},\n                )\n                for page_id, non_table_text in texts\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/#loaders.OCRReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Path | Path to PDF file | required |
| debug_path | Path | Path to store debug image output | required |
| artifact_path | Path | Path to OCR endpoints artifacts directory | required |

    Returns:

| Type | Description |
| --- | --- |
| List[Document] | List of documents extracted from the PDF file. |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    debug_path = kwargs.pop(\"debug_path\", None)\n    artifact_path = kwargs.pop(\"artifact_path\", None)\n\n    # read PDF through normal reader (unstructured)\n    pdf_page_items = read_pdf_unstructured(file_path)\n    # merge PDF text output with OCR output\n    tables, texts = parse_ocr_output(\n        ocr_results,\n        pdf_page_items,\n        debug_path=debug_path,\n        artifact_path=artifact_path,\n    )\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=strip_special_chars_markdown(table_text),\n            metadata={\n                \"table_origin\": table_text,\n                \"type\": \"table\",\n                \"page_label\": page_id + 1,\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for page_id, table_text in tables\n    ]\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text,\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, non_table_text in texts\n        ]\n    )\n\n    return documents\n
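A usage sketch (file name is hypothetical); with use_ocr=True the endpoint OCRs the full text, and table Documents are tagged via metadata:

```python
from pathlib import Path

from kotaemon.loaders import OCRReader

reader = OCRReader(use_ocr=True)
docs = reader.load_data(Path("report.pdf"))  # hypothetical file
tables = [d for d in docs if d.metadata.get("type") == "table"]
```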
    "},{"location":"reference/loaders/#loaders.PDFThumbnailReader","title":"PDFThumbnailReader","text":"

    Bases: PDFReader

    PDF parser with thumbnail for each page.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    class PDFThumbnailReader(PDFReader):\n    \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"\n        Initialize PDFReader.\n        \"\"\"\n        super().__init__(return_full_document=False)\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        fs: Optional[AbstractFileSystem] = None,\n    ) -> List[Document]:\n        \"\"\"Parse file.\"\"\"\n        documents = super().load_data(file, extra_info, fs)\n\n        page_numbers_str = []\n        filtered_docs = []\n        is_int_page_number: dict[str, bool] = {}\n\n        for doc in documents:\n            if \"page_label\" in doc.metadata:\n                page_num_str = doc.metadata[\"page_label\"]\n                page_numbers_str.append(page_num_str)\n                try:\n                    _ = int(page_num_str)\n                    is_int_page_number[page_num_str] = True\n                    filtered_docs.append(doc)\n                except ValueError:\n                    is_int_page_number[page_num_str] = False\n                    continue\n\n        documents = filtered_docs\n        page_numbers = list(range(len(page_numbers_str)))\n\n        print(\"Page numbers:\", len(page_numbers))\n        page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n        documents.extend(\n            [\n                Document(\n                    text=\"Page thumbnail\",\n                    metadata={\n                        \"image_origin\": page_thumbnail,\n                        \"type\": \"thumbnail\",\n                        \"page_label\": page_number,\n                        **(extra_info if extra_info is not None else {}),\n                    },\n                )\n                for (page_thumbnail, page_number) in zip(\n                    page_thumbnails, page_numbers_str\n                )\n                if is_int_page_number[page_number]\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/#loaders.PDFThumbnailReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, fs=None)\n

    Parse file.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n    \"\"\"Parse file.\"\"\"\n    documents = super().load_data(file, extra_info, fs)\n\n    page_numbers_str = []\n    filtered_docs = []\n    is_int_page_number: dict[str, bool] = {}\n\n    for doc in documents:\n        if \"page_label\" in doc.metadata:\n            page_num_str = doc.metadata[\"page_label\"]\n            page_numbers_str.append(page_num_str)\n            try:\n                _ = int(page_num_str)\n                is_int_page_number[page_num_str] = True\n                filtered_docs.append(doc)\n            except ValueError:\n                is_int_page_number[page_num_str] = False\n                continue\n\n    documents = filtered_docs\n    page_numbers = list(range(len(page_numbers_str)))\n\n    print(\"Page numbers:\", len(page_numbers))\n    page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n    documents.extend(\n        [\n            Document(\n                text=\"Page thumbnail\",\n                metadata={\n                    \"image_origin\": page_thumbnail,\n                    \"type\": \"thumbnail\",\n                    \"page_label\": page_number,\n                    **(extra_info if extra_info is not None else {}),\n                },\n            )\n            for (page_thumbnail, page_number) in zip(\n                page_thumbnails, page_numbers_str\n            )\n            if is_int_page_number[page_number]\n        ]\n    )\n\n    return documents\n
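A usage sketch (file name is hypothetical); thumbnail Documents carry the rendered page in metadata["image_origin"] alongside the regular text Documents:

```python
from pathlib import Path

from kotaemon.loaders import PDFThumbnailReader

reader = PDFThumbnailReader()
docs = reader.load_data(Path("report.pdf"))  # hypothetical file
thumbs = [d for d in docs if d.metadata.get("type") == "thumbnail"]
```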
    "},{"location":"reference/loaders/#loaders.UnstructuredReader","title":"UnstructuredReader","text":"

    Bases: BaseReader

    General unstructured text reader for a variety of files.

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
    class UnstructuredReader(BaseReader):\n    \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n    def __init__(self, *args: Any, **kwargs: Any) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args)  # not passing kwargs to parent bc it cannot accept it\n\n        self.api = False  # we default to local\n        if \"url\" in kwargs:\n            self.server_url = str(kwargs[\"url\"])\n            self.api = True  # is url was set, switch to api\n        else:\n            self.server_url = \"http://localhost:8000\"\n\n        if \"api\" in kwargs:\n            self.api = kwargs[\"api\"]\n\n        self.api_key = \"\"\n        if \"api_key\" in kwargs:\n            self.api_key = kwargs[\"api_key\"]\n\n    \"\"\" Loads data using Unstructured.io\n\n        Depending on the construction if url is set or api = True\n        it'll parse file using API call, else parse it locally\n        additional_metadata is extended by the returned metadata if\n        split_documents is True\n\n        Returns list of documents\n    \"\"\"\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        split_documents: Optional[bool] = False,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"If api is set, parse through api\"\"\"\n        file_path_str = str(file)\n        if self.api:\n            from unstructured.partition.api import partition_via_api\n\n            elements = partition_via_api(\n                filename=file_path_str,\n                api_key=self.api_key,\n                api_url=self.server_url + \"/general/v0/general\",\n            )\n        else:\n            \"\"\"Parse file locally\"\"\"\n            from unstructured.partition.auto import partition\n\n            elements = partition(filename=file_path_str)\n\n        \"\"\" Process elements \"\"\"\n        docs = []\n        file_name = Path(file).name\n        file_path = str(Path(file).resolve())\n        if split_documents:\n            for node in elements:\n                metadata = {\"file_name\": file_name, \"file_path\": file_path}\n                if hasattr(node, \"metadata\"):\n                    \"\"\"Load metadata fields\"\"\"\n                    for field, val in vars(node.metadata).items():\n                        if field == \"_known_field_names\":\n                            continue\n                        # removing coordinates because it does not serialize\n                        # and dont want to bother with it\n                        if field == \"coordinates\":\n                            continue\n                        # removing bc it might cause interference\n                        if field == \"parent_id\":\n                            continue\n                        metadata[field] = val\n\n                if extra_info is not None:\n                    metadata.update(extra_info)\n\n                metadata[\"file_name\"] = file_name\n                docs.append(Document(text=node.text, metadata=metadata))\n\n        else:\n            text_chunks = [\" \".join(str(el).split()) for el in elements]\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            # Create a single document by joining all the texts\n            docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n        return docs\n
    "},{"location":"reference/loaders/#loaders.UnstructuredReader.load_data","title":"load_data","text":"
    load_data(\n    file, extra_info=None, split_documents=False, **kwargs\n)\n

    If api is set, parse through api

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    split_documents: Optional[bool] = False,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"If api is set, parse through api\"\"\"\n    file_path_str = str(file)\n    if self.api:\n        from unstructured.partition.api import partition_via_api\n\n        elements = partition_via_api(\n            filename=file_path_str,\n            api_key=self.api_key,\n            api_url=self.server_url + \"/general/v0/general\",\n        )\n    else:\n        \"\"\"Parse file locally\"\"\"\n        from unstructured.partition.auto import partition\n\n        elements = partition(filename=file_path_str)\n\n    \"\"\" Process elements \"\"\"\n    docs = []\n    file_name = Path(file).name\n    file_path = str(Path(file).resolve())\n    if split_documents:\n        for node in elements:\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n            if hasattr(node, \"metadata\"):\n                \"\"\"Load metadata fields\"\"\"\n                for field, val in vars(node.metadata).items():\n                    if field == \"_known_field_names\":\n                        continue\n                    # removing coordinates because it does not serialize\n                    # and dont want to bother with it\n                    if field == \"coordinates\":\n                        continue\n                    # removing bc it might cause interference\n                    if field == \"parent_id\":\n                        continue\n                    metadata[field] = val\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            metadata[\"file_name\"] = file_name\n            docs.append(Document(text=node.text, metadata=metadata))\n\n    else:\n        text_chunks = [\" \".join(str(el).split()) for el in elements]\n        metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n        if extra_info is not None:\n            metadata.update(extra_info)\n\n        # Create a single document by joining all the texts\n        docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n    return docs\n
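A usage sketch (illustrative file name), parsing locally and splitting one Document per element:

```python
from pathlib import Path

from kotaemon.loaders import UnstructuredReader

# Local parsing by default; pass url="https://..." or api=True to call the hosted API
reader = UnstructuredReader()
docs = reader.load_data(Path("report.pdf"), split_documents=True)

for doc in docs[:3]:
    print(doc.metadata["file_name"], doc.text[:80])
```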
    "},{"location":"reference/loaders/adobe_loader/","title":"Adobe Loader","text":""},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader","title":"AdobeReader","text":"

    Bases: BaseReader

Read PDF using Adobe's PDF Services. Able to extract text, tables, and figures with high accuracy.

    Example
    >> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

    Args: endpoint: URL to the Vision Language Model endpoint. If not provided, will use the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT

max_figures_to_caption: an int that decides how many figures will be captioned. The rest will be ignored (indexed without captions).
    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n    \"\"\"Read PDF using the Adobe's PDF Services.\n    Be able to extract text, table, and figure with high accuracy\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import AdobeReader\n        >> reader = AdobeReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n    Args:\n        endpoint: URL to the Vision Language Model endpoint. If not provided,\n        will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n        max_figures_to_caption: an int decides how many figured will be captioned.\n        The rest will be ignored (are indexed without captions).\n    \"\"\"\n\n    def __init__(\n        self,\n        vlm_endpoint: Optional[str] = None,\n        max_figures_to_caption: int = 100,\n        *args: Any,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params\"\"\"\n        super().__init__(*args)\n        self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n        self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n        self.max_figures_to_caption = max_figures_to_caption\n\n    def load_data(\n        self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data by calling to the Adobe's API\n\n        Args:\n            file (Path): Path to the PDF file\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file,\n                includes 3 types: text, table, and image\n\n        \"\"\"\n        from .utils.adobe import (\n            generate_figure_captions,\n            load_json,\n            parse_figure_paths,\n            parse_table_paths,\n            request_adobe_service,\n        )\n\n        filename = file.name\n        filepath = str(Path(file).resolve())\n        output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n        results_path = os.path.join(output_path, \"structuredData.json\")\n\n        if not os.path.exists(results_path):\n            logger.exception(\"Fail to parse the document.\")\n            return []\n\n        data = load_json(results_path)\n\n        texts = defaultdict(list)\n        tables = []\n        figures = []\n\n        elements = data[\"elements\"]\n        for item_id, item in enumerate(elements):\n            page_number = item.get(\"Page\", -1) + 1\n            item_path = item[\"Path\"]\n            item_text = item.get(\"Text\", \"\")\n\n            file_paths = [\n                Path(output_path) / path for path in item.get(\"filePaths\", [])\n            ]\n            prev_item = elements[item_id - 1]\n            title = prev_item.get(\"Text\", \"\")\n\n            if re.search(self.table_regex, item_path):\n                table_content = parse_table_paths(file_paths)\n                if not table_content:\n                    continue\n                table_caption = (\n                    table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                    + f\"\\n(Table in Page {page_number}. {title})\"\n                )\n                tables.append((page_number, table_content, table_caption))\n\n            elif re.search(self.figure_regex, item_path):\n                figure_caption = (\n                    item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n                )\n                figure_content = parse_figure_paths(file_paths)\n                if not figure_content:\n                    continue\n                figures.append([page_number, figure_content, figure_caption])\n\n            else:\n                if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                    texts[page_number].append(item_text)\n\n        # get figure caption using GPT-4V\n        figure_captions = generate_figure_captions(\n            self.vlm_endpoint,\n            [item[1] for item in figures],\n            self.max_figures_to_caption,\n        )\n        for item, caption in zip(figures, figure_captions):\n            # update figure caption\n            item[2] += \" \" + caption\n\n        # Wrap elements with Document\n        documents = []\n\n        # join plain text elements\n        for page_number, txts in texts.items():\n            documents.append(\n                Document(\n                    text=\"\\n\".join(txts),\n                    metadata={\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                )\n            )\n\n        # table elements\n        for page_number, table_content, table_caption in tables:\n            documents.append(\n                Document(\n                    text=table_content,\n                    metadata={\n                        \"table_origin\": table_content,\n                        \"type\": \"table\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        # figure elements\n        for page_number, figure_content, figure_caption in figures:\n            documents.append(\n                Document(\n                    text=figure_caption,\n                    metadata={\n                        \"image_origin\": figure_content,\n                        \"type\": \"image\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n        return documents\n
    "},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, **kwargs)\n

Load data by calling Adobe's API

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Path` | Path to the PDF file | *required* |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | list of documents extracted from the PDF file; includes 3 types: text, table, and image |

    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n    self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data by calling to the Adobe's API\n\n    Args:\n        file (Path): Path to the PDF file\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file,\n            includes 3 types: text, table, and image\n\n    \"\"\"\n    from .utils.adobe import (\n        generate_figure_captions,\n        load_json,\n        parse_figure_paths,\n        parse_table_paths,\n        request_adobe_service,\n    )\n\n    filename = file.name\n    filepath = str(Path(file).resolve())\n    output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n    results_path = os.path.join(output_path, \"structuredData.json\")\n\n    if not os.path.exists(results_path):\n        logger.exception(\"Fail to parse the document.\")\n        return []\n\n    data = load_json(results_path)\n\n    texts = defaultdict(list)\n    tables = []\n    figures = []\n\n    elements = data[\"elements\"]\n    for item_id, item in enumerate(elements):\n        page_number = item.get(\"Page\", -1) + 1\n        item_path = item[\"Path\"]\n        item_text = item.get(\"Text\", \"\")\n\n        file_paths = [\n            Path(output_path) / path for path in item.get(\"filePaths\", [])\n        ]\n        prev_item = elements[item_id - 1]\n        title = prev_item.get(\"Text\", \"\")\n\n        if re.search(self.table_regex, item_path):\n            table_content = parse_table_paths(file_paths)\n            if not table_content:\n                continue\n            table_caption = (\n                table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                + f\"\\n(Table in Page {page_number}. {title})\"\n            )\n            tables.append((page_number, table_content, table_caption))\n\n        elif re.search(self.figure_regex, item_path):\n            figure_caption = (\n                item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n            )\n            figure_content = parse_figure_paths(file_paths)\n            if not figure_content:\n                continue\n            figures.append([page_number, figure_content, figure_caption])\n\n        else:\n            if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                texts[page_number].append(item_text)\n\n    # get figure caption using GPT-4V\n    figure_captions = generate_figure_captions(\n        self.vlm_endpoint,\n        [item[1] for item in figures],\n        self.max_figures_to_caption,\n    )\n    for item, caption in zip(figures, figure_captions):\n        # update figure caption\n        item[2] += \" \" + caption\n\n    # Wrap elements with Document\n    documents = []\n\n    # join plain text elements\n    for page_number, txts in texts.items():\n        documents.append(\n            Document(\n                text=\"\\n\".join(txts),\n                metadata={\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n            )\n        )\n\n    # table elements\n    for page_number, table_content, table_caption in tables:\n        documents.append(\n            Document(\n                text=table_content,\n                metadata={\n                    \"table_origin\": table_content,\n                    \"type\": \"table\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n\n    # figure elements\n    for page_number, figure_content, figure_caption in figures:\n        documents.append(\n            Document(\n                text=figure_caption,\n                metadata={\n                    \"image_origin\": figure_content,\n                    \"type\": \"image\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n    return documents\n
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/","title":"Azureai Document Intelligence Loader","text":""},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":"

    Bases: BaseReader

Utilize Azure AI Document Intelligence to parse documents

    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx and html.

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n    \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n    heif, docx, xlsx, pptx and html.\n    \"\"\"\n\n    _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n    endpoint: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n        help=\"Endpoint of Azure AI Document Intelligence\",\n    )\n    credential: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n        help=\"Credential of Azure AI Document Intelligence\",\n    )\n    model: str = Param(\n        \"prebuilt-layout\",\n        help=(\n            \"Model to use for document analysis. Default is prebuilt-layout. \"\n            \"As of April 24, you can view the supported models [here]\"\n            \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n            \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n            \"#model-analysis-features)\"\n        ),\n    )\n    output_content_format: str = Param(\n        \"markdown\",\n        help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n    )\n    vlm_endpoint: str = Param(\n        help=(\n            \"Default VLM endpoint for figure captioning. If not provided, will not \"\n            \"caption the figures\"\n        )\n    )\n    figure_friendly_filetypes: list[str] = Param(\n        [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n        help=(\n            \"File types that we can reliably open and extract figures. \"\n            \"For files like .docx or .html, the visual layout may be different \"\n            \"when viewed from different tools, hence we cannot use Azure DI \"\n            \"location to extract figures.\"\n        ),\n    )\n    cache_dir: str = Param(\n        None,\n        help=\"Directory to cache the downloaded files. Default is None\",\n    )\n\n    @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n    def client_(self):\n        try:\n            from azure.ai.documentintelligence import DocumentIntelligenceClient\n            from azure.core.credentials import AzureKeyCredential\n        except ImportError:\n            raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n        return DocumentIntelligenceClient(\n            self.endpoint, AzureKeyCredential(self.credential)\n        )\n\n    def run(\n        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n        metadata = extra_info or {}\n        file_name = Path(file_path)\n        with open(file_path, \"rb\") as fi:\n            poller = self.client_.begin_analyze_document(\n                self.model,\n                analyze_request=fi,\n                content_type=\"application/octet-stream\",\n                output_content_format=self.output_content_format,\n            )\n            result = poller.result()\n\n        # the total text content of the document in `output_content_format` format\n        text_content = result.content\n        removed_spans: list[dict] = []\n\n        # extract the figures\n        figures = []\n        for figure_desc in result.get(\"figures\", []):\n            if not self.vlm_endpoint:\n                continue\n            if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n                continue\n\n            # read & crop the image\n            page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n            page_width = result.pages[page_number - 1][\"width\"]\n            page_height = result.pages[page_number - 1][\"height\"]\n            polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n            xs = [polygon[i] for i in range(0, len(polygon), 2)]\n            ys = [polygon[i] for i in range(1, len(polygon), 2)]\n            bbox = [\n                min(xs) / page_width,\n                min(ys) / page_height,\n                max(xs) / page_width,\n                max(ys) / page_height,\n            ]\n            img = crop_image(file_path, bbox, page_number - 1)\n\n            # convert the image into base64\n            img_bytes = BytesIO()\n            img.save(img_bytes, format=\"PNG\")\n            img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n            img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n            # caption the image\n            caption = generate_single_figure_caption(\n                figure=img_base64, vlm_endpoint=self.vlm_endpoint\n            )\n\n            # store the image into document\n            figure_metadata = {\n                \"image_origin\": img_base64,\n                \"type\": \"image\",\n                \"page_label\": page_number,\n            }\n            figure_metadata.update(metadata)\n\n            figures.append(\n                Document(\n                    text=caption,\n                    metadata=figure_metadata,\n                )\n            )\n            removed_spans += figure_desc[\"spans\"]\n\n        # extract the tables\n        tables = []\n        for table_desc in result.get(\"tables\", []):\n            if not table_desc[\"spans\"]:\n                continue\n\n            # convert the tables into markdown format\n            boundingRegions = table_desc[\"boundingRegions\"]\n            if boundingRegions:\n                page_number = boundingRegions[0][\"pageNumber\"]\n            else:\n                page_number = 1\n\n            # store the tables into document\n            offset = table_desc[\"spans\"][0][\"offset\"]\n            length = table_desc[\"spans\"][0][\"length\"]\n            table_metadata = {\n                \"type\": \"table\",\n                \"page_label\": page_number,\n                \"table_origin\": text_content[offset : offset + length],\n            }\n            table_metadata.update(metadata)\n\n            tables.append(\n                Document(\n                    text=text_content[offset : offset + length],\n                    metadata=table_metadata,\n                )\n            )\n            removed_spans += table_desc[\"spans\"]\n        # save the text content into markdown format\n        if self.cache_dir is not None:\n            with open(\n                Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n            ) as f:\n                f.write(text_content)\n\n        removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n        for span in removed_spans:\n            text_content = (\n                text_content[: span[\"offset\"]]\n                + text_content[span[\"offset\"] + span[\"length\"] :]\n            )\n\n        return [Document(content=text_content, metadata=metadata)] + figures + tables\n
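A hedged usage sketch: the endpoint and key placeholders are illustrative (both also fall back to the `AZUREAI_DOCUMENT_INTELLIGENT_*` environment variables), and keyword initialization of the declared `Param` fields is assumed:

```python
from pathlib import Path

from kotaemon.loaders import AzureAIDocumentIntelligenceLoader

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<api-key>",  # placeholder
    output_content_format="markdown",
)
docs = loader.load_data(Path("invoice.pdf"))  # illustrative file name

# The main text Document comes first, followed by figure and table Documents
main_text, extras = docs[0], docs[1:]
```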
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Extract the input file, allowing multi-modal extraction

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n    metadata = extra_info or {}\n    file_name = Path(file_path)\n    with open(file_path, \"rb\") as fi:\n        poller = self.client_.begin_analyze_document(\n            self.model,\n            analyze_request=fi,\n            content_type=\"application/octet-stream\",\n            output_content_format=self.output_content_format,\n        )\n        result = poller.result()\n\n    # the total text content of the document in `output_content_format` format\n    text_content = result.content\n    removed_spans: list[dict] = []\n\n    # extract the figures\n    figures = []\n    for figure_desc in result.get(\"figures\", []):\n        if not self.vlm_endpoint:\n            continue\n        if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n            continue\n\n        # read & crop the image\n        page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n        page_width = result.pages[page_number - 1][\"width\"]\n        page_height = result.pages[page_number - 1][\"height\"]\n        polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n        xs = [polygon[i] for i in range(0, len(polygon), 2)]\n        ys = [polygon[i] for i in range(1, len(polygon), 2)]\n        bbox = [\n            min(xs) / page_width,\n            min(ys) / page_height,\n            max(xs) / page_width,\n            max(ys) / page_height,\n        ]\n        img = crop_image(file_path, bbox, page_number - 1)\n\n        # convert the image into base64\n        img_bytes = BytesIO()\n        img.save(img_bytes, format=\"PNG\")\n        img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n        img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n        # caption the image\n        caption = generate_single_figure_caption(\n            figure=img_base64, vlm_endpoint=self.vlm_endpoint\n        )\n\n        # store the image into document\n        figure_metadata = {\n            \"image_origin\": img_base64,\n            \"type\": \"image\",\n            \"page_label\": page_number,\n        }\n        figure_metadata.update(metadata)\n\n        figures.append(\n            Document(\n                text=caption,\n                metadata=figure_metadata,\n            )\n        )\n        removed_spans += figure_desc[\"spans\"]\n\n    # extract the tables\n    tables = []\n    for table_desc in result.get(\"tables\", []):\n        if not table_desc[\"spans\"]:\n            continue\n\n        # convert the tables into markdown format\n        boundingRegions = table_desc[\"boundingRegions\"]\n        if boundingRegions:\n            page_number = boundingRegions[0][\"pageNumber\"]\n        else:\n            page_number = 1\n\n        # store the tables into document\n        offset = table_desc[\"spans\"][0][\"offset\"]\n        length = table_desc[\"spans\"][0][\"length\"]\n        table_metadata = {\n            \"type\": \"table\",\n            \"page_label\": page_number,\n            \"table_origin\": text_content[offset : offset + length],\n        }\n        table_metadata.update(metadata)\n\n        tables.append(\n            Document(\n                text=text_content[offset : offset + length],\n                metadata=table_metadata,\n            )\n        )\n        removed_spans += table_desc[\"spans\"]\n    # save the text content into markdown format\n    if self.cache_dir is not None:\n        with open(\n            Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n        ) as f:\n            f.write(text_content)\n\n    removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n    for span in removed_spans:\n        text_content = (\n            text_content[: span[\"offset\"]]\n            + text_content[span[\"offset\"] + span[\"length\"] :]\n        )\n\n    return [Document(content=text_content, metadata=metadata)] + figures + tables\n
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.crop_image","title":"crop_image","text":"
    crop_image(file_path, bbox, page_number=0)\n

    Crop the image based on the bounding box

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path` | path to the image file | *required* |
| `bbox` | `list[float]` | bounding box of the image (in percentage `[x0, y0, x1, y1]`) | *required* |
| `page_number` | `int` | page number of the image. Defaults to 0. | `0` |

Returns:

| Type | Description |
| --- | --- |
| `Image.Image` | cropped image |

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:\n    \"\"\"Crop the image based on the bounding box\n\n    Args:\n        file_path (Path): path to the image file\n        bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1])\n        page_number (int, optional): page number of the image. Defaults to 0.\n\n    Returns:\n        Image.Image: cropped image\n    \"\"\"\n    left, upper, right, lower = bbox\n\n    img: Image.Image\n    suffix = file_path.suffix.lower()\n    if suffix == \".pdf\":\n        try:\n            import fitz\n        except ImportError:\n            raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n        doc = fitz.open(file_path)\n        page = doc.load_page(page_number)\n        pm = page.get_pixmap(dpi=150)\n        img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n    elif suffix in [\".tif\", \".tiff\"]:\n        img = Image.open(file_path)\n        img.seek(page_number)\n    else:\n        img = Image.open(file_path)\n\n    return img.crop(\n        (\n            int(left * img.width),\n            int(upper * img.height),\n            int(right * img.width),\n            int(lower * img.height),\n        )\n    )\n
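A small sketch of the fractional-bbox convention (file names illustrative):

```python
from pathlib import Path

from kotaemon.loaders.azureai_document_intelligence_loader import crop_image

# bbox is fractional [x0, y0, x1, y1]; this crops the top-left quadrant of page 1
img = crop_image(Path("scan.pdf"), bbox=[0.0, 0.0, 0.5, 0.5], page_number=0)
img.save("region.png")
```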
    "},{"location":"reference/loaders/base/","title":"Base","text":""},{"location":"reference/loaders/base/#loaders.base.BaseReader","title":"BaseReader","text":"

    Bases: BaseComponent

    The base class for all readers

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class BaseReader(BaseComponent):\n    \"\"\"The base class for all readers\"\"\"\n\n    ...\n
    "},{"location":"reference/loaders/base/#loaders.base.AutoReader","title":"AutoReader","text":"

    Bases: BaseReader

    General auto reader for a variety of files. (based on llama-hub)

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class AutoReader(BaseReader):\n    \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n    def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n        \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n        if isinstance(reader_type, str):\n            from llama_index.core import download_loader\n\n            self._reader = download_loader(reader_type)()\n        else:\n            self._reader = reader_type()\n        super().__init__()\n\n    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(file=file, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        return self.load_data(file=file, **kwargs)\n
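A sketch, assuming `"UnstructuredReader"` is a valid llama-hub loader identifier (the string is resolved through `download_loader`; both the identifier and file name are illustrative):

```python
from kotaemon.loaders import AutoReader

# String identifiers are downloaded from llama-hub; a reader class also works
reader = AutoReader("UnstructuredReader")
docs = reader.load_data(file="notes.md")
```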
    "},{"location":"reference/loaders/base/#loaders.base.LIReaderMixin","title":"LIReaderMixin","text":"

    Bases: BaseComponent

    Base wrapper around llama-index reader

    To use the LIBaseReader, you need to implement the _get_wrapped_class method to return the relevant llama-index reader class that you want to wrap.

    Example:

    ```python\nclass DirectoryReader(LIBaseReader):\n    def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n        from llama_index import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n```\n
    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class LIReaderMixin(BaseComponent):\n    \"\"\"Base wrapper around llama-index reader\n\n    To use the LIBaseReader, you need to implement the _get_wrapped_class method to\n    return the relevant llama-index reader class that you want to wrap.\n\n    Example:\n\n        ```python\n        class DirectoryReader(LIBaseReader):\n            def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n                from llama_index import SimpleDirectoryReader\n\n                return SimpleDirectoryReader\n        ```\n    \"\"\"\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        raise NotImplementedError(\n            \"Please return the relevant llama-index class in in _get_wrapped_class\"\n        )\n\n    def __init__(self, *args, **kwargs):\n        self._reader_class = self._get_wrapped_class()\n        self._reader = self._reader_class(*args, **kwargs)\n        super().__init__()\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\"):\n            return super().__setattr__(name, value)\n\n        return setattr(self._reader, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        return getattr(self._reader, name)\n\n    def load_data(self, *args, **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(*args, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, *args, **kwargs: Any) -> List[Document]:\n        return self.load_data(*args, **kwargs)\n
    "},{"location":"reference/loaders/composite_loader/","title":"Composite Loader","text":""},{"location":"reference/loaders/composite_loader/#loaders.composite_loader.DirectoryReader","title":"DirectoryReader","text":"

    Bases: LIReaderMixin, BaseReader

    Wrap around llama-index SimpleDirectoryReader

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_dir` | `str` | Path to the directory. | *required* |
| `input_files` | `List` | List of file paths to read (Optional; overrides `input_dir`, `exclude`) | *required* |
| `exclude` | `List` | glob of python file paths to exclude (Optional) | *required* |
| `exclude_hidden` | `bool` | Whether to exclude hidden files (dotfiles). | *required* |
| `encoding` | `str` | Encoding of the files. Default is utf-8. | *required* |
| `errors` | `str` | how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open | *required* |
| `recursive` | `bool` | Whether to recursively search in subdirectories. False by default. | *required* |
| `filename_as_id` | `bool` | Whether to use the filename as the document id. False by default. | *required* |
| `required_exts` | `Optional[List[str]]` | List of required extensions. Default is None. | *required* |
| `file_extractor` | `Optional[Dict[str, BaseReader]]` | A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS. | *required* |
| `num_files_limit` | `Optional[int]` | Maximum number of files to read. Default is None. | *required* |
| `file_metadata` | `Optional[Callable[str, Dict]]` | A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. | *required* |

Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
    class DirectoryReader(LIReaderMixin, BaseReader):\n    \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n    Args:\n        input_dir (str): Path to the directory.\n        input_files (List): List of file paths to read\n            (Optional; overrides input_dir, exclude)\n        exclude (List): glob of python file paths to exclude (Optional)\n        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n        encoding (str): Encoding of the files.\n            Default is utf-8.\n        errors (str): how encoding and decoding errors are to be handled,\n              see https://docs.python.org/3/library/functions.html#open\n        recursive (bool): Whether to recursively search in subdirectories.\n            False by default.\n        filename_as_id (bool): Whether to use the filename as the document id.\n            False by default.\n        required_exts (Optional[List[str]]): List of required extensions.\n            Default is None.\n        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n            extension to a BaseReader class that specifies how to convert that file\n            to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n        num_files_limit (Optional[int]): Maximum number of files to read.\n            Default is None.\n        file_metadata (Optional[Callable[str, Dict]]): A function that takes\n            in a filename and returns a Dict of metadata for the Document.\n            Default is None.\n    \"\"\"\n\n    input_dir: Optional[str] = None\n    input_files: Optional[List] = None\n    exclude: Optional[List] = None\n    exclude_hidden: bool = True\n    errors: str = \"ignore\"\n    recursive: bool = False\n    encoding: str = \"utf-8\"\n    filename_as_id: bool = False\n    required_exts: Optional[list[str]] = None\n    file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n    num_files_limit: Optional[int] = None\n    file_metadata: Optional[Callable[[str], dict]] = None\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        from llama_index.core import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n
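A usage sketch; the directory path and extension filter are illustrative, and the constructor kwargs are forwarded to the wrapped SimpleDirectoryReader:

```python
from kotaemon.loaders import DirectoryReader

reader = DirectoryReader(
    input_dir="./docs",             # directory to scan (illustrative)
    recursive=True,                 # include subdirectories
    required_exts=[".md", ".txt"],  # only read these extensions
)
docs = reader.load_data()
```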
    "},{"location":"reference/loaders/docx_loader/","title":"Docx Loader","text":""},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader","title":"DocxReader","text":"

    Bases: BaseReader

Read Docx files, respecting tables, using the python-docx library

Reader behavior: all paragraphs are extracted as a Document; each table is extracted as a Document, rendered as a CSV string; the output is a list of Documents concatenating the above (tables + paragraphs).

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    class DocxReader(BaseReader):\n    \"\"\"Read Docx files that respect table, using python-docx library\n\n    Reader behavior:\n        - All paragraphs are extracted as a Document\n        - Each table is extracted as a Document, rendered as a CSV string\n        - The output is a list of Documents, concatenating the above\n        (tables + paragraphs)\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        try:\n            import docx  # noqa\n        except ImportError:\n            raise ImportError(\n                \"docx is not installed. \"\n                \"Please install it using `pip install python-docx`\"\n            )\n\n    def _load_single_table(self, table) -> List[List[str]]:\n        \"\"\"Extract content from tables. Return a list of columns: list[str]\n        Some merged cells will share duplicated content.\n        \"\"\"\n        n_row = len(table.rows)\n        n_col = len(table.columns)\n\n        arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n        for i, row in enumerate(table.rows):\n            for j, cell in enumerate(row.cells):\n                arrays[j][i] = cell.text\n\n        return arrays\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using Docx reader\n\n        Args:\n            file_path (Path): Path to .docx file\n\n        Returns:\n            List[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import docx\n\n        file_path = Path(file_path).resolve()\n\n        doc = docx.Document(str(file_path))\n        all_text = \"\\n\".join(\n            [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n        )\n        pages = [all_text]  # 1 page only\n\n        tables = []\n        for t in doc.tables:\n            # return list of columns: list of string\n            arrays = self._load_single_table(t)\n\n            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=table.to_csv(\n                    index=False\n                ).strip(),  # strip_special_chars_markdown()\n                metadata={\n                    \"table_origin\": table.to_csv(index=False),\n                    \"type\": \"table\",\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for table in tables  # page_id\n        ]\n\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text.strip(),\n                    metadata={\"page_label\": 1, **extra_info},\n                )\n                for _, non_table_text in enumerate(pages)\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Docx reader

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file_path` | `Path` | Path to .docx file | *required* |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | list of documents extracted from the .docx file |

    Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using Docx reader\n\n    Args:\n        file_path (Path): Path to .docx file\n\n    Returns:\n        List[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import docx\n\n    file_path = Path(file_path).resolve()\n\n    doc = docx.Document(str(file_path))\n    all_text = \"\\n\".join(\n        [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n    )\n    pages = [all_text]  # 1 page only\n\n    tables = []\n    for t in doc.tables:\n        # return list of columns: list of string\n        arrays = self._load_single_table(t)\n\n        tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=table.to_csv(\n                index=False\n            ).strip(),  # strip_special_chars_markdown()\n            metadata={\n                \"table_origin\": table.to_csv(index=False),\n                \"type\": \"table\",\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for table in tables  # page_id\n    ]\n\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text.strip(),\n                metadata={\"page_label\": 1, **extra_info},\n            )\n            for _, non_table_text in enumerate(pages)\n        ]\n    )\n\n    return documents\n
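A usage sketch (illustrative file name); table Documents can be told apart by their `type` metadata:

```python
from pathlib import Path

from kotaemon.loaders import DocxReader

reader = DocxReader()  # raises ImportError if python-docx is missing
docs = reader.load_data(Path("contract.docx"))

# Tables are emitted as CSV-rendered Documents with type="table" metadata
tables = [d for d in docs if d.metadata.get("type") == "table"]
```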
    "},{"location":"reference/loaders/excel_loader/","title":"Excel Loader","text":"

    Pandas Excel reader.

    Pandas parser for .xlsx files.

    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader","title":"PandasExcelReader","text":"

    Bases: BaseReader

Pandas-based Excel parser.

Parses Excel files using the `pandas.read_excel` function. If special parameters are required, use the `pandas_config` dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class PandasExcelReader(BaseReader):\n    r\"\"\"Pandas-based CSV parser.\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = False,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n        import itertools\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        df_sheets = []\n\n        for key in sheet_names:\n            sheet = []\n            if include_sheetname:\n                sheet.append([key])\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key].fillna(\"\", inplace=True)\n            sheet.extend(dfs[key].values.astype(str).tolist())\n            df_sheets.append(sheet)\n\n        text_list = list(\n            itertools.chain.from_iterable(df_sheets)\n        )  # flatten list of lists\n\n        output = [\n            Document(\n                text=self._row_joiner.join(\n                    self._col_joiner.join(sublist) for sublist in text_list\n                ),\n                metadata=extra_info or {},\n            )\n        ]\n\n        return output\n
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=False,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the file and extract values from the specified sheet(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Path` | The path to the Excel file to read. | *required* |
| `include_sheetname` | `bool` | Whether to include the sheet name in the output. | `False` |
| `sheet_name` | `Union[str, int, None]` | The specific sheet to read from; default is None, which reads all sheets. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | A list of `Document` objects containing the values from the Excel file. |

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = False,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n    import itertools\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    df_sheets = []\n\n    for key in sheet_names:\n        sheet = []\n        if include_sheetname:\n            sheet.append([key])\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key].fillna(\"\", inplace=True)\n        sheet.extend(dfs[key].values.astype(str).tolist())\n        df_sheets.append(sheet)\n\n    text_list = list(\n        itertools.chain.from_iterable(df_sheets)\n    )  # flatten list of lists\n\n    output = [\n        Document(\n            text=self._row_joiner.join(\n                self._col_joiner.join(sublist) for sublist in text_list\n            ),\n            metadata=extra_info or {},\n        )\n    ]\n\n    return output\n
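A usage sketch (illustrative file name); note that this reader joins all rows of all sheets into a single Document:

```python
from pathlib import Path

from kotaemon.loaders import PandasExcelReader

reader = PandasExcelReader()
docs = reader.load_data(Path("data.xlsx"), include_sheetname=True)
print(docs[0].text[:200])  # one Document holding every sheet's rows
```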
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader","title":"ExcelReader","text":"

    Bases: BaseReader

Spreadsheet reader respecting multiple worksheets

Parses Excel files using the `pandas.read_excel` function. If special parameters are required, use the `pandas_config` dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class ExcelReader(BaseReader):\n    r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = True,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        # clean up input\n        file = Path(file)\n        extra_info = extra_info or {}\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        output = []\n\n        for idx, key in enumerate(sheet_names):\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].astype(\"object\")\n            dfs[key].fillna(\"\", inplace=True)\n\n            rows = dfs[key].values.astype(str).tolist()\n            content = self._row_joiner.join(\n                self._col_joiner.join(row).strip() for row in rows\n            ).strip()\n            if include_sheetname:\n                content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n            metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n            output.append(Document(text=content, metadata=metadata))\n\n        return output\n
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=True,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the file and extract values from the specified sheet(s).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `Path` | The path to the Excel file to read. | *required* |
| `include_sheetname` | `bool` | Whether to include the sheet name in the output. | `True` |
| `sheet_name` | `Union[str, int, None]` | The specific sheet to read from; default is None, which reads all sheets. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `List[Document]` | A list of `Document` objects containing the values from the Excel file. |

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = True,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse the file and extract cell values from each sheet.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of `Document` objects containing the\n            values from the sheets of the Excel file.\n    \"\"\"\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    # clean up input\n    file = Path(file)\n    extra_info = extra_info or {}\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    output = []\n\n    for idx, key in enumerate(sheet_names):\n        # drop fully-empty rows, then normalize the remaining cells to strings\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].astype(\"object\")\n        dfs[key].fillna(\"\", inplace=True)\n\n        rows = dfs[key].values.astype(str).tolist()\n        content = self._row_joiner.join(\n            self._col_joiner.join(row).strip() for row in rows\n        ).strip()\n        if include_sheetname:\n            content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n        metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n        output.append(Document(text=content, metadata=metadata))\n\n    return output\n
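    A minimal usage sketch; the workbook name is a placeholder and the package-level `ExcelReader` export is assumed:

    from kotaemon.loaders import ExcelReader\n\n# join cells with \" | \" instead of the default single space\nreader = ExcelReader(col_joiner=\" | \")\ndocs = reader.load_data(\"report.xlsx\", include_sheetname=True)\nfor doc in docs:\n    # one Document per sheet, with a 1-based page label\n    print(doc.metadata[\"sheet_name\"], doc.metadata[\"page_label\"])\n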
    "},{"location":"reference/loaders/html_loader/","title":"Html Loader","text":""},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader","title":"HtmlReader","text":"

    Bases: BaseReader

    Read HTML using html2text

    Reader behavior:

    - The HTML is read with html2text.
    - The text is split into pages by `page_break_pattern` (if provided).
    - Each page is extracted as a Document.
    - The output is a list of Documents.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `page_break_pattern` | `str` | Pattern used to split the HTML into pages. | `None` |

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class HtmlReader(BaseReader):\n    \"\"\"Read HTML using html2text\n\n    Reader behavior:\n        - HTML is read with html2text.\n        - All of the texts will be split by `page_break_pattern`\n        - Each page is extracted as a Document\n        - The output is a list of Documents\n\n    Args:\n        page_break_pattern (str): Pattern to split the HTML into pages\n    \"\"\"\n\n    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n        try:\n            import html2text  # noqa\n        except ImportError:\n            raise ImportError(\n                \"html2text is not installed. \"\n                \"Please install it using `pip install html2text`\"\n            )\n\n        self._page_break_pattern: Optional[str] = page_break_pattern\n        super().__init__()\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load data using Html reader\n\n        Args:\n            file_path: path to HTML file\n            extra_info: extra information passed to this reader while extracting data\n\n        Returns:\n            list[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import html2text\n\n        file_path = Path(file_path).resolve()\n\n        with file_path.open(\"r\") as f:\n            html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n        # read HTML\n        all_text = html2text.html2text(html_text)\n        pages = (\n            all_text.split(self._page_break_pattern)\n            if self._page_break_pattern\n            else [all_text]\n        )\n\n        extra_info = extra_info or {}\n\n        # create Document from non-table text\n        documents = [\n            Document(\n                text=page.strip(),\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, page in enumerate(pages)\n        ]\n\n        return documents\n
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Html reader

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `file_path` | `Path \| str` | Path to the HTML file. | *required* |
    | `extra_info` | `Optional[dict]` | Extra information passed to this reader while extracting data. | `None` |

    Returns:

    | Type | Description |
    | --- | --- |
    | `list[Document]` | List of documents extracted from the HTML file. |

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load data using Html reader\n\n    Args:\n        file_path: path to HTML file\n        extra_info: extra information passed to this reader during extracting data\n\n    Returns:\n        list[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import html2text\n\n    file_path = Path(file_path).resolve()\n\n    with file_path.open(\"r\") as f:\n        html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n    # read HTML\n    all_text = html2text.html2text(html_text)\n    pages = (\n        all_text.split(self._page_break_pattern)\n        if self._page_break_pattern\n        else [all_text]\n    )\n\n    extra_info = extra_info or {}\n\n    # create Document from non-table text\n    documents = [\n        Document(\n            text=page.strip(),\n            metadata={\"page_label\": page_id + 1, **extra_info},\n        )\n        for page_id, page in enumerate(pages)\n    ]\n\n    return documents\n
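    A short sketch of page splitting; the form-feed pattern and file name are illustrative:

    from kotaemon.loaders import HtmlReader\n\n# split the converted text on form feeds (an assumed page marker)\nreader = HtmlReader(page_break_pattern=\"\\f\")\ndocs = reader.load_data(\"page.html\")\nprint(len(docs), docs[0].metadata[\"page_label\"])\n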
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader","title":"MhtmlReader","text":"

    Bases: BaseReader

    Parse MHTML files with BeautifulSoup.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class MhtmlReader(BaseReader):\n    \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n    def __init__(\n        self,\n        cache_dir: Optional[str] = getattr(\n            flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n        ),\n        open_encoding: Optional[str] = None,\n        bs_kwargs: Optional[dict] = None,\n        get_text_separator: str = \"\",\n    ) -> None:\n        \"\"\"Initialize with the markdown cache path, and optionally, the file\n        encoding to use and any kwargs to pass to the BeautifulSoup object.\n\n        Args:\n            cache_dir: Path for the markdown output.\n            open_encoding: The encoding to use when opening the file.\n            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n            get_text_separator: The separator to use when getting the text\n                from the soup.\n        \"\"\"\n        try:\n            import bs4  # noqa:F401\n        except ImportError:\n            raise ImportError(\n                \"beautifulsoup4 package not found, please install it with \"\n                \"`pip install beautifulsoup4`\"\n            )\n\n        self.cache_dir = cache_dir\n        self.open_encoding = open_encoding\n        if bs_kwargs is None:\n            bs_kwargs = {\"features\": \"lxml\"}\n        self.bs_kwargs = bs_kwargs\n        self.get_text_separator = get_text_separator\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load MHTML document into document objects.\"\"\"\n\n        from bs4 import BeautifulSoup\n\n        extra_info = extra_info or {}\n        metadata: dict = extra_info\n        page = []\n        file_name = Path(file_path)\n        with open(file_path, \"r\", encoding=self.open_encoding) as f:\n            message = email.message_from_string(f.read())\n            parts = message.get_payload()\n\n            if not isinstance(parts, list):\n                parts = [message]\n\n            for part in parts:\n                if part.get_content_type() == \"text/html\":\n                    html = part.get_payload(decode=True).decode()\n\n                    soup = BeautifulSoup(html, **self.bs_kwargs)\n                    text = soup.get_text(self.get_text_separator)\n\n                    if soup.title:\n                        title = str(soup.title.string)\n                    else:\n                        title = \"\"\n\n                    metadata = {\n                        \"source\": str(file_path),\n                        \"title\": title,\n                        **extra_info,\n                    }\n                    lines = [line for line in text.split(\"\\n\") if line.strip()]\n                    text = \"\\n\\n\".join(lines)\n                    if text:\n                        page.append(text)\n        # save the first page in markdown format (guarded: `page` may be empty)\n        if self.cache_dir is not None and page:\n            with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n                f.write(page[0])\n\n        return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load MHTML document into document objects.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load MHTML document into document objects.\"\"\"\n\n    from bs4 import BeautifulSoup\n\n    extra_info = extra_info or {}\n    metadata: dict = extra_info\n    page = []\n    file_name = Path(file_path)\n    with open(file_path, \"r\", encoding=self.open_encoding) as f:\n        message = email.message_from_string(f.read())\n        parts = message.get_payload()\n\n        if not isinstance(parts, list):\n            parts = [message]\n\n        for part in parts:\n            if part.get_content_type() == \"text/html\":\n                html = part.get_payload(decode=True).decode()\n\n                soup = BeautifulSoup(html, **self.bs_kwargs)\n                text = soup.get_text(self.get_text_separator)\n\n                if soup.title:\n                    title = str(soup.title.string)\n                else:\n                    title = \"\"\n\n                metadata = {\n                    \"source\": str(file_path),\n                    \"title\": title,\n                    **extra_info,\n                }\n                lines = [line for line in text.split(\"\\n\") if line.strip()]\n                text = \"\\n\\n\".join(lines)\n                if text:\n                    page.append(text)\n    # save the first page in markdown format (guarded: `page` may be empty)\n    if self.cache_dir is not None and page:\n        with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n            f.write(page[0])\n\n    return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
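    A usage sketch, assuming a local MHTML snapshot and a writable cache directory (both paths are placeholders):

    from kotaemon.loaders import MhtmlReader\n\n# when cache_dir is set, a markdown copy of the page is written there\nreader = MhtmlReader(cache_dir=\"/tmp/mhtml_cache\")\ndocs = reader.load_data(\"snapshot.mhtml\")\nprint(docs[0].metadata[\"title\"])\n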
    "},{"location":"reference/loaders/mathpix_loader/","title":"Mathpix Loader","text":""},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader","title":"MathpixPDFReader","text":"

    Bases: BaseReader

    Load PDF files using Mathpix service.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    class MathpixPDFReader(BaseReader):\n    \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n    def __init__(\n        self,\n        processed_file_format: str = \"md\",\n        max_wait_time_seconds: int = 500,\n        should_clean_pdf: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize with a file path.\n\n        Args:\n            processed_file_format: a format of the processed file. Default is \"md\".\n            max_wait_time_seconds: a maximum time to wait for the response from\n                the server. Default is 500.\n            should_clean_pdf: a flag to clean the PDF file. Default is True.\n            **kwargs: additional keyword arguments.\n        \"\"\"\n        self.mathpix_api_key = get_from_dict_or_env(\n            kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n        )\n        self.mathpix_api_id = get_from_dict_or_env(\n            kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n        )\n        self.processed_file_format = processed_file_format\n        self.max_wait_time_seconds = max_wait_time_seconds\n        self.should_clean_pdf = should_clean_pdf\n        super().__init__()\n\n    @property\n    def _mathpix_headers(self) -> Dict[str, str]:\n        return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n    @property\n    def url(self) -> str:\n        return \"https://api.mathpix.com/v3/pdf\"\n\n    @property\n    def data(self) -> dict:\n        options = {\n            \"conversion_formats\": {self.processed_file_format: True},\n            \"enable_tables_fallback\": True,\n        }\n        return {\"options_json\": json.dumps(options)}\n\n    def send_pdf(self, file_path) -> str:\n        with open(file_path, \"rb\") as f:\n            files = {\"file\": f}\n            response = requests.post(\n                self.url, headers=self._mathpix_headers, files=files, data=self.data\n            )\n        response_data = response.json()\n        if \"pdf_id\" in response_data:\n            pdf_id = response_data[\"pdf_id\"]\n            return pdf_id\n        else:\n            raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n    def wait_for_processing(self, pdf_id: str) -> None:\n        \"\"\"Wait for processing to complete.\n\n        Args:\n            pdf_id: a PDF id.\n\n        Returns: None\n        \"\"\"\n        url = self.url + \"/\" + pdf_id\n        for _ in range(0, self.max_wait_time_seconds, 5):\n            response = requests.get(url, headers=self._mathpix_headers)\n            response_data = response.json()\n            status = response_data.get(\"status\", None)\n\n            if status == \"completed\":\n                return\n            elif status == \"error\":\n                raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n            else:\n                print(response_data)\n                print(url)\n                time.sleep(5)\n        raise TimeoutError\n\n    def get_processed_pdf(self, pdf_id: str) -> str:\n        self.wait_for_processing(pdf_id)\n        url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n        response = requests.get(url, headers=self._mathpix_headers)\n        return response.content.decode(\"utf-8\")\n\n    def clean_pdf(self, contents: str) -> str:\n        \"\"\"Clean the PDF file.\n\n        Args:\n            contents: a PDF file contents.\n\n        Returns:\n            the cleaned contents.\n        \"\"\"\n        contents = \"\\n\".join(\n            [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n        )\n        # replace \\section{Title} with # Title\n        contents = contents.replace(\"\\\\section{\", \"# \")\n        # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n        # http:// or https:// followed by anything but a closing paren\n        url_regex = \"http[s]?://[^)]+\"\n        markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n        contents = (\n            contents.replace(r\"\\$\", \"$\")\n            .replace(r\"\\%\", \"%\")\n            .replace(r\"\\(\", \"(\")\n            .replace(r\"\\)\", \")\")\n            .replace(\"$\\\\begin{array}\", \"\")\n            .replace(\"\\\\end{array}$\", \"\")\n            .replace(\"\\\\\\\\\", \"\")\n            .replace(\"\\\\text\", \"\")\n            .replace(\"}\", \"\")\n            .replace(\"{\", \"\")\n            .replace(\"\\\\mathrm\", \"\")\n        )\n        contents = re.sub(markup_regex, \"\", contents)\n        return contents\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            content = kwargs[\"response_content\"]\n        else:\n            # call original API\n            pdf_id = self.send_pdf(file_path)\n            content = self.get_processed_pdf(pdf_id)\n\n        if self.should_clean_pdf:\n            content = self.clean_pdf(content)\n        tables, texts = parse_markdown_text_to_tables(content)\n        documents = []\n        for table in tables:\n            text = strip_special_chars_markdown(table)\n            metadata = {\n                \"table_origin\": table,\n                \"type\": \"table\",\n            }\n            if extra_info:\n                metadata.update(extra_info)\n            documents.append(\n                Document(\n                    text=text,\n                    metadata=metadata,\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        for text in texts:\n            metadata = {\"source\": file_path.name, \"type\": \"text\"}\n            documents.append(Document(text=text, metadata=metadata))\n\n        return documents\n
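    A usage sketch; the credentials are placeholders (they can also come from the `MATHPIX_API_ID` / `MATHPIX_API_KEY` environment variables):

    from pathlib import Path\n\nfrom kotaemon.loaders import MathpixPDFReader\n\nreader = MathpixPDFReader(\n    mathpix_api_id=\"app-id\",  # placeholder credentials\n    mathpix_api_key=\"app-key\",\n)\ndocs = reader.load_data(Path(\"paper.pdf\"))\ntables = [d for d in docs if d.metadata[\"type\"] == \"table\"]\n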
    "},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"
    wait_for_processing(pdf_id)\n

    Wait for processing to complete.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `pdf_id` | `str` | A PDF id. | *required* |

    Returns: None

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def wait_for_processing(self, pdf_id: str) -> None:\n    \"\"\"Wait for processing to complete.\n\n    Args:\n        pdf_id: a PDF id.\n\n    Returns: None\n    \"\"\"\n    url = self.url + \"/\" + pdf_id\n    for _ in range(0, self.max_wait_time_seconds, 5):\n        response = requests.get(url, headers=self._mathpix_headers)\n        response_data = response.json()\n        status = response_data.get(\"status\", None)\n\n        if status == \"completed\":\n            return\n        elif status == \"error\":\n            raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n        else:\n            print(response_data)\n            print(url)\n            time.sleep(5)\n    raise TimeoutError\n
    "},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"
    clean_pdf(contents)\n

    Clean the PDF file.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `contents` | `str` | The PDF file contents. | *required* |

    Returns: the cleaned contents, with image placeholders and Mathpix escape markup removed.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def clean_pdf(self, contents: str) -> str:\n    \"\"\"Clean the PDF file.\n\n    Args:\n        contents: a PDF file contents.\n\n    Returns:\n        the cleaned contents.\n    \"\"\"\n    contents = \"\\n\".join(\n        [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n    )\n    # replace \\section{Title} with # Title\n    contents = contents.replace(\"\\\\section{\", \"# \")\n    # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n    # http:// or https:// followed by anything but a closing paren\n    url_regex = \"http[s]?://[^)]+\"\n    markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n    contents = (\n        contents.replace(r\"\\$\", \"$\")\n        .replace(r\"\\%\", \"%\")\n        .replace(r\"\\(\", \"(\")\n        .replace(r\"\\)\", \")\")\n        .replace(\"$\\\\begin{array}\", \"\")\n        .replace(\"\\\\end{array}$\", \"\")\n        .replace(\"\\\\\\\\\", \"\")\n        .replace(\"\\\\text\", \"\")\n        .replace(\"}\", \"\")\n        .replace(\"{\", \"\")\n        .replace(\"\\\\mathrm\", \"\")\n    )\n    contents = re.sub(markup_regex, \"\", contents)\n    return contents\n
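    A small illustration of the cleaning step (the input line is made up):

    reader = MathpixPDFReader()\nraw = \"\\\\section{Introduction\"\nprint(reader.clean_pdf(raw))  # -> \"# Introduction\"\n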
    "},{"location":"reference/loaders/ocr_loader/","title":"Ocr Loader","text":""},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader","title":"OCRReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with high focus on table extraction

    Example
    >> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `endpoint` | `Optional[str]` | URL of the FullOCR endpoint. If not provided, the reader looks for the environment variable `OCR_READER_ENDPOINT` or uses the default `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT` (http://127.0.0.1:8000/v2/ai/infer/). | `None` |
    | `use_ocr` | `bool` | Whether to use OCR to read text (e.g. from images and tables) in the PDF. If `False`, only tables and the text within table cells are extracted. | `True` |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class OCRReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n        self.use_ocr = use_ocr\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        debug_path = kwargs.pop(\"debug_path\", None)\n        artifact_path = kwargs.pop(\"artifact_path\", None)\n\n        # read PDF through normal reader (unstructured)\n        pdf_page_items = read_pdf_unstructured(file_path)\n        # merge PDF text output with OCR output\n        tables, texts = parse_ocr_output(\n            ocr_results,\n            pdf_page_items,\n            debug_path=debug_path,\n            artifact_path=artifact_path,\n        )\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=strip_special_chars_markdown(table_text),\n                metadata={\n                    \"table_origin\": table_text,\n                    \"type\": \"table\",\n                    \"page_label\": page_id + 1,\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for page_id, table_text in tables\n        ]\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text,\n                    metadata={\"page_label\": page_id + 1, **extra_info},\n                )\n                for page_id, non_table_text in texts\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `file_path` | `Path` | Path to the PDF file. | *required* |
    | `debug_path` | `Path` | Path to store debug image output (passed via `**kwargs`). | `None` |
    | `artifact_path` | `Path` | Path to the OCR endpoint's artifacts directory (passed via `**kwargs`). | `None` |

    Returns:

    | Type | Description |
    | --- | --- |
    | `List[Document]` | List of documents extracted from the PDF file. |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    debug_path = kwargs.pop(\"debug_path\", None)\n    artifact_path = kwargs.pop(\"artifact_path\", None)\n\n    # read PDF through normal reader (unstructured)\n    pdf_page_items = read_pdf_unstructured(file_path)\n    # merge PDF text output with OCR output\n    tables, texts = parse_ocr_output(\n        ocr_results,\n        pdf_page_items,\n        debug_path=debug_path,\n        artifact_path=artifact_path,\n    )\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=strip_special_chars_markdown(table_text),\n            metadata={\n                \"table_origin\": table_text,\n                \"type\": \"table\",\n                \"page_label\": page_id + 1,\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for page_id, table_text in tables\n    ]\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text,\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, non_table_text in texts\n        ]\n    )\n\n    return documents\n
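    A usage sketch; the endpoint shown is the assumed default FullOCR service and the file name is a placeholder:

    import os\nfrom pathlib import Path\n\nfrom kotaemon.loaders import OCRReader\n\n# point the reader at a FullOCR service (illustrative URL)\nos.environ[\"OCR_READER_ENDPOINT\"] = \"http://127.0.0.1:8000/v2/ai/infer/\"\nreader = OCRReader(use_ocr=True)\ndocs = reader.load_data(Path(\"scanned.pdf\"))\ntables = [d for d in docs if d.metadata.get(\"type\") == \"table\"]\n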
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader","title":"ImageReader","text":"

    Bases: BaseReader

    Read images using the FullOCR endpoint, with a high focus on table extraction

    Example
    >> from kotaemon.loaders import ImageReader\n>> reader = ImageReader()\n>> documents = reader.load_data(\"path/to/image\")\n

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `endpoint` | `Optional[str]` | URL of the FullOCR endpoint. If not provided, the reader looks for the environment variable `OCR_READER_ENDPOINT` or uses the default `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT` (http://127.0.0.1:8000/v2/ai/infer/). | `None` |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class ImageReader(BaseReader):\n    \"\"\"Read images using the FullOCR endpoint, with a high focus on table extraction\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import ImageReader\n        >> reader = ImageReader()\n        >> documents = reader.load_data(\"path/to/image\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to the image file\n\n        Returns:\n            List[Document]: list of documents extracted from the image file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=False\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        extra_info = extra_info or {}\n        result = []\n        for ocr_result in ocr_results:\n            result.append(\n                Document(\n                    content=ocr_result[\"csv_string\"],\n                    metadata=extra_info,\n                )\n            )\n\n        return result\n
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `file_path` | `Path` | Path to the image file. | *required* |

    Returns:

    | Type | Description |
    | --- | --- |
    | `List[Document]` | List of documents extracted from the image file (one per OCR result, carrying its CSV string). |

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to the image file\n\n    Returns:\n        List[Document]: list of documents extracted from the image file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=False\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    extra_info = extra_info or {}\n    result = []\n    for ocr_result in ocr_results:\n        result.append(\n            Document(\n                content=ocr_result[\"csv_string\"],\n                metadata=extra_info,\n            )\n        )\n\n    return result\n
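    A usage sketch for images; the file name is illustrative, and the recognized CSV is assumed to be exposed through the Document's text:

    from pathlib import Path\n\nfrom kotaemon.loaders import ImageReader\n\nreader = ImageReader()  # falls back to OCR_READER_ENDPOINT or the default endpoint\ndocs = reader.load_data(Path(\"table_photo.png\"))\nfor doc in docs:\n    print(doc.text)  # CSV recognized from the image (assuming `content` maps to text)\n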
    "},{"location":"reference/loaders/pdf_loader/","title":"Pdf Loader","text":""},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader","title":"PDFThumbnailReader","text":"

    Bases: PDFReader

    PDF parser with thumbnail for each page.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    class PDFThumbnailReader(PDFReader):\n    \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"\n        Initialize PDFReader.\n        \"\"\"\n        super().__init__(return_full_document=False)\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        fs: Optional[AbstractFileSystem] = None,\n    ) -> List[Document]:\n        \"\"\"Parse file.\"\"\"\n        documents = super().load_data(file, extra_info, fs)\n\n        page_numbers_str = []\n        filtered_docs = []\n        is_int_page_number: dict[str, bool] = {}\n\n        for doc in documents:\n            if \"page_label\" in doc.metadata:\n                page_num_str = doc.metadata[\"page_label\"]\n                page_numbers_str.append(page_num_str)\n                try:\n                    _ = int(page_num_str)\n                    is_int_page_number[page_num_str] = True\n                    filtered_docs.append(doc)\n                except ValueError:\n                    is_int_page_number[page_num_str] = False\n                    continue\n\n        documents = filtered_docs\n        page_numbers = list(range(len(page_numbers_str)))\n\n        print(\"Page numbers:\", len(page_numbers))\n        page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n        documents.extend(\n            [\n                Document(\n                    text=\"Page thumbnail\",\n                    metadata={\n                        \"image_origin\": page_thumbnail,\n                        \"type\": \"thumbnail\",\n                        \"page_label\": page_number,\n                        **(extra_info if extra_info is not None else {}),\n                    },\n                )\n                for (page_thumbnail, page_number) in zip(\n                    page_thumbnails, page_numbers_str\n                )\n                if is_int_page_number[page_number]\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, fs=None)\n

    Parse file.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n    \"\"\"Parse file.\"\"\"\n    documents = super().load_data(file, extra_info, fs)\n\n    page_numbers_str = []\n    filtered_docs = []\n    is_int_page_number: dict[str, bool] = {}\n\n    for doc in documents:\n        if \"page_label\" in doc.metadata:\n            page_num_str = doc.metadata[\"page_label\"]\n            page_numbers_str.append(page_num_str)\n            try:\n                _ = int(page_num_str)\n                is_int_page_number[page_num_str] = True\n                filtered_docs.append(doc)\n            except ValueError:\n                is_int_page_number[page_num_str] = False\n                continue\n\n    documents = filtered_docs\n    page_numbers = list(range(len(page_numbers_str)))\n\n    print(\"Page numbers:\", len(page_numbers))\n    page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n    documents.extend(\n        [\n            Document(\n                text=\"Page thumbnail\",\n                metadata={\n                    \"image_origin\": page_thumbnail,\n                    \"type\": \"thumbnail\",\n                    \"page_label\": page_number,\n                    **(extra_info if extra_info is not None else {}),\n                },\n            )\n            for (page_thumbnail, page_number) in zip(\n                page_thumbnails, page_numbers_str\n            )\n            if is_int_page_number[page_number]\n        ]\n    )\n\n    return documents\n
    "},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.get_page_thumbnails","title":"get_page_thumbnails","text":"
    get_page_thumbnails(file_path, pages, dpi=80)\n

    Get image thumbnails of the pages in the PDF file.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `file_path` | `Path` | Path to the PDF file. | *required* |
    | `pages` | `list[int]` | List of page numbers to extract. | *required* |
    | `dpi` | `int` | Rendering resolution for each page. | `80` |

    Returns:

    | Type | Description |
    | --- | --- |
    | `List[str]` | List of base64-encoded page thumbnails. |

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    def get_page_thumbnails(\n    file_path: Path, pages: list[int], dpi: int = 80\n) -> List[str]:\n    \"\"\"Get image thumbnails of the pages in the PDF file.\n\n    Args:\n        file_path (Path): path to the PDF file\n        pages (list[int]): list of page numbers to extract\n        dpi (int): rendering resolution for each page\n\n    Returns:\n        list[str]: list of base64-encoded page thumbnails\n    \"\"\"\n\n    img: Image.Image\n    suffix = file_path.suffix.lower()\n    assert suffix == \".pdf\", \"This function only supports PDF files.\"\n    try:\n        import fitz\n    except ImportError:\n        raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n    doc = fitz.open(file_path)\n\n    output_imgs = []\n    for page_number in pages:\n        page = doc.load_page(page_number)\n        pm = page.get_pixmap(dpi=dpi)\n        img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n        # the rendered page is converted to a base64 string before returning\n        output_imgs.append(convert_image_to_base64(img))\n\n    return output_imgs\n
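    Calling the helper directly (the path is a placeholder):

    from pathlib import Path\n\nfrom kotaemon.loaders.pdf_loader import get_page_thumbnails\n\n# render the first two pages at a lower resolution\nthumbnails = get_page_thumbnails(Path(\"report.pdf\"), pages=[0, 1], dpi=60)\nprint(len(thumbnails))  # base64-encoded images, one per requested page\n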
    "},{"location":"reference/loaders/txt_loader/","title":"Txt Loader","text":""},{"location":"reference/loaders/unstructured_loader/","title":"Unstructured Loader","text":"

    Unstructured file reader.

    A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.

    To use the .doc and .xls parsers, install:

    sudo apt-get install -y libmagic-dev poppler-utils libreoffice\npip install xlrd\n

    "},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader","title":"UnstructuredReader","text":"

    Bases: BaseReader

    General unstructured text reader for a variety of files.

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
    class UnstructuredReader(BaseReader):\n    \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n    def __init__(self, *args: Any, **kwargs: Any) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args)  # not passing kwargs to parent because it cannot accept them\n\n        self.api = False  # we default to local\n        if \"url\" in kwargs:\n            self.server_url = str(kwargs[\"url\"])\n            self.api = True  # if url was set, switch to api\n        else:\n            self.server_url = \"http://localhost:8000\"\n\n        if \"api\" in kwargs:\n            self.api = kwargs[\"api\"]\n\n        self.api_key = \"\"\n        if \"api_key\" in kwargs:\n            self.api_key = kwargs[\"api_key\"]\n\n    \"\"\" Loads data using Unstructured.io\n\n        Depending on the construction, if url is set or api = True,\n        it'll parse the file using an API call, else parse it locally.\n        additional_metadata is extended by the returned metadata if\n        split_documents is True.\n\n        Returns list of documents\n    \"\"\"\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        split_documents: Optional[bool] = False,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"If api is set, parse through api\"\"\"\n        file_path_str = str(file)\n        if self.api:\n            from unstructured.partition.api import partition_via_api\n\n            elements = partition_via_api(\n                filename=file_path_str,\n                api_key=self.api_key,\n                api_url=self.server_url + \"/general/v0/general\",\n            )\n        else:\n            \"\"\"Parse file locally\"\"\"\n            from unstructured.partition.auto import partition\n\n            elements = partition(filename=file_path_str)\n\n        \"\"\" Process elements \"\"\"\n        docs = []\n        file_name = Path(file).name\n        file_path = str(Path(file).resolve())\n        if split_documents:\n            for node in elements:\n                metadata = {\"file_name\": file_name, \"file_path\": file_path}\n                if hasattr(node, \"metadata\"):\n                    \"\"\"Load metadata fields\"\"\"\n                    for field, val in vars(node.metadata).items():\n                        if field == \"_known_field_names\":\n                            continue\n                        # removing coordinates because they do not serialize\n                        # and we don't want to bother with them\n                        if field == \"coordinates\":\n                            continue\n                        # removing because it might cause interference\n                        if field == \"parent_id\":\n                            continue\n                        metadata[field] = val\n\n                if extra_info is not None:\n                    metadata.update(extra_info)\n\n                metadata[\"file_name\"] = file_name\n                docs.append(Document(text=node.text, metadata=metadata))\n\n        else:\n            text_chunks = [\" \".join(str(el).split()) for el in elements]\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            # Create a single document by joining all the texts\n            docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n        return docs\n
    "},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader.load_data","title":"load_data","text":"
    load_data(\n    file, extra_info=None, split_documents=False, **kwargs\n)\n

    If api is set, parse through api

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    split_documents: Optional[bool] = False,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"If api is set, parse through api\"\"\"\n    file_path_str = str(file)\n    if self.api:\n        from unstructured.partition.api import partition_via_api\n\n        elements = partition_via_api(\n            filename=file_path_str,\n            api_key=self.api_key,\n            api_url=self.server_url + \"/general/v0/general\",\n        )\n    else:\n        \"\"\"Parse file locally\"\"\"\n        from unstructured.partition.auto import partition\n\n        elements = partition(filename=file_path_str)\n\n    \"\"\" Process elements \"\"\"\n    docs = []\n    file_name = Path(file).name\n    file_path = str(Path(file).resolve())\n    if split_documents:\n        for node in elements:\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n            if hasattr(node, \"metadata\"):\n                \"\"\"Load metadata fields\"\"\"\n                for field, val in vars(node.metadata).items():\n                    if field == \"_known_field_names\":\n                        continue\n                    # removing coordinates because they do not serialize\n                    # and we don't want to bother with them\n                    if field == \"coordinates\":\n                        continue\n                    # removing because it might cause interference\n                    if field == \"parent_id\":\n                        continue\n                    metadata[field] = val\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            metadata[\"file_name\"] = file_name\n            docs.append(Document(text=node.text, metadata=metadata))\n\n    else:\n        text_chunks = [\" \".join(str(el).split()) for el in elements]\n        metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n        if extra_info is not None:\n            metadata.update(extra_info)\n\n        # Create a single document by joining all the texts\n        docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n    return docs\n
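    A usage sketch; `notes.docx` is a placeholder, and passing `url=...` or `api=True` would parse through the hosted API instead of locally:

    from pathlib import Path\n\nfrom kotaemon.loaders import UnstructuredReader\n\nreader = UnstructuredReader()  # local parsing by default\ndocs = reader.load_data(Path(\"notes.docx\"), split_documents=True)\nprint(docs[0].metadata[\"file_name\"])\n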
    "},{"location":"reference/loaders/utils/","title":"Utils","text":""},{"location":"reference/loaders/utils/adobe/","title":"Adobe","text":""},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.request_adobe_service","title":"request_adobe_service","text":"
    request_adobe_service(file_path, output_path='')\n

    Main function to call the Adobe service and unzip the results.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `file_path` | `str` | Path to the PDF file. | *required* |
    | `output_path` | `str` | Path to store the results. | `''` |

    Returns:

    | Name | Type | Description |
    | --- | --- | --- |
    | `output_path` | `str` | Path to the results. |

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def request_adobe_service(file_path: str, output_path: str = \"\") -> str:\n    \"\"\"Main function to call the Adobe service, and unzip the results.\n    Args:\n        file_path (str): path to the pdf file\n        output_path (str): path to store the results\n\n    Returns:\n        output_path (str): path to the results\n\n    \"\"\"\n    try:\n        from adobe.pdfservices.operation.auth.credentials import Credentials\n        from adobe.pdfservices.operation.exception.exceptions import (\n            SdkException,\n            ServiceApiException,\n            ServiceUsageException,\n        )\n        from adobe.pdfservices.operation.execution_context import ExecutionContext\n        from adobe.pdfservices.operation.io.file_ref import FileRef\n        from adobe.pdfservices.operation.pdfops.extract_pdf_operation import (\n            ExtractPDFOperation,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import (  # noqa: E501\n            ExtractElementType,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import (  # noqa: E501\n            ExtractPDFOptions,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import (  # noqa: E501\n            ExtractRenditionsElementType,\n        )\n    except ImportError:\n        raise ImportError(\n            \"pdfservices-sdk is not installed. \"\n            \"Please install it by running `pip install pdfservices-sdk\"\n            \"@git+https://github.com/niallcm/pdfservices-python-sdk.git\"\n            \"@bump-and-unfreeze-requirements`\"\n        )\n\n    if not output_path:\n        output_path = tempfile.mkdtemp()\n\n    try:\n        # Initial setup, create credentials instance.\n        credentials = (\n            Credentials.service_principal_credentials_builder()\n            .with_client_id(config(\"PDF_SERVICES_CLIENT_ID\", default=\"\"))\n            .with_client_secret(config(\"PDF_SERVICES_CLIENT_SECRET\", default=\"\"))\n            .build()\n        )\n\n        # Create an ExecutionContext using credentials\n        # and create a new operation instance.\n        execution_context = ExecutionContext.create(credentials)\n        extract_pdf_operation = ExtractPDFOperation.create_new()\n\n        # Set operation input from a source file.\n        source = FileRef.create_from_local_file(file_path)\n        extract_pdf_operation.set_input(source)\n\n        # Build ExtractPDF options and set them into the operation\n        extract_pdf_options: ExtractPDFOptions = (\n            ExtractPDFOptions.builder()\n            .with_elements_to_extract(\n                [ExtractElementType.TEXT, ExtractElementType.TABLES]\n            )\n            .with_elements_to_extract_renditions(\n                [\n                    ExtractRenditionsElementType.TABLES,\n                    ExtractRenditionsElementType.FIGURES,\n                ]\n            )\n            .build()\n        )\n        extract_pdf_operation.set_options(extract_pdf_options)\n\n        # Execute the operation.\n        result: FileRef = extract_pdf_operation.execute(execution_context)\n\n        # Save the result to the specified location.\n        zip_file_path = os.path.join(\n            output_path, \"ExtractTextTableWithFigureTableRendition.zip\"\n        )\n        result.save_as(zip_file_path)\n        # Open the ZIP file\n        with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n            # Extract all contents to the destination folder\n            zip_ref.extractall(output_path)\n    except (ServiceApiException, ServiceUsageException, SdkException):\n        logging.exception(\"Exception encountered while executing operation\")\n\n    return output_path\n
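    A call sketch; credentials are read with `config(\"PDF_SERVICES_CLIENT_ID\")` / `config(\"PDF_SERVICES_CLIENT_SECRET\")` (e.g. from a .env file), and the file name is a placeholder:

    from kotaemon.loaders.utils.adobe import request_adobe_service\n\noutput_dir = request_adobe_service(\"contract.pdf\")\n# output_dir now holds the unzipped extraction results (extracted JSON plus\n# table/figure renditions) in a temporary folder\nprint(output_dir)\n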
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.make_markdown_table","title":"make_markdown_table","text":"
    make_markdown_table(table_as_list)\n

    Convert a table from Python list representation to markdown format. The input list consists of the rows of the table; the first row is the header.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `table_as_list` | `List[list]` | List of table rows; the first row is the header. Example: `[["Name", "Age", "Height"], ["Jake", 20, "5'10"], ["Mary", 21, "5'7"]]` | *required* |

    Returns: markdown representation of the table

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def make_markdown_table(table_as_list: List[list]) -> str:\n    \"\"\"\n    Convert table from python list representation to markdown format.\n    The input list consists of rows of the table, the first row is the header.\n\n    Args:\n        table_as_list: list of table rows\n            Example: [[\"Name\", \"Age\", \"Height\"],\n                    [\"Jake\", 20, 5'10],\n                    [\"Mary\", 21, 5'7]]\n    Returns:\n        markdown representation of the table\n    \"\"\"\n    # header row\n    markdown = \"\\n| \"\n    for e in table_as_list[0]:\n        markdown += \" \" + str(e) + \" |\"\n    markdown += \"\\n\"\n\n    # separator row\n    markdown += \"| \"\n    for _ in range(len(table_as_list[0])):\n        markdown += \"--- | \"\n    markdown += \"\\n\"\n\n    # data rows\n    for entry in table_as_list[1:]:\n        markdown += \"| \"\n        for e in entry:\n            markdown += str(e) + \" | \"\n        markdown += \"\\n\"\n\n    return markdown + \"\\n\"\n
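    A tiny worked example:

    rows = [[\"Name\", \"Age\"], [\"Jake\", 20], [\"Mary\", 21]]\nmarkdown = make_markdown_table(rows)\n# -> a markdown table: header row, a \"| --- |\" separator row,\n#    then one row per remaining entry\n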
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_json","title":"load_json","text":"
    load_json(input_path)\n

    Load json file

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def load_json(input_path: Union[str, Path]) -> dict:\n    \"\"\"Load json file\"\"\"\n    with open(input_path, \"r\") as fi:\n        data = json.load(fi)\n\n    return data\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_excel","title":"load_excel","text":"
    load_excel(input_path)\n

    Load excel file and convert to markdown

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def load_excel(input_path: Union[str, Path]) -> str:\n    \"\"\"Load excel file and convert to markdown\"\"\"\n\n    df = pd.read_excel(input_path).fillna(\"\")\n    # Convert dataframe to a list of rows\n    row_list = [df.columns.values.tolist()] + df.values.tolist()\n\n    for item_id, item in enumerate(row_list[0]):\n        if \"Unnamed\" in item:\n            row_list[0][item_id] = \"\"\n\n    for row in row_list:\n        for item_id, item in enumerate(row):\n            row[item_id] = str(item).replace(\"_x000D_\", \" \").replace(\"\\n\", \" \").strip()\n\n    markdown_str = make_markdown_table(row_list)\n    return markdown_str\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.encode_image_base64","title":"encode_image_base64","text":"
    encode_image_base64(image_path)\n

    Convert image to base64

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def encode_image_base64(image_path: Union[str, Path]) -> str:\n    \"\"\"Convert image to base64\"\"\"\n\n    with open(image_path, \"rb\") as image_file:\n        return base64.b64encode(image_file.read()).decode(\"utf-8\")\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_table_paths","title":"parse_table_paths","text":"
    parse_table_paths(file_paths)\n

    Read the table stored in an excel file given the file path

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def parse_table_paths(file_paths: List[Path]) -> str:\n    \"\"\"Read the table stored in an excel file given the file path\"\"\"\n\n    content = \"\"\n    for path in file_paths:\n        if path.suffix == \".xlsx\":\n            content = load_excel(path)\n            break\n    return content\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_figure_paths","title":"parse_figure_paths","text":"
    parse_figure_paths(file_paths)\n

    Read and convert an image to base64 given the image path

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def parse_figure_paths(file_paths: List[Path]) -> str:\n    \"\"\"Read and convert an image to base64 given the image path\"\"\"\n\n    content = \"\"\n    for path in file_paths:\n        if path.suffix == \".png\":\n            base64_image = encode_image_base64(path)\n            content = f\"data:image/png;base64,{base64_image}\"\n            break\n    return content\n
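    A sketch of how the two path parsers pick renditions out of a mixed list (paths are placeholders):

    from pathlib import Path\n\npaths = [Path(\"out/tables/t1.xlsx\"), Path(\"out/figures/f1.png\")]\ntable_md = parse_table_paths(paths)  # markdown from the first .xlsx\nfigure_b64 = parse_figure_paths(paths)  # \"data:image/png;base64,...\" from the first .png\n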
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_single_figure_caption","title":"generate_single_figure_caption","text":"
    generate_single_figure_caption(vlm_endpoint, figure)\n

    Summarize a single figure using GPT-4V

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:\n    \"\"\"Summarize a single figure using GPT-4V\"\"\"\n    if figure:\n        output = generate_gpt4v(\n            endpoint=vlm_endpoint,\n            prompt=\"Provide a short 2 sentence summary of this image?\",\n            images=figure,\n        )\n        if \"sorry\" in output.lower():\n            output = \"\"\n    else:\n        output = \"\"\n    return output\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_figure_captions","title":"generate_figure_captions","text":"
    generate_figure_captions(\n    vlm_endpoint, figures, max_figures_to_process\n)\n

    Summarize several figures using GPT-4V.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | `vlm_endpoint` | `str` | Endpoint of the vision language model service. | *required* |
    | `figures` | `List` | List of base64 images. | *required* |
    | `max_figures_to_process` | `int` | The maximum number of figures to summarize; the rest are ignored. | *required* |

    Returns:

    | Name | Type | Description |
    | --- | --- | --- |
    | `results` | `List[str]` | List of figure captions, with empty strings for the ignored figures. |

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def generate_figure_captions(\n    vlm_endpoint: str, figures: List, max_figures_to_process: int\n) -> List:\n    \"\"\"Summarize several figures using GPT-4V.\n    Args:\n        vlm_endpoint (str): endpoint to the vision language model service\n        figures (List): list of base64 images\n        max_figures_to_process (int): the maximum number of figures to be summarized,\n        the rest are ignored.\n\n    Returns:\n        results (List[str]): list of all figure captions and empty strings for\n        ignored figures.\n    \"\"\"\n    to_gen_figures = figures[:max_figures_to_process]\n    other_figures = figures[max_figures_to_process:]\n\n    with ThreadPoolExecutor() as executor:\n        # bind each figure as an argument; a bare `lambda:` here would late-bind\n        # the loop variable and could caption the same figure repeatedly\n        futures = [\n            executor.submit(generate_single_figure_caption, vlm_endpoint, figure)\n            for figure in to_gen_figures\n        ]\n\n    results = [future.result() for future in futures]\n    return results + [\"\"] * len(other_figures)\n
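    A call sketch (the endpoint and figure list are placeholders):

    figures = [\"data:image/png;base64,...\"] * 3  # placeholder base64 data URLs\ncaptions = generate_figure_captions(\n    vlm_endpoint=\"http://localhost:8000/vlm\",  # hypothetical VLM service\n    figures=figures,\n    max_figures_to_process=2,\n)\n# the first two figures are captioned concurrently; the third entry is \"\"\n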
    "},{"location":"reference/loaders/utils/box/","title":"Box","text":""},{"location":"reference/loaders/utils/box/#loaders.utils.box.bbox_to_points","title":"bbox_to_points","text":"
    bbox_to_points(box)\n

    Convert bounding box to list of points

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def bbox_to_points(box: List[int]):\n    \"\"\"Convert bounding box to list of points\"\"\"\n    x1, y1, x2, y2 = box\n    return [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.points_to_bbox","title":"points_to_bbox","text":"
    points_to_bbox(points)\n

    Convert list of points to bounding box

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def points_to_bbox(points: List[Tuple[int, int]]):\n    \"\"\"Convert list of points to bounding box\"\"\"\n    all_x = [p[0] for p in points]\n    all_y = [p[1] for p in points]\n    return [min(all_x), min(all_y), max(all_x), max(all_y)]\n
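    A quick round-trip sanity check (a minimal sketch; the import path follows the source location shown above):

    ```python
    from kotaemon.loaders.utils.box import bbox_to_points, points_to_bbox

    points = bbox_to_points([10, 20, 30, 40])
    # [(10, 20), (30, 20), (30, 40), (10, 40)]
    assert points_to_bbox(points) == [10, 20, 30, 40]
    ```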
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_points","title":"scale_points","text":"
    scale_points(points, scale_factor=1.0)\n

    Scale points by a scale factor

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def scale_points(points: List[Tuple[int, int]], scale_factor: float = 1.0):\n    \"\"\"Scale points by a scale factor\"\"\"\n    return [(int(pos[0] * scale_factor), int(pos[1] * scale_factor)) for pos in points]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.union_points","title":"union_points","text":"
    union_points(points)\n

    Return union bounding box of list of points

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def union_points(points: List[Tuple[int, int]]):\n    \"\"\"Return union bounding box of list of points\"\"\"\n    all_x = [p[0] for p in points]\n    all_y = [p[1] for p in points]\n    bbox = (min(all_x), min(all_y), max(all_x), max(all_y))\n    return bbox\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_box","title":"scale_box","text":"
    scale_box(box, scale_factor=1.0)\n

    Scale box by a scale factor

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def scale_box(box: List[int], scale_factor: float = 1.0):\n    \"\"\"Scale box by a scale factor\"\"\"\n    return [int(pos * scale_factor) for pos in box]\n
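    A small sketch covering the scaling and union helpers (values hand-checked against the sources above):

    ```python
    from kotaemon.loaders.utils.box import scale_box, scale_points, union_points

    assert scale_box([10, 20, 30, 40], scale_factor=0.5) == [5, 10, 15, 20]
    assert scale_points([(10, 20), (30, 40)], scale_factor=2.0) == [(20, 40), (60, 80)]
    assert union_points([(0, 0), (5, 2), (3, 8)]) == (0, 0, 5, 8)  # returns a tuple
    ```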
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_h","title":"box_h","text":"
    box_h(box)\n

    Return box height

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_h(box: List[int]):\n    \"Return box height\"\n    return box[3] - box[1]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_w","title":"box_w","text":"
    box_w(box)\n

    Return box width

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_w(box: List[int]):\n    \"Return box width\"\n    return box[2] - box[0]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_area","title":"box_area","text":"
    box_area(box)\n

    Return box area

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_area(box: List[int]):\n    \"Return box area\"\n    x1, y1, x2, y2 = box\n    return (x2 - x1) * (y2 - y1)\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.get_rect_iou","title":"get_rect_iou","text":"
    get_rect_iou(gt_box, pd_box, iou_type=0)\n

    Intersection over union on layout rectangle

    Parameters:

    - gt_box (List[tuple], required): a list containing the bounding box coordinates of the ground truth
    - pd_box (List[tuple], required): a list containing the bounding box coordinates of the prediction
    - iou_type (int, default 0): 0 for intersection / union (normal IOU); 1 for intersection / min(areas), useful when boxes are under/over-segmented

    Input format for each box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]

    Returns:

    - the intersection over union value (a float ratio, despite the int annotation in the source)

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:\n    \"\"\"Intersection over union on layout rectangle\n\n    Args:\n        gt_box: List[tuple]\n            A list contains bounding box coordinates of ground truth\n        pd_box: List[tuple]\n            A list contains bounding box coordinates of prediction\n        iou_type: int\n            0: intersection / union, normal IOU\n            1: intersection / min(areas), useful when boxes are under/over-segmented\n\n        Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n        Annotation for each element in bbox:\n        (x1, y1)        (x2, y1)\n            +-------+\n            |       |\n            |       |\n            +-------+\n        (x1, y2)        (x2, y2)\n\n    Returns:\n        Intersection over union value\n    \"\"\"\n\n    assert iou_type in [0, 1], \"Only support 0: origin iou, 1: intersection / min(area)\"\n\n    # determine the (x, y)-coordinates of the intersection rectangle\n    # gt_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n    # pd_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n    x_left = max(gt_box[0][0], pd_box[0][0])\n    y_top = max(gt_box[0][1], pd_box[0][1])\n    x_right = min(gt_box[2][0], pd_box[2][0])\n    y_bottom = min(gt_box[2][1], pd_box[2][1])\n\n    # compute the area of intersection rectangle\n    interArea = max(0, x_right - x_left) * max(0, y_bottom - y_top)\n\n    # compute the area of both the prediction and ground-truth\n    # rectangles\n    gt_area = (gt_box[2][0] - gt_box[0][0]) * (gt_box[2][1] - gt_box[0][1])\n    pd_area = (pd_box[2][0] - pd_box[0][0]) * (pd_box[2][1] - pd_box[0][1])\n\n    # compute the intersection over union by taking the intersection\n    # area and dividing it by the sum of prediction + ground-truth\n    # areas - the intersection area\n    if iou_type == 0:\n        iou = interArea / float(gt_area + pd_area - interArea)\n    elif iou_type == 1:\n        iou = interArea / max(min(gt_area, pd_area), 1)\n\n    # return the intersection over union value\n    return iou\n
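    A worked example with hand-computed values (a sketch; boxes built with bbox_to_points from the same module):

    ```python
    from kotaemon.loaders.utils.box import bbox_to_points, get_rect_iou

    gt = bbox_to_points([0, 0, 10, 10])
    pd = bbox_to_points([5, 5, 15, 15])
    # intersection = 5 * 5 = 25, areas = 100 each, union = 100 + 100 - 25 = 175
    print(get_rect_iou(gt, pd, iou_type=0))  # 25 / 175 ~= 0.143
    print(get_rect_iou(gt, pd, iou_type=1))  # 25 / min(100, 100) = 0.25
    ```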
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.sort_funsd_reading_order","title":"sort_funsd_reading_order","text":"
    sort_funsd_reading_order(lines, box_key_name='box')\n

    Sort cell list to create the right reading order using their locations

    Parameters:

    - lines (List[dict], required): list of cells to sort
    - box_key_name (str, default "box"): key under which each cell stores its bounding box

    Returns:

    - a list of cells sorted into the right reading order

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def sort_funsd_reading_order(lines: List[dict], box_key_name: str = \"box\"):\n    \"\"\"Sort cell list to create the right reading order using their locations\n\n    Args:\n        lines: list of cells to sort\n\n    Returns:\n        a list of cell lists in the right reading order that contain\n        no key or start with a key and contain no other key\n    \"\"\"\n    sorted_list = []\n\n    if len(lines) == 0:\n        return lines\n\n    while len(lines) > 1:\n        topleft_line = lines[0]\n        for line in lines[1:]:\n            topleft_line_pos = topleft_line[box_key_name]\n            topleft_line_center_y = (topleft_line_pos[1] + topleft_line_pos[3]) / 2\n            x1, y1, x2, y2 = line[box_key_name]\n            box_center_x = (x1 + x2) / 2\n            box_center_y = (y1 + y2) / 2\n            cell_h = y2 - y1\n            if box_center_y <= topleft_line_center_y - cell_h / 2:\n                topleft_line = line\n                continue\n            if (\n                box_center_x < topleft_line_pos[2]\n                and box_center_y < topleft_line_pos[3]\n            ):\n                topleft_line = line\n                continue\n        sorted_list.append(topleft_line)\n        lines.remove(topleft_line)\n\n    sorted_list.append(lines[0])\n\n    return sorted_list\n
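    A minimal sketch of the reading-order sort (note that the input list is mutated, since cells are removed while sorting):

    ```python
    from kotaemon.loaders.utils.box import sort_funsd_reading_order

    cells = [
        {"box": [0, 50, 100, 60], "text": "second line"},
        {"box": [0, 0, 100, 10], "text": "first line"},
    ]
    ordered = sort_funsd_reading_order(cells)
    print([c["text"] for c in ordered])  # ['first line', 'second line']
    ```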
    "},{"location":"reference/loaders/utils/gpt4v/","title":"Gpt4V","text":""},{"location":"reference/loaders/utils/pdf_ocr/","title":"Pdf Ocr","text":""},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.read_pdf_unstructured","title":"read_pdf_unstructured","text":"
    read_pdf_unstructured(input_path)\n

    Convert PDF from specified path to list of text items with location information

    Parameters:

    - input_path (Union[Path, str], required): path to the input file

    Returns:

    - a dict mapping each page number (0-indexed) to its list of text boxes

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def read_pdf_unstructured(input_path: Union[Path, str]):\n    \"\"\"Convert PDF from specified path to list of text items with\n    location information\n\n    Args:\n        input_path: path to input file\n\n    Returns:\n        Dict page_number: list of text boxes\n    \"\"\"\n    try:\n        from unstructured.partition.auto import partition\n    except ImportError as e:\n        raise ImportError(\n            \"Please install unstructured PDF reader `pip install unstructured[pdf]`: \"\n            f\"{e}\"\n        )\n\n    page_items = defaultdict(list)\n    items = partition(input_path)\n    for item in items:\n        page_number = item.metadata.page_number\n        bbox = points_to_bbox(item.metadata.coordinates.points)\n        coord_system = item.metadata.coordinates.system\n        max_w, max_h = coord_system.width, coord_system.height\n        page_items[page_number - 1].append(\n            {\n                \"text\": item.text,\n                \"box\": bbox,\n                \"location\": bbox_to_points(bbox),\n                \"page_shape\": (max_w, max_h),\n            }\n        )\n\n    return page_items\n
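    A hedged usage sketch ("sample.pdf" is a placeholder path; the optional dependency must be installed first):

    ```python
    # Requires: pip install "unstructured[pdf]"
    from kotaemon.loaders.utils.pdf_ocr import read_pdf_unstructured

    page_items = read_pdf_unstructured("sample.pdf")  # placeholder file
    for page_number, boxes in page_items.items():  # page numbers are 0-indexed
        for box in boxes[:3]:
            print(page_number, box["text"], box["box"])
    ```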
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_ocr_and_pdf_texts","title":"merge_ocr_and_pdf_texts","text":"
    merge_ocr_and_pdf_texts(\n    ocr_list, pdf_text_list, debug_info=None\n)\n

    Merge PDF and OCR text using the IOU of their overlapping locations.

    Parameters:

    - ocr_list (List[dict], required): list of OCR items {\"text\", \"box\", \"location\"}
    - pdf_text_list (List[dict], required): list of PDF items {\"text\", \"box\", \"location\"}
    - debug_info (optional, default None): a (cv2 module, debug image) pair used to draw debug rectangles

    Returns:

    - combined list of PDF text and non-overlapping OCR text

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def merge_ocr_and_pdf_texts(\n    ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None\n):\n    \"\"\"Merge PDF and OCR text using IOU overlapping location\n    Args:\n        ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n        pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n    Returns:\n        Combined list of PDF text and non-overlap OCR text\n    \"\"\"\n    not_matched_ocr = []\n\n    # check for debug info\n    if debug_info is not None:\n        cv2, debug_im = debug_info\n\n    for ocr_item in ocr_list:\n        matched = False\n        for pdf_item in pdf_text_list:\n            if (\n                get_rect_iou(ocr_item[\"location\"], pdf_item[\"location\"], iou_type=1)\n                > IOU_THRES\n            ):\n                matched = True\n                break\n\n        color = (255, 0, 0)\n        if not matched:\n            ocr_item[\"matched\"] = False\n            not_matched_ocr.append(ocr_item)\n            color = (0, 255, 255)\n\n        if debug_info is not None:\n            cv2.rectangle(\n                debug_im,\n                ocr_item[\"location\"][0],\n                ocr_item[\"location\"][2],\n                color=color,\n                thickness=1,\n            )\n\n    if debug_info is not None:\n        for pdf_item in pdf_text_list:\n            cv2.rectangle(\n                debug_im,\n                pdf_item[\"location\"][0],\n                pdf_item[\"location\"][2],\n                color=(0, 255, 0),\n                thickness=2,\n            )\n\n    return pdf_text_list + not_matched_ocr\n
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_table_cell_and_ocr","title":"merge_table_cell_and_ocr","text":"
    merge_table_cell_and_ocr(\n    table_list, ocr_list, pdf_list, debug_info=None\n)\n

    Merge table items with OCR text using the IOU of their overlapping locations.

    Parameters:

    - table_list (List[dict], required): list of table items {\"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}
    - ocr_list (List[dict], required): list of OCR items {\"text\", \"box\", \"location\"}
    - pdf_list (List[dict], required): list of PDF items {\"text\", \"box\", \"location\"}
    - debug_info (optional, default None): a (cv2 module, debug image) pair used to draw debug rectangles

    Returns:

    - all_table_cells: list of tables, where each table is represented by a list of cells with combined text from OCR
    - not_matched_items: list of PDF text items that are not overlapped by any table region

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def merge_table_cell_and_ocr(\n    table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None\n):\n    \"\"\"Merge table items with OCR text using IOU overlapping location\n    Args:\n        table_list: List of table items\n            \"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}\n        ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n        pdf_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n    Returns:\n        all_table_cells: List of tables, each of table is represented\n            by list of cells with combined text from OCR\n        not_matched_items: List of PDF text which is not overlapped by table region\n    \"\"\"\n    # check for debug info\n    if debug_info is not None:\n        cv2, debug_im = debug_info\n\n    cell_list = [item for item in table_list if item[\"type\"] == \"cell\"]\n    table_list = [item for item in table_list if item[\"type\"] == \"table\"]\n\n    # sort table by area\n    table_list = sorted(table_list, key=lambda item: box_area(item[\"bbox\"]))\n\n    all_tables = []\n    matched_pdf_ids = []\n    matched_cell_ids = []\n\n    for table in table_list:\n        if debug_info is not None:\n            cv2.rectangle(\n                debug_im,\n                table[\"location\"][0],\n                table[\"location\"][2],\n                color=[0, 0, 255],\n                thickness=5,\n            )\n\n        cur_table_cells = []\n        for cell_id, cell in enumerate(cell_list):\n            if cell_id in matched_cell_ids:\n                continue\n\n            if get_rect_iou(\n                table[\"location\"], cell[\"location\"], iou_type=1\n            ) > IOU_THRES and box_area(table[\"bbox\"]) > box_area(cell[\"bbox\"]):\n                color = [128, 0, 128]\n                # cell matched to table\n                for item_list, item_type in [(pdf_list, \"pdf\"), (ocr_list, \"ocr\")]:\n                    cell[\"ocr\"] = []\n                    for item_id, item in enumerate(item_list):\n                        if item_type == \"pdf\" and item_id in matched_pdf_ids:\n                            continue\n                        if (\n                            get_rect_iou(item[\"location\"], cell[\"location\"], iou_type=1)\n                            > IOU_THRES\n                        ):\n                            cell[\"ocr\"].append(item)\n                            if item_type == \"pdf\":\n                                matched_pdf_ids.append(item_id)\n\n                    if len(cell[\"ocr\"]) > 0:\n                        # check if union of matched ocr does\n                        # not extend over cell boundary,\n                        # if True, continue to use OCR_list to match\n                        all_box_points_in_cell = []\n                        for item in cell[\"ocr\"]:\n                            all_box_points_in_cell.extend(item[\"location\"])\n                        union_box = union_points(all_box_points_in_cell)\n                        cell_okay = (\n                            box_h(union_box) <= box_h(cell[\"bbox\"]) * PADDING_THRES\n                            and box_w(union_box) <= box_w(cell[\"bbox\"]) * PADDING_THRES\n                        )\n                    else:\n                        cell_okay = False\n\n                    if cell_okay:\n                        if item_type == \"pdf\":\n                            color = [255, 0, 255]\n                        break\n\n                if debug_info is not None:\n                    cv2.rectangle(\n                        debug_im,\n                        cell[\"location\"][0],\n                        cell[\"location\"][2],\n                        color=color,\n                        thickness=3,\n                    )\n\n                matched_cell_ids.append(cell_id)\n                cur_table_cells.append(cell)\n\n        all_tables.append(cur_table_cells)\n\n    not_matched_items = [\n        item for _id, item in enumerate(pdf_list) if _id not in matched_pdf_ids\n    ]\n    if debug_info is not None:\n        for item in not_matched_items:\n            cv2.rectangle(\n                debug_im,\n                item[\"location\"][0],\n                item[\"location\"][2],\n                color=[128, 128, 128],\n                thickness=3,\n            )\n\n    return all_tables, not_matched_items\n
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.parse_ocr_output","title":"parse_ocr_output","text":"
    parse_ocr_output(\n    ocr_page_items,\n    pdf_page_items,\n    artifact_path=None,\n    debug_path=None,\n)\n

    Main function to combine OCR output and PDF text to form a list of table / non-table regions.

    Parameters:

    - ocr_page_items: list of OCR items by page
    - pdf_page_items: dict of PDF texts (page number as key)
    - artifact_path: path to the folder containing the page images (needed together with debug_path)
    - debug_path: if specified, use OpenCV to plot a debug image and save it to debug_path

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def parse_ocr_output(\n    ocr_page_items: List[dict],\n    pdf_page_items: Dict[int, List[dict]],\n    artifact_path: Optional[str] = None,\n    debug_path: Optional[str] = None,\n):\n    \"\"\"Main function to combine OCR output and PDF text to\n    form list of table / non-table regions\n    Args:\n        ocr_page_items: List of OCR items by page\n        pdf_page_items: Dict of PDF texts (page number as key)\n        debug_path: If specified, use OpenCV to plot debug image and save to debug_path\n    \"\"\"\n    all_tables = []\n    all_texts = []\n\n    for page_id, page in enumerate(ocr_page_items):\n        ocr_list = page[\"json\"][\"ocr\"]\n        table_list = page[\"json\"][\"table\"]\n        page_shape = page[\"image_shape\"]\n        pdf_item_list = pdf_page_items[page_id]\n\n        # create bbox additional information\n        for item in ocr_list:\n            item[\"box\"] = points_to_bbox(item[\"location\"])\n\n        # re-scale pdf items according to new image size\n        for item in pdf_item_list:\n            scale_factor = page_shape[0] / item[\"page_shape\"][0]\n            item[\"box\"] = scale_box(item[\"box\"], scale_factor=scale_factor)\n            item[\"location\"] = scale_points(item[\"location\"], scale_factor=scale_factor)\n\n        # if using debug mode, openCV must be installed\n        if debug_path and artifact_path is not None:\n            try:\n                import cv2\n            except ImportError:\n                raise ImportError(\n                    \"Please install openCV first to use OCRReader debug mode\"\n                )\n            image_path = Path(artifact_path) / page[\"image\"]\n            image = cv2.imread(str(image_path))\n            debug_info = (cv2, image)\n        else:\n            debug_info = None\n\n        new_pdf_list = merge_ocr_and_pdf_texts(\n            ocr_list, pdf_item_list, debug_info=debug_info\n        )\n\n        # sort by reading order\n        ocr_list = sort_funsd_reading_order(ocr_list)\n        new_pdf_list = sort_funsd_reading_order(new_pdf_list)\n\n        all_table_cells, non_table_text_list = merge_table_cell_and_ocr(\n            table_list, ocr_list, new_pdf_list, debug_info=debug_info\n        )\n\n        table_texts = [table_cells_to_markdown(cells) for cells in all_table_cells]\n        all_tables.extend([(page_id, text) for text in table_texts])\n        all_texts.append(\n            (page_id, \" \".join(item[\"text\"] for item in non_table_text_list))\n        )\n\n        # export debug image to debug_path\n        if debug_path:\n            cv2.imwrite(str(Path(debug_path) / \"page_{}.png\".format(page_id)), image)\n\n    return all_tables, all_texts\n
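    The expected input shapes can be read off the source above; in the sketch below the field names come from the code, while all values are hypothetical:

    ```python
    # Shapes inferred from the source; every value here is made up.
    from kotaemon.loaders.utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured

    ocr_page_items = [
        {
            "image": "page_0.png",        # image file name under artifact_path
            "image_shape": (1654, 2339),  # (width, height) of the rendered page
            "json": {
                "ocr": [
                    {"text": "Hello", "location": [(10, 10), (60, 10), (60, 30), (10, 30)]}
                ],
                "table": [],  # table/cell items in the same format
            },
        }
    ]
    pdf_page_items = read_pdf_unstructured("sample.pdf")  # placeholder file

    all_tables, all_texts = parse_ocr_output(ocr_page_items, pdf_page_items)
    ```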
    "},{"location":"reference/loaders/utils/table/","title":"Table","text":""},{"location":"reference/loaders/utils/table/#loaders.utils.table.check_col_conflicts","title":"check_col_conflicts","text":"
    check_col_conflicts(col_a, col_b, thres=0.15)\n

    Check if two columns A and B have non-empty content in the same row (to be used with merge_cols)

    Parameters:

    - col_a (List[str], required): column A (list of str)
    - col_b (List[str], required): column B (list of str)
    - thres (float, default 0.15): percentage of overlap allowed

    Returns: True if the number of overlapping rows exceeds the threshold

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def check_col_conflicts(\n    col_a: List[str], col_b: List[str], thres: float = 0.15\n) -> bool:\n    \"\"\"Check if 2 columns A and B has non-empty content in the same row\n    (to be used with merge_cols)\n\n    Args:\n        col_a: column A (list of str)\n        col_b: column B (list of str)\n        thres: percentage of overlapping allowed\n    Returns:\n        if number of overlapping greater than threshold\n    \"\"\"\n    num_rows = len([cell for cell in col_a if cell])\n    assert len(col_a) == len(col_b)\n    conflict_count = 0\n    for cell_a, cell_b in zip(col_a, col_b):\n        if cell_a and cell_b:\n            conflict_count += 1\n    return conflict_count > num_rows * thres\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.merge_cols","title":"merge_cols","text":"
    merge_cols(col_a, col_b)\n

    Merge columns A and B if they do not have conflicting rows

    Parameters:

    - col_a (List[str], required): column A (list of str)
    - col_b (List[str], required): column B (list of str)

    Returns: merged column

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def merge_cols(col_a: List[str], col_b: List[str]) -> List[str]:\n    \"\"\"Merge column A and B if they do not have conflict rows\n\n    Args:\n        col_a: column A (list of str)\n        col_b: column B (list of str)\n    Returns:\n        merged column\n    \"\"\"\n    for r_id in range(len(col_a)):\n        if col_b[r_id]:\n            col_a[r_id] = col_a[r_id] + \" \" + col_b[r_id]\n    return col_a\n
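    A small sketch showing the two helpers together (note that merge_cols mutates and returns col_a):

    ```python
    from kotaemon.loaders.utils.table import check_col_conflicts, merge_cols

    col_a = ["Name", "Jake", ""]
    col_b = ["", "", "20"]
    assert not check_col_conflicts(col_a, col_b)  # no row is non-empty in both columns
    assert merge_cols(col_a, col_b) == ["Name", "Jake", " 20"]
    ```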
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.add_index_col","title":"add_index_col","text":"
    add_index_col(csv_rows)\n

    Add index column as the first column of the table csv_rows

    Parameters:

    - csv_rows (List[List[str]], required): input table

    Returns: output table with index column

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def add_index_col(csv_rows: List[List[str]]) -> List[List[str]]:\n    \"\"\"Add index column as the first column of the table csv_rows\n\n    Args:\n        csv_rows: input table\n    Returns:\n        output table with index column\n    \"\"\"\n    new_csv_rows = [[\"row id\"] + [\"\"] * len(csv_rows[0])]\n    for r_id, row in enumerate(csv_rows):\n        new_csv_rows.append([str(r_id + 1)] + row)\n    return new_csv_rows\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.compress_csv","title":"compress_csv","text":"
    compress_csv(csv_rows)\n

    Compress table csv_rows by merging sparse columns (merge_cols)

    Parameters:

    - csv_rows (List[List[str]], required): input table

    Returns: the compressed table

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def compress_csv(csv_rows: List[List[str]]) -> List[List[str]]:\n    \"\"\"Compress table csv_rows by merging sparse columns (merge_cols)\n\n    Args:\n        csv_rows: input table\n    Returns:\n        output: compressed table\n    \"\"\"\n    csv_cols = [[r[c_id] for r in csv_rows] for c_id in range(len(csv_rows[0]))]\n    to_remove_col_ids = []\n    last_c_id = 0\n    for c_id in range(1, len(csv_cols)):\n        if not check_col_conflicts(csv_cols[last_c_id], csv_cols[c_id]):\n            to_remove_col_ids.append(c_id)\n            csv_cols[last_c_id] = merge_cols(csv_cols[last_c_id], csv_cols[c_id])\n        else:\n            last_c_id = c_id\n\n    csv_cols = [r for c_id, r in enumerate(csv_cols) if c_id not in to_remove_col_ids]\n    csv_rows = [[c[r_id] for c in csv_cols] for r_id in range(len(csv_cols[0]))]\n    return csv_rows\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.get_table_from_ocr","title":"get_table_from_ocr","text":"
    get_table_from_ocr(ocr_list, table_list)\n

    Get the lists of text lines belonging to the table regions specified by table_list

    Parameters:

    - ocr_list (List[dict], required): list of OCR output in Casia format (Flax)
    - table_list (List[dict], required): list of table output in Casia format (Flax)

    Returns:

    - a list with one entry per table region, each entry being the list of OCR text lines that fall inside that region

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def get_table_from_ocr(ocr_list: List[dict], table_list: List[dict]):\n    \"\"\"Get list of text lines belong to table regions specified by table_list\n\n    Args:\n        ocr_list: list of OCR output in Casia format (Flax)\n        table_list: list of table output in Casia format (Flax)\n\n    Returns:\n        _type_: _description_\n    \"\"\"\n    table_texts = []\n    for table in table_list:\n        if table[\"type\"] != \"table\":\n            continue\n        cur_table_texts = []\n        for ocr in ocr_list:\n            _iou = get_rect_iou(table[\"location\"], ocr[\"location\"], iou_type=1)\n            if _iou > 0.8:\n                cur_table_texts.append(ocr[\"text\"])\n        table_texts.append(cur_table_texts)\n\n    return table_texts\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.make_markdown_table","title":"make_markdown_table","text":"
    make_markdown_table(array)\n

    Convert table rows in list format to markdown string

    Parameters:

    - array (required): Python list with the rows of the table as lists, the first element being the header. Example input:
    [[\"Name\", \"Age\", \"Height\"],\n[\"Jake\", 20, 5'10],\n[\"Mary\", 21, 5'7]]

    Returns: String to put into a .md file

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def make_markdown_table(array: List[List[str]]) -> str:\n    \"\"\"Convert table rows in list format to markdown string\n\n    Args:\n        Python list with rows of table as lists\n        First element as header.\n        Example Input:\n                [[\"Name\", \"Age\", \"Height\"],\n                [\"Jake\", 20, 5'10],\n                [\"Mary\", 21, 5'7]]\n    Returns:\n        String to put into a .md file\n    \"\"\"\n    array = compress_csv(array)\n    array = add_index_col(array)\n    markdown = \"\\n\" + str(\"| \")\n\n    for e in array[0]:\n        to_add = \" \" + str(e) + str(\" |\")\n        markdown += to_add\n    markdown += \"\\n\"\n\n    markdown += \"| \"\n    for i in range(len(array[0])):\n        markdown += str(\"--- | \")\n    markdown += \"\\n\"\n\n    for entry in array[1:]:\n        markdown += str(\"| \")\n        for e in entry:\n            to_add = str(e) + str(\" | \")\n            markdown += to_add\n        markdown += \"\\n\"\n\n    return markdown + \"\\n\"\n
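    A minimal usage sketch (compress_csv and add_index_col are applied internally before rendering, so the output gains a "row id" column):

    ```python
    from kotaemon.loaders.utils.table import make_markdown_table

    rows = [
        ["Name", "Age", "Height"],
        ["Jake", "20", "5'10"],
        ["Mary", "21", "5'7"],
    ]
    print(make_markdown_table(rows))
    ```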
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_csv_string_to_list","title":"parse_csv_string_to_list","text":"
    parse_csv_string_to_list(csv_str)\n

    Convert CSV string to list of rows

    Parameters:

    - csv_str (str, required): input CSV string

    Returns:

    - List[List[str]]: output table in list format

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def parse_csv_string_to_list(csv_str: str) -> List[List[str]]:\n    \"\"\"Convert CSV string to list of rows\n\n    Args:\n        csv_str: input CSV string\n\n    Returns:\n        Output table in list format\n    \"\"\"\n    io = StringIO(csv_str)\n    csv_reader = csv.reader(io, delimiter=\",\")\n    rows = [row for row in csv_reader]\n    return rows\n
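    A one-line sanity check:

    ```python
    from kotaemon.loaders.utils.table import parse_csv_string_to_list

    assert parse_csv_string_to_list("a,b\nc,d") == [["a", "b"], ["c", "d"]]
    ```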
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.format_cell","title":"format_cell","text":"
    format_cell(cell, length_limit=None)\n

    Format cell content by removing redundant characters and enforcing the length limit

    Parameters:

    - cell (str, required): input cell text
    - length_limit (Optional[int], default None): limit on the text length

    Returns:

    - str: new cell text

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def format_cell(cell: str, length_limit: Optional[int] = None) -> str:\n    \"\"\"Format cell content by remove redundant character and enforce length limit\n\n    Args:\n        cell: input cell text\n        length_limit: limit of text length.\n\n    Returns:\n        new cell text\n    \"\"\"\n    cell = cell.replace(\"\\n\", \" \")\n    if length_limit:\n        cell = cell[:length_limit]\n    return cell\n
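    A quick check of both behaviours (newline collapsing and truncation):

    ```python
    from kotaemon.loaders.utils.table import format_cell

    assert format_cell("hello\nworld") == "hello world"
    assert format_cell("hello\nworld", length_limit=5) == "hello"
    ```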
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.extract_tables_from_csv_string","title":"extract_tables_from_csv_string","text":"
    extract_tables_from_csv_string(csv_content, table_texts)\n

    Extract a list of tables from the FullOCR output (csv_content) using the specified table_texts

    Parameters:

    - csv_content (str, required): CSV output from the FullOCR pipeline
    - table_texts (List[List[str]], required): list of table texts extracted from get_table_from_ocr()

    Returns:

    - Tuple[List[str], str]: the list of markdown tables and the remaining non-table text

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def extract_tables_from_csv_string(\n    csv_content: str, table_texts: List[List[str]]\n) -> Tuple[List[str], str]:\n    \"\"\"Extract list of table from FullOCR output\n    (csv_content) with the specified table_texts\n\n    Args:\n        csv_content: CSV output from FullOCR pipeline\n        table_texts: list of table texts extracted\n        from get_table_from_ocr()\n\n    Returns:\n        List of tables and non-text content\n    \"\"\"\n    rows = parse_csv_string_to_list(csv_content)\n    used_row_ids = []\n    table_csv_list = []\n    for table in table_texts:\n        cur_rows = []\n        for row_id, row in enumerate(rows):\n            scores = [\n                any(cell in cell_reference for cell in table)\n                for cell_reference in row\n                if cell_reference\n            ]\n            score = sum(scores) / len(scores)\n            if score > 0.5 and row_id not in used_row_ids:\n                used_row_ids.append(row_id)\n                cur_rows.append([format_cell(cell) for cell in row])\n        if cur_rows:\n            table_csv_list.append(make_markdown_table(cur_rows))\n        else:\n            print(\"table not matched\", table)\n\n    non_table_rows = [\n        row for row_id, row in enumerate(rows) if row_id not in used_row_ids\n    ]\n    non_table_text = \"\\n\".join(\n        \" \".join(format_cell(cell) for cell in row) for row in non_table_rows\n    )\n    return table_csv_list, non_table_text\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.strip_special_chars_markdown","title":"strip_special_chars_markdown","text":"
    strip_special_chars_markdown(text)\n

    Strip special characters from input text in markdown table format

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def strip_special_chars_markdown(text: str) -> str:\n    \"\"\"Strip special characters from input text in markdown table format\"\"\"\n    return text.replace(\"|\", \"\").replace(\":---:\", \"\").replace(\"---\", \"\")\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_markdown_text_to_tables","title":"parse_markdown_text_to_tables","text":"
    parse_markdown_text_to_tables(text)\n

    Convert markdown text to list of non-table spans and table spans

    Parameters:

    - text (str, required): input markdown text

    Returns:

    - Tuple[List[str], List[str]]: list of table spans and list of non-table spans

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def parse_markdown_text_to_tables(text: str) -> Tuple[List[str], List[str]]:\n    \"\"\"Convert markdown text to list of non-table spans and table spans\n\n    Args:\n        text: input markdown text\n\n    Returns:\n        list of table spans and non-table spans\n    \"\"\"\n    # init empty tables and texts list\n    tables = []\n    texts = []\n\n    # split input by line break\n    lines = text.split(\"\\n\")\n    cur_table = []\n    cur_text: List[str] = []\n    for line in lines:\n        line = line.strip()\n        if line.startswith(\"|\"):\n            if len(cur_text) > 0:\n                texts.append(cur_text)\n                cur_text = []\n            cur_table.append(line)\n        else:\n            # add new table to the list\n            if len(cur_table) > 0:\n                tables.append(cur_table)\n                cur_table = []\n            cur_text.append(line)\n\n    table_texts = [\"\\n\".join(table) for table in tables]\n    non_table_texts = [\"\\n\".join(text) for text in texts]\n    return table_texts, non_table_texts\n
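    A small sketch of the splitting behaviour (note that, in the version shown above, whatever remains in the current buffer after the last table/text switch is not flushed):

    ```python
    from kotaemon.loaders.utils.table import parse_markdown_text_to_tables

    text = "Intro\n| a | b |\n| --- | --- |\n| 1 | 2 |\nOutro\n"
    tables, texts = parse_markdown_text_to_tables(text)
    print(tables)  # ['| a | b |\n| --- | --- |\n| 1 | 2 |']
    print(texts)   # ['Intro']
    ```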
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.table_cells_to_markdown","title":"table_cells_to_markdown","text":"
    table_cells_to_markdown(cells)\n

    Convert list of cells with attached text to Markdown table

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def table_cells_to_markdown(cells: List[dict]):\n    \"\"\"Convert list of cells with attached text to Markdown table\"\"\"\n\n    if len(cells) == 0:\n        return \"\"\n\n    all_row_ids = []\n    all_col_ids = []\n    for cell in cells:\n        all_row_ids.extend(cell[\"rows\"])\n        all_col_ids.extend(cell[\"columns\"])\n\n    num_rows, num_cols = max(all_row_ids) + 1, max(all_col_ids) + 1\n    table_rows = [[\"\" for c in range(num_cols)] for r in range(num_rows)]\n\n    # start filling in the grid\n    for cell in cells:\n        cell_text = \" \".join(item[\"text\"] for item in cell[\"ocr\"])\n        start_row_id, end_row_id = cell[\"rows\"]\n        start_col_id, end_col_id = cell[\"columns\"]\n        span_cell = end_row_id != start_row_id or end_col_id != start_col_id\n\n        # do not repeat long text in span cell to prevent context length issue\n        if span_cell and len(cell_text.replace(\" \", \"\")) < 20 and start_row_id > 0:\n            for row in range(start_row_id, end_row_id + 1):\n                for col in range(start_col_id, end_col_id + 1):\n                    table_rows[row][col] += cell_text + \" \"\n        else:\n            table_rows[start_row_id][start_col_id] += cell_text + \" \"\n\n    return make_markdown_table(table_rows)\n
    "},{"location":"reference/parsers/","title":"Parsers","text":""},{"location":"reference/parsers/#parsers.RegexExtractor","title":"RegexExtractor","text":"

    Bases: BaseComponent

    Simple class for extracting text from a document using a regex pattern.

    Parameters:

    - pattern (List[str], required): The regex pattern(s) to use.
    - output_map (dict, optional): A mapping from extracted text to the desired output. Defaults to None.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    class RegexExtractor(BaseComponent):\n    \"\"\"\n    Simple class for extracting text from a document using a regex pattern.\n\n    Args:\n        pattern (List[str]): The regex pattern(s) to use.\n        output_map (dict, optional): A mapping from extracted text to the\n            desired output. Defaults to None.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n    pattern: list[str]\n    output_map: dict[str, str] | Callable[[str], str] = Param(\n        default_callback=lambda *_: {}\n    )\n\n    def __init__(self, pattern: str | list[str], **kwargs):\n        if isinstance(pattern, str):\n            pattern = [pattern]\n        super().__init__(pattern=pattern, **kwargs)\n\n    @staticmethod\n    def run_raw_static(pattern: str, text: str) -> list[str]:\n        \"\"\"\n        Finds all non-overlapping occurrences of a pattern in a string.\n\n        Parameters:\n            pattern (str): The regular expression pattern to search for.\n            text (str): The input string to search in.\n\n        Returns:\n            List[str]: A list of all non-overlapping occurrences of the pattern in the\n                string.\n        \"\"\"\n        return re.findall(pattern, text)\n\n    @staticmethod\n    def map_output(text, output_map) -> str:\n        \"\"\"\n        Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n        Parameters:\n            text (str): The input text to be mapped.\n            output_map (dict): A dictionary containing mapping of input text to output\n                values.\n\n        Returns:\n            str: The corresponding value from the `output_map` if `text` is found in the\n                dictionary, otherwise returns the original `text`.\n        \"\"\"\n        if not output_map:\n            return text\n\n        if isinstance(output_map, dict):\n            return output_map.get(text, text)\n\n        return output_map(text)\n\n    def run_raw(self, text: str) -> ExtractorOutput:\n        \"\"\"\n        Matches the raw text against the pattern and rans the output mapping, returning\n            an instance of ExtractorOutput.\n\n        Args:\n            text (str): The raw text to be processed.\n\n        Returns:\n            ExtractorOutput: The processed output as a list of ExtractorOutput.\n        \"\"\"\n        output: list[str] = sum(\n            [self.run_raw_static(p, text) for p in self.pattern], []\n        )\n        output = [self.map_output(text, self.output_map) for text in output]\n\n        return ExtractorOutput(\n            text=output[0] if output else \"\",\n            matches=output,\n            metadata={\"origin\": \"RegexExtractor\"},\n        )\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[ExtractorOutput]:\n        \"\"\"Match the input against a pattern and return the output for each input\n\n        Parameters:\n            text: contains the input string to be processed\n\n        Returns:\n            A list contains the output ExtractorOutput for each input\n\n        Example:\n            ```pycon\n            >>> document1 = Document(...)\n            >>> document2 = Document(...)\n            >>> document_batch = [document1, document2]\n            >>> batch_output = self(document_batch)\n            >>> print(batch_output)\n            [output1_document1, output1_document2]\n            ```\n        \"\"\"\n        # TODO: this conversion seems common\n        input_: list[str] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in text:\n            if isinstance(item, str):\n                input_.append(item)\n            elif isinstance(item, Document):\n                input_.append(item.text)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        output = []\n        for each_input in input_:\n            output.append(self.run_raw(each_input))\n\n        return output\n
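    A minimal usage sketch (this assumes RegexExtractor is re-exported from kotaemon.parsers, as the page location suggests):

    ```python
    from kotaemon.parsers import RegexExtractor

    extractor = RegexExtractor(pattern=r"\d+")
    outputs = extractor.run("There are 42 cats and 7 dogs")
    print(outputs[0].text)     # '42' (first match)
    print(outputs[0].matches)  # ['42', '7']
    ```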
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod","text":"
    run_raw_static(pattern, text)\n

    Finds all non-overlapping occurrences of a pattern in a string.

    Parameters:

    - pattern (str, required): The regular expression pattern to search for.
    - text (str, required): The input string to search in.

    Returns:

    - List[str]: A list of all non-overlapping occurrences of the pattern in the string.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n    \"\"\"\n    Finds all non-overlapping occurrences of a pattern in a string.\n\n    Parameters:\n        pattern (str): The regular expression pattern to search for.\n        text (str): The input string to search in.\n\n    Returns:\n        List[str]: A list of all non-overlapping occurrences of the pattern in the\n            string.\n    \"\"\"\n    return re.findall(pattern, text)\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.map_output","title":"map_output staticmethod","text":"
    map_output(text, output_map)\n

    Maps the given text to its corresponding value in the output_map dictionary.

    Parameters:

    - text (str, required): The input text to be mapped.
    - output_map (dict, required): A dictionary containing the mapping of input text to output values.

    Returns:

    - str: The corresponding value from the output_map if text is found in the dictionary, otherwise the original text.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef map_output(text, output_map) -> str:\n    \"\"\"\n    Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n    Parameters:\n        text (str): The input text to be mapped.\n        output_map (dict): A dictionary containing mapping of input text to output\n            values.\n\n    Returns:\n        str: The corresponding value from the `output_map` if `text` is found in the\n            dictionary, otherwise returns the original `text`.\n    \"\"\"\n    if not output_map:\n        return text\n\n    if isinstance(output_map, dict):\n        return output_map.get(text, text)\n\n    return output_map(text)\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw","title":"run_raw","text":"
    run_raw(text)\n

    Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.

    Parameters:

    - text (str, required): The raw text to be processed.

    Returns:

    - ExtractorOutput: the processed output as a single ExtractorOutput instance (its matches field holds the list of mapped matches).

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run_raw(self, text: str) -> ExtractorOutput:\n    \"\"\"\n    Matches the raw text against the pattern and rans the output mapping, returning\n        an instance of ExtractorOutput.\n\n    Args:\n        text (str): The raw text to be processed.\n\n    Returns:\n        ExtractorOutput: The processed output as a list of ExtractorOutput.\n    \"\"\"\n    output: list[str] = sum(\n        [self.run_raw_static(p, text) for p in self.pattern], []\n    )\n    output = [self.map_output(text, self.output_map) for text in output]\n\n    return ExtractorOutput(\n        text=output[0] if output else \"\",\n        matches=output,\n        metadata={\"origin\": \"RegexExtractor\"},\n    )\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run","title":"run","text":"
    run(text)\n

    Match the input against a pattern and return the output for each input

    Parameters:

    - text (str | list[str] | Document | list[Document], required): contains the input string(s) to be processed

    Returns:

    - list[ExtractorOutput]: a list containing the output ExtractorOutput for each input

    Example
    >>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n    \"\"\"Match the input against a pattern and return the output for each input\n\n    Parameters:\n        text: contains the input string to be processed\n\n    Returns:\n        A list contains the output ExtractorOutput for each input\n\n    Example:\n        ```pycon\n        >>> document1 = Document(...)\n        >>> document2 = Document(...)\n        >>> document_batch = [document1, document2]\n        >>> batch_output = self(document_batch)\n        >>> print(batch_output)\n        [output1_document1, output1_document2]\n        ```\n    \"\"\"\n    # TODO: this conversion seems common\n    input_: list[str] = []\n    if not isinstance(text, list):\n        text = [text]\n\n    for item in text:\n        if isinstance(item, str):\n            input_.append(item)\n        elif isinstance(item, Document):\n            input_.append(item.text)\n        else:\n            raise ValueError(\n                f\"Invalid input type {type(item)}, should be str or Document\"\n            )\n\n    output = []\n    for each_input in input_:\n        output.append(self.run_raw(each_input))\n\n    return output\n
    "},{"location":"reference/parsers/regex_extractor/","title":"Regex Extractor","text":""},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor","title":"RegexExtractor","text":"

    Bases: BaseComponent

    Simple class for extracting text from a document using a regex pattern.

    Parameters:

    - pattern (List[str], required): The regex pattern(s) to use.
    - output_map (dict, optional): A mapping from extracted text to the desired output. Defaults to None.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    class RegexExtractor(BaseComponent):\n    \"\"\"\n    Simple class for extracting text from a document using a regex pattern.\n\n    Args:\n        pattern (List[str]): The regex pattern(s) to use.\n        output_map (dict, optional): A mapping from extracted text to the\n            desired output. Defaults to None.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n    pattern: list[str]\n    output_map: dict[str, str] | Callable[[str], str] = Param(\n        default_callback=lambda *_: {}\n    )\n\n    def __init__(self, pattern: str | list[str], **kwargs):\n        if isinstance(pattern, str):\n            pattern = [pattern]\n        super().__init__(pattern=pattern, **kwargs)\n\n    @staticmethod\n    def run_raw_static(pattern: str, text: str) -> list[str]:\n        \"\"\"\n        Finds all non-overlapping occurrences of a pattern in a string.\n\n        Parameters:\n            pattern (str): The regular expression pattern to search for.\n            text (str): The input string to search in.\n\n        Returns:\n            List[str]: A list of all non-overlapping occurrences of the pattern in the\n                string.\n        \"\"\"\n        return re.findall(pattern, text)\n\n    @staticmethod\n    def map_output(text, output_map) -> str:\n        \"\"\"\n        Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n        Parameters:\n            text (str): The input text to be mapped.\n            output_map (dict): A dictionary containing mapping of input text to output\n                values.\n\n        Returns:\n            str: The corresponding value from the `output_map` if `text` is found in the\n                dictionary, otherwise returns the original `text`.\n        \"\"\"\n        if not output_map:\n            return text\n\n        if isinstance(output_map, dict):\n            return output_map.get(text, text)\n\n        return output_map(text)\n\n    def run_raw(self, text: str) -> ExtractorOutput:\n        \"\"\"\n        Matches the raw text against the pattern and rans the output mapping, returning\n            an instance of ExtractorOutput.\n\n        Args:\n            text (str): The raw text to be processed.\n\n        Returns:\n            ExtractorOutput: The processed output as a list of ExtractorOutput.\n        \"\"\"\n        output: list[str] = sum(\n            [self.run_raw_static(p, text) for p in self.pattern], []\n        )\n        output = [self.map_output(text, self.output_map) for text in output]\n\n        return ExtractorOutput(\n            text=output[0] if output else \"\",\n            matches=output,\n            metadata={\"origin\": \"RegexExtractor\"},\n        )\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[ExtractorOutput]:\n        \"\"\"Match the input against a pattern and return the output for each input\n\n        Parameters:\n            text: contains the input string to be processed\n\n        Returns:\n            A list contains the output ExtractorOutput for each input\n\n        Example:\n            ```pycon\n            >>> document1 = Document(...)\n            >>> document2 = Document(...)\n            >>> document_batch = [document1, document2]\n            >>> batch_output = self(document_batch)\n            >>> print(batch_output)\n            [output1_document1, output1_document2]\n            ```\n        \"\"\"\n        # TODO: this conversion seems common\n        input_: list[str] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in text:\n            if isinstance(item, str):\n                input_.append(item)\n            elif isinstance(item, Document):\n                input_.append(item.text)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        output = []\n        for each_input in input_:\n            output.append(self.run_raw(each_input))\n\n        return output\n
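    The output_map can be a plain dict or any callable; a short hedged sketch of both forms (import path assumed as above):

    ```python
    from kotaemon.parsers import RegexExtractor

    mapped = RegexExtractor(pattern=r"cat|dog", output_map={"cat": "feline"})
    print(mapped.run("cat dog")[0].matches)  # ['feline', 'dog']

    upper = RegexExtractor(pattern=r"[a-z]+", output_map=str.upper)
    print(upper.run("ab cd")[0].matches)  # ['AB', 'CD']
    ```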
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod","text":"
    run_raw_static(pattern, text)\n

    Finds all non-overlapping occurrences of a pattern in a string.

    Parameters:

    - pattern (str, required): The regular expression pattern to search for.
    - text (str, required): The input string to search in.

    Returns:

    - List[str]: A list of all non-overlapping occurrences of the pattern in the string.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n    \"\"\"\n    Finds all non-overlapping occurrences of a pattern in a string.\n\n    Parameters:\n        pattern (str): The regular expression pattern to search for.\n        text (str): The input string to search in.\n\n    Returns:\n        List[str]: A list of all non-overlapping occurrences of the pattern in the\n            string.\n    \"\"\"\n    return re.findall(pattern, text)\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.map_output","title":"map_output staticmethod","text":"
    map_output(text, output_map)\n

    Maps the given text to its corresponding value in the output_map dictionary.

    Parameters:

    - text (str, required): The input text to be mapped.
    - output_map (dict, required): A dictionary containing the mapping of input text to output values.

    Returns:

    - str: The corresponding value from the output_map if text is found in the dictionary, otherwise the original text.

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef map_output(text, output_map) -> str:\n    \"\"\"\n    Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n    Parameters:\n        text (str): The input text to be mapped.\n        output_map (dict): A dictionary containing mapping of input text to output\n            values.\n\n    Returns:\n        str: The corresponding value from the `output_map` if `text` is found in the\n            dictionary, otherwise returns the original `text`.\n    \"\"\"\n    if not output_map:\n        return text\n\n    if isinstance(output_map, dict):\n        return output_map.get(text, text)\n\n    return output_map(text)\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw","title":"run_raw","text":"
    run_raw(text)\n

    Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.

    Parameters:

    - text (str, required): The raw text to be processed.

    Returns:

    - ExtractorOutput: the processed output as a single ExtractorOutput instance (its matches field holds the list of mapped matches).

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run_raw(self, text: str) -> ExtractorOutput:\n    \"\"\"\n    Matches the raw text against the pattern and rans the output mapping, returning\n        an instance of ExtractorOutput.\n\n    Args:\n        text (str): The raw text to be processed.\n\n    Returns:\n        ExtractorOutput: The processed output as a list of ExtractorOutput.\n    \"\"\"\n    output: list[str] = sum(\n        [self.run_raw_static(p, text) for p in self.pattern], []\n    )\n    output = [self.map_output(text, self.output_map) for text in output]\n\n    return ExtractorOutput(\n        text=output[0] if output else \"\",\n        matches=output,\n        metadata={\"origin\": \"RegexExtractor\"},\n    )\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run","title":"run","text":"
    run(text)\n

    Match the input against a pattern and return the output for each input

    Parameters:

    - text (str | list[str] | Document | list[Document], required): contains the input string(s) to be processed

    Returns:

    - list[ExtractorOutput]: a list containing the output ExtractorOutput for each input

    Example
    >>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n    \"\"\"Match the input against a pattern and return the output for each input\n\n    Parameters:\n        text: contains the input string to be processed\n\n    Returns:\n        A list contains the output ExtractorOutput for each input\n\n    Example:\n        ```pycon\n        >>> document1 = Document(...)\n        >>> document2 = Document(...)\n        >>> document_batch = [document1, document2]\n        >>> batch_output = self(document_batch)\n        >>> print(batch_output)\n        [output1_document1, output1_document2]\n        ```\n    \"\"\"\n    # TODO: this conversion seems common\n    input_: list[str] = []\n    if not isinstance(text, list):\n        text = [text]\n\n    for item in text:\n        if isinstance(item, str):\n            input_.append(item)\n        elif isinstance(item, Document):\n            input_.append(item.text)\n        else:\n            raise ValueError(\n                f\"Invalid input type {type(item)}, should be str or Document\"\n            )\n\n    output = []\n    for each_input in input_:\n        output.append(self.run_raw(each_input))\n\n    return output\n
    "},{"location":"reference/storages/","title":"Storages","text":""},{"location":"reference/storages/#storages.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

    A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charge of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    Document or list of documents

    required ids Optional[Union[List[str], str]]

    List of ids of the documents. Optional, if not set will use doc.doc_id

    None Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
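Since every backend below implements this same interface, indexing code can be written once against `BaseDocumentStore` and swapped between stores. A minimal sketch (assuming `kotaemon.storages` exports the classes documented here):

```python
# Sketch: backend-agnostic helper written against the abstract interface.
from typing import List

from kotaemon.base import Document
from kotaemon.storages import BaseDocumentStore

def index_and_search(store: BaseDocumentStore, docs: List[Document], q: str):
    store.add(docs)                  # ids default to each doc.doc_id
    print("indexed:", store.count())
    return store.query(q, top_k=3)   # full-text search, backend-specific
```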
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

    Elasticsearch-backed document store with BM25 full-text search

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch-backed document store with BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None refresh_indices bool

    request Elasticsearch to refresh its index after adding (defaults to True)

    True Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

    Parameters:

    Name Type Description Default query dict

    Elasticsearch query format

    required

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

    Parameters:

    Name Type Description Default query str

    query text

    required top_k int

    number of top documents to return. Defaults to 10.

    10

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

    Simple in-memory document store that stores documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple in-memory document store that stores documents in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exists\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    if False, raise an error when a duplicate doc-id is found in the docstore (defaults to False)

    required Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

    LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy-py'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                .where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

    Extends InMemoryDocumentStore by auto-saving whenever the corpus changes

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    if False, raise an error when a duplicate doc-id is found in the docstore (defaults to False)

    required Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/#storages.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

    Parameters:

    Name Type Description Default embeddings list[list[float]] | list[DocumentWithEmbedding]

    List of embeddings

    required metadatas Optional[list[dict]]

    List of metadata of the embeddings

    None ids Optional[list[str]]

    List of ids of the embeddings

    None kwargs

    meant for vectorstore-specific parameters

    required

    Returns:

    Type Description list[str]

    List of ids of the embeddings

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids list[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

    Name Type Description Default embedding list[float]

    The query embedding (a list of floats)

    required top_k int

    Number of most similar embeddings to return

    1 ids Optional[list[str]]

    List of ids of the embeddings to be queried

    None

    Returns:

    Type Description tuple[list[list[float]], list[float], list[str]]

    the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
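As with the document stores, the vector-store backends below share this contract, so retrieval code can stay backend-agnostic. A sketch (assuming `kotaemon.storages` exports `BaseVectorStore`):

```python
# Sketch: helper written against the abstract vector-store interface.
from kotaemon.storages import BaseVectorStore

def nearest_ids(store: BaseVectorStore, query_vec: list[float], k: int = 2):
    # query() returns (embeddings, similarity scores, ids), per the ABC above
    _, scores, ids = store.query(embedding=query_vec, top_k=k)
    return list(zip(ids, scores))
```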
    "},{"location":"reference/storages/#storages.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/#storages.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/#storages.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n        \"\"\"Save the SimpleVectorStore to disk.\n\n        Args:\n            save_path: Path to save the vector store to on disk.\n            fs: Filesystem to use (an fsspec AbstractFileSystem)\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n        \"\"\"Load a SimpleVectorStore from a persist path.\n\n        Args:\n            load_path: Path to load the vector store from.\n            fs: Filesystem to use (an fsspec AbstractFileSystem)\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

    Save the SimpleVectorStore to disk.

    Parameters:

    Name Type Description Default save_path str

    Path to save the vector store to on disk.

    required fs Optional[AbstractFileSystem]

    Filesystem to use (an fsspec AbstractFileSystem)

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save the SimpleVectorStore to disk.\n\n    Args:\n        save_path: Path to save the vector store to on disk.\n        fs: Filesystem to use (an fsspec AbstractFileSystem)\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

    Load a SimpleVectorStore from a persist path.

    Parameters:

    Name Type Description Default load_path str

    Path to load the vector store from.

    required fs Optional[AbstractFileSystem]

    Filesystem to use (an fsspec AbstractFileSystem)

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load a SimpleVectorStore from a persist path.\n\n    Args:\n        load_path: Path to load the vector store from.\n        fs: Filesystem to use (an fsspec AbstractFileSystem)\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
    "},{"location":"reference/storages/#storages.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy-py'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
    "},{"location":"reference/storages/#storages.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/#storages.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/#storages.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/#storages.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
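    A minimal usage sketch (the URL assumes a local Qdrant instance; import path inferred from the location above). Unlike the Milvus wrapper, this store initializes its underlying LlamaIndex client eagerly in __init__:

    from kotaemon.storages import QdrantVectorStore

    store = QdrantVectorStore(
        collection_name="default",
        url="http://localhost:6333",   # assumed local Qdrant server; pass api_key for a hosted cluster
    )
    store.add(embeddings=[[0.1, 0.2, 0.3]], ids=["doc-1"])
    print(store.count())               # exact point count reported by the Qdrant client
    store.delete(ids=["doc-1"])        # removes points via qdrant_client models.PointIdsList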
    "},{"location":"reference/storages/#storages.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/#storages.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
    "},{"location":"reference/storages/#storages.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Similar to InMemoryVectorStore, but backed by a file on disk by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
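    A minimal sketch (the path value is illustrative): every add and delete re-persists the whole store to <path>/<collection_name>, and an existing file at that location is loaded back on construction:

    from kotaemon.storages import SimpleFileVectorStore

    store = SimpleFileVectorStore(path="./vectorstore", collection_name="default")
    store.add(embeddings=[[0.1, 0.2, 0.3]], ids=["doc-1"])   # persisted to ./vectorstore/default
    store.delete(ids=["doc-1"])                              # persisted again after deletion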
    "},{"location":"reference/storages/docstores/","title":"Docstores","text":""},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

    A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
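    Because every docstore implements this ABC, code can be written against the interface and any backend swapped in. A small sketch (import paths inferred from the module locations in this reference):

    from kotaemon.base import Document
    from kotaemon.storages.docstores import BaseDocumentStore, InMemoryDocumentStore

    def ingest(store: BaseDocumentStore, texts: list[str]) -> None:
        # Works with any BaseDocumentStore implementation
        store.add([Document(text=t) for t in texts])

    ingest(InMemoryDocumentStore(), ["hello", "world"])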
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    Document or list of documents

    required ids Optional[Union[List[str], str]]

    List of ids of the documents. Optional, if not set will use doc.doc_id

    None Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

    Elasticsearch document store supporting BM25 full-text search

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch document store supporting BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
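    A minimal usage sketch (assuming a local Elasticsearch instance at the default URL; import path inferred from the location above):

    from kotaemon.base import Document
    from kotaemon.storages.docstores import ElasticsearchDocumentStore

    store = ElasticsearchDocumentStore(
        collection_name="docstore",
        elasticsearch_url="http://localhost:9200",  # assumed local instance
    )
    store.add(Document(text="kotaemon ships several document stores"), ids="doc-1")
    hits = store.query("document stores", top_k=5)  # BM25 full-text search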
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None refresh_indices bool

    request Elasticsearch to update its index (default to True)

    True Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

    Parameters:

    Name Type Description Default query dict

    Elasticsearch query format

    required

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
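    For example, the BM25 search that query() builds internally can be expressed directly as a raw Elasticsearch body (store as in the sketch above; the body follows the standard ES query DSL):

    docs = store.query_raw({"query": {"match": {"content": "document stores"}}, "size": 5})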
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

    Parameters:

    Name Type Description Default query str

    query text

    required top_k int

    number of top documents to return. Defaults to 10.

    10

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

    Simple in-memory document store that keeps documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exist\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
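    A minimal sketch of the duplicate-id behaviour (exist_ok is read from kwargs and defaults to False):

    from kotaemon.base import Document
    from kotaemon.storages.docstores import InMemoryDocumentStore

    store = InMemoryDocumentStore()
    doc = Document(text="hello")
    store.add(doc)                    # keyed by doc.doc_id
    store.add(doc, exist_ok=True)     # without exist_ok=True this would raise ValueError
    assert store.count() == 1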
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    if False (the default), raise an error when a duplicate doc-id is found in the docstore; if True, silently overwrite the existing document

    False Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
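    A save/load round trip, under the caveat in the TODO above that Document subclasses may lose information (store as in the sketch for this class):

    store.save("docstore.json")            # writes {doc_id: Document.to_dict()} as JSON
    restored = InMemoryDocumentStore()
    restored.load("docstore.json")
    assert restored.count() == store.count()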
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store (not supported by this store; always returns an empty list)

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

    LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy-py'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                .where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
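    A minimal usage sketch (the path is illustrative; note that add expects a list of documents, and rebuilds the tantivy FTS index after each call unless refresh_indices=False):

    from kotaemon.base import Document
    from kotaemon.storages.docstores import LanceDBDocumentStore

    store = LanceDBDocumentStore(path="lancedb", collection_name="docstore")
    store.add([Document(text="full-text search backed by tantivy")], ids=["doc-1"])
    hits = store.query("tantivy", top_k=10)   # FTS over the indexed "text" column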
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

    Extends InMemoryDocumentStore by auto-saving whenever the corpus changes

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
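    A minimal sketch: the store persists itself to <path>/<collection_name>.json and reloads that file on construction, so documents survive restarts:

    from kotaemon.base import Document
    from kotaemon.storages.docstores import SimpleFileDocumentStore

    store = SimpleFileDocumentStore(path="./docstore")   # auto-loads ./docstore/default.json if present
    store.add(Document(text="auto-persisted"))           # every add/delete re-saves the JSON file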
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    if False (the default), raise an error when a duplicate doc-id is found in the docstore; if True, silently overwrite the existing document

    False Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/docstores/base/","title":"Base","text":""},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

    A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    Document or list of documents

    required ids Optional[Union[List[str], str]]

    List of ids of the documents. Optional, if not set will use doc.doc_id

    None Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/elasticsearch/","title":"Elasticsearch","text":""},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

    Elasticsearch document store supporting BM25 full-text search

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch document store supporting BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| docs | Union[Document, List[Document]] | list of documents to add | required |
| ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
| refresh_indices | bool | request Elasticsearch to update its index (default to True) | True |

Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | dict | Elasticsearch query format | required |

Returns:

| Type | Description |
| --- | --- |
| List[Document] | List of result documents |

Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| query | str | query text | required |
| top_k | int | number of top documents to return. Defaults to 10. | 10 |

Returns:

| Type | Description |
| --- | --- |
| List[Document] | List of result documents |

Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
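Continuing the sketch above, the `doc_ids` argument (hypothetical ids shown) narrows the BM25 search to a subset of documents by wrapping the match query in a bool/terms filter:

```python
# Only documents whose _id is in the given list are considered
hits = store.query("vector database", top_k=3, doc_ids=["doc-1", "doc-7"])
```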
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/in_memory/","title":"In Memory","text":""},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

Simple in-memory document store that keeps documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exist\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
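A short sketch of the typical lifecycle (import paths assumed as above; the file name is illustrative):

```python
from kotaemon.base import Document
from kotaemon.storages import InMemoryDocumentStore

store = InMemoryDocumentStore()
doc = Document(text="hello world")

store.add(doc)                 # stored under doc.doc_id
store.add(doc, exist_ok=True)  # without exist_ok=True this would raise ValueError
print(store.count())           # -> 1

store.save("docstore.json")    # JSON snapshot built from Document.to_dict()
```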
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| docs | Union[Document, List[Document]] | list of documents to add | required |
| ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
| exist_ok | | raise error when duplicate doc-id found in the docstore (default to False) | required |

Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
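Restoring is the mirror image of save; per the TODO in the source, the round trip is not lossless for Document subclasses:

```python
restored = InMemoryDocumentStore()
restored.load("docstore.json")   # rebuilds the dict via Document.from_dict()
print(restored.count())
```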
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/docstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LancdDB document store which support full-text search query\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                
.where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
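A usage sketch (assumes lancedb and tantivy are installed and the usual kotaemon.storages re-export):

```python
from kotaemon.base import Document
from kotaemon.storages import LanceDBDocumentStore

store = LanceDBDocumentStore(path="./lancedb", collection_name="demo")

docs = [Document(text="full-text search backed by tantivy")]
store.add(docs)   # also (re)builds the FTS index on the "text" column
print(store.query("tantivy", top_k=5))
```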
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/docstores/simple_file/","title":"Simple File","text":""},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

Improves InMemoryDocumentStore by auto-saving whenever the corpus changes

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
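Because every mutation triggers save(), a second instance pointed at the same path sees the same corpus; a minimal sketch (paths illustrative):

```python
from kotaemon.base import Document
from kotaemon.storages import SimpleFileDocumentStore

store = SimpleFileDocumentStore(path="./docstore")   # writes ./docstore/default.json
store.add(Document(text="persisted automatically"))  # add() calls save() internally

again = SimpleFileDocumentStore(path="./docstore")   # reloads the JSON on init
print(again.count())
```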
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| docs | Union[Document, List[Document]] | list of documents to add | required |
| ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
| exist_ok | | raise error when duplicate doc-id found in the docstore (default to False) | required |

Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/vectorstores/","title":"Vectorstores","text":""},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
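To make the contract concrete, here is a toy dict-backed implementation (purely illustrative, not one of kotaemon's bundled stores; it assumes raw float lists rather than DocumentWithEmbedding and ranks by cosine similarity):

```python
import math
from uuid import uuid4

from kotaemon.storages.vectorstores.base import BaseVectorStore


class ToyVectorStore(BaseVectorStore):
    """Illustrative in-memory store satisfying the BaseVectorStore contract."""

    def __init__(self):
        self._vectors: dict[str, list[float]] = {}

    def add(self, embeddings, metadatas=None, ids=None) -> list[str]:
        # Assumes raw float lists; a real store would also accept
        # DocumentWithEmbedding and persist the metadatas.
        ids = ids or [str(uuid4()) for _ in embeddings]
        self._vectors.update(zip(ids, embeddings))
        return ids

    def delete(self, ids, **kwargs):
        for id_ in ids:
            self._vectors.pop(id_, None)

    def query(self, embedding, top_k=1, ids=None, **kwargs):
        def cosine(a, b):
            denom = math.hypot(*a) * math.hypot(*b)
            return sum(x * y for x, y in zip(a, b)) / denom if denom else 0.0

        candidates = [i for i in (ids or self._vectors) if i in self._vectors]
        ranked = sorted(
            candidates,
            key=lambda i: cosine(embedding, self._vectors[i]),
            reverse=True,
        )[:top_k]
        return (
            [self._vectors[i] for i in ranked],                     # embeddings
            [cosine(embedding, self._vectors[i]) for i in ranked],  # scores
            ranked,                                                 # ids
        )

    def drop(self):
        self._vectors = {}
```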
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| embeddings | list[list[float]] \| list[DocumentWithEmbedding] | List of embeddings | required |
| metadatas | Optional[list[dict]] | List of metadata of the embeddings | None |
| ids | Optional[list[str]] | List of ids of the embeddings | None |
| kwargs | | meant for vectorstore-specific parameters | required |

Returns:

| Type | Description |
| --- | --- |
| list[str] | List of ids of the embeddings |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | list[str] | List of ids of the embeddings to be deleted | required |
| kwargs | | meant for vectorstore-specific parameters | {} |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| embedding | list[float] | List of embeddings | required |
| top_k | int | Number of most similar embeddings to return | 1 |
| ids | Optional[list[str]] | List of ids of the embeddings to be queried | None |

Returns:

| Type | Description |
| --- | --- |
| tuple[list[list[float]], list[float], list[str]] | the matched embeddings, the similarity scores, and the ids |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
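Typical use, as a sketch; add and query are inherited from the LlamaIndex-backed base wrapper and follow the BaseVectorStore signatures above (embeddings and ids are illustrative):

```python
from kotaemon.storages import ChromaVectorStore  # assumed re-export path

vs = ChromaVectorStore(path="./chroma", collection_name="demo")

vs.add(
    embeddings=[[0.1, 0.2, 0.3], [0.9, 0.1, 0.0]],
    metadatas=[{"file_id": "a"}, {"file_id": "b"}],
    ids=["v1", "v2"],
)
embs, scores, ids = vs.query([0.1, 0.2, 0.25], top_k=1)
vs.delete(["v2"])
```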
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | List[str] | List of ids of the embeddings to be deleted | required |
| kwargs | | meant for vectorstore-specific parameters | {} |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n\n        \"\"\"save a simpleVectorStore to a dictionary.\n\n        Args:\n            save_path: Path of saving vector to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n        \"\"\"Create a SimpleKVStore from a load directory.\n\n        Args:\n            load_path: Path of loading vector.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
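A persistence round trip, as a sketch (file name illustrative):

```python
from kotaemon.storages import InMemoryVectorStore

vs = InMemoryVectorStore()
vs.add(embeddings=[[0.0, 1.0]], ids=["v1"])

vs.save("vectors.json")        # persists the underlying SimpleVectorStore

fresh = InMemoryVectorStore()
fresh.load("vectors.json")     # restores it via from_persist_path
```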
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

Save the SimpleVectorStore to disk.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| save_path | str | Path of saving vector to disk. | required |
| fs | Optional[AbstractFileSystem] | An abstract super-class for pythonic file-systems | None |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n\n    \"\"\"save a simpleVectorStore to a dictionary.\n\n    Args:\n        save_path: Path of saving vector to disk.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

Load a SimpleVectorStore from a persist path.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| load_path | str | Path of loading vector. | required |
| fs | Optional[AbstractFileSystem] | An abstract super-class for pythonic file-systems | None |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n    \"\"\"Create a SimpleKVStore from a load directory.\n\n    Args:\n        load_path: Path of loading vector.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
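Note that `_metadata_keys` is pinned to `["file_id"]`, so only that metadata field survives; a brief sketch (values illustrative):

```python
from kotaemon.storages import LanceDBVectorStore

vs = LanceDBVectorStore(path="./lancedb", collection_name="vectors")
# Only the "file_id" metadata key is kept (see _metadata_keys above)
vs.add(embeddings=[[0.3, 0.7]], metadatas=[{"file_id": "f1"}], ids=["v1"])
```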
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | List[str] | List of ids of the embeddings to be deleted | required |
| kwargs | | meant for vectorstore-specific parameters | {} |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
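Because of the lazy initialization, no Milvus client is created until an operation reveals the vector dimension; a sketch (the explicit `path` kwarg is an assumption, since `_lazy_init` joins it onto non-http URIs):

```python
from kotaemon.storages import MilvusVectorStore

vs = MilvusVectorStore(uri="milvus.db", collection_name="demo", path="./")
# The client is only created here, with dim inferred from the first embedding
vs.add(embeddings=[[0.1] * 128], ids=["v1"])
```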
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
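A sketch against a local Qdrant instance (URL assumed; Qdrant point ids must be UUIDs or unsigned integers):

```python
from uuid import uuid4

from kotaemon.storages import QdrantVectorStore

vs = QdrantVectorStore(collection_name="demo", url="http://localhost:6333")

point_id = str(uuid4())   # Qdrant requires UUID or integer point ids
vs.add(embeddings=[[0.5, 0.5]], ids=[point_id])
vs.delete([point_id])     # delegates to qdrant_client with a PointIdsList selector
```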
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | List[str] | List of ids of the embeddings to be deleted | required |
| kwargs | | meant for vectorstore-specific parameters | {} |

Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

Similar to InMemoryVectorStore but backed by a file on disk by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
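A short sketch; each add or delete re-persists the collection file (paths illustrative):

```python
from kotaemon.storages import SimpleFileVectorStore

vs = SimpleFileVectorStore(path="./vectors", collection_name="default")
vs.add(embeddings=[[1.0, 0.0]], ids=["v1"])   # persisted to ./vectors/default
```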
    "},{"location":"reference/storages/vectorstores/base/","title":"Base","text":""},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

Parameters:

• embeddings (list[list[float]] | list[DocumentWithEmbedding], required): List of embeddings
• metadatas (Optional[list[dict]], default: None): List of metadata of the embeddings
• ids (Optional[list[str]], default: None): List of ids of the embeddings
• kwargs (required): meant for vectorstore-specific parameters

Returns:

• list[str]: List of ids of the embeddings

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

Parameters:

• ids (list[str], required): List of ids of the embeddings to be deleted
• kwargs (default: {}): meant for vectorstore-specific parameters

Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

Parameters:

• embedding (list[float], required): List of embeddings
• top_k (int, default: 1): Number of most similar embeddings to return
• ids (Optional[list[str]], default: None): List of ids of the embeddings to be queried

Returns:

• tuple[list[list[float]], list[float], list[str]]: the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
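To make the contract concrete, here is a hedged usage sketch driving the interface through the InMemoryVectorStore implementation documented elsewhere on this site; the embeddings, ids, and metadata are illustrative, and the import path assumes the class is exported from kotaemon.storages.vectorstores:

from kotaemon.storages.vectorstores import InMemoryVectorStore\n\nstore = InMemoryVectorStore()\n\n# add three 2-d embeddings with explicit ids and metadata\nstore.add(\n    embeddings=[[0.1, 0.2], [0.9, 0.8], [0.0, 1.0]],\n    metadatas=[{\"source\": \"a\"}, {\"source\": \"b\"}, {\"source\": \"c\"}],\n    ids=[\"a\", \"b\", \"c\"],\n)\n\n# retrieve the two embeddings most similar to a query vector\n_, scores, hit_ids = store.query(embedding=[0.1, 0.25], top_k=2)\n\nstore.delete(ids=[\"a\"])  # remove one embedding by id\nstore.drop()  # discard the whole store\n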
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore","title":"LlamaIndexVectorStore","text":"

    Bases: BaseVectorStore

    Mixin for LlamaIndex based vectorstores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
class LlamaIndexVectorStore(BaseVectorStore):\n    \"\"\"Mixin for LlamaIndex based vectorstores\"\"\"\n\n    _li_class: type[LIVectorStore | BasePydanticVectorStore] | None\n\n    def _get_li_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant LlamaIndex class in _get_li_class\"\n        )\n\n    def __init__(self, *args, **kwargs):\n        # get li_class from the method if not set\n        if not self._li_class:\n            LIClass = self._get_li_class()\n        else:\n            LIClass = self._li_class\n\n        from dataclasses import fields\n\n        self._client = LIClass(*args, **kwargs)\n\n        self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}\n        for key in [\"query_embedding\", \"similarity_top_k\", \"node_ids\"]:\n            if key in self._vsq_kwargs:\n                self._vsq_kwargs.remove(key)\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\"):\n            return super().__setattr__(name, value)\n\n        return setattr(self._client, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        if name == \"_li_class\":\n            return super().__getattribute__(name)\n\n        return getattr(self._client, name)\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if isinstance(embeddings[0], list):\n            nodes: list[DocumentWithEmbedding] = [\n                DocumentWithEmbedding(embedding=embedding) for embedding in embeddings\n            ]\n        else:\n            nodes = embeddings  # type: ignore\n        if metadatas is not None:\n            for node, metadata in zip(nodes, metadatas):\n                node.metadata = metadata\n        if ids is not None:\n            for node, id in zip(nodes, ids):\n                node.id_ = id\n                node.relationships = {\n                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)\n                }\n\n        return self._client.add(nodes=nodes)\n\n    def delete(self, ids: list[str], **kwargs):\n        for id_ in ids:\n            self._client.delete(ref_doc_id=id_, **kwargs)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n            kwargs: extra query parameters. Depending on the name, these parameters\n                will be used when constructing the VectorStoreQuery object or when\n                performing querying of the underlying vector store.\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        vsq_kwargs = {}\n        vs_kwargs = {}\n        for kwkey, kwvalue in kwargs.items():\n            if kwkey in self._vsq_kwargs:\n                vsq_kwargs[kwkey] = kwvalue\n            else:\n                vs_kwargs[kwkey] = kwvalue\n\n        output = self._client.query(\n            query=VectorStoreQuery(\n                query_embedding=embedding,\n                similarity_top_k=top_k,\n                node_ids=ids,\n                **vsq_kwargs,\n            ),\n            **vs_kwargs,\n        )\n\n        embeddings = []\n        if output.nodes:\n            for node in output.nodes:\n                embeddings.append(node.embedding)\n        similarities = output.similarities if output.similarities else []\n        out_ids = output.ids if output.ids else []\n\n        return embeddings, similarities, out_ids\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore.query","title":"query","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

Parameters:

• embedding (list[float], required): List of embeddings
• top_k (int, default: 1): Number of most similar embeddings to return
• ids (Optional[list[str]], default: None): List of ids of the embeddings to be queried
• kwargs (default: {}): extra query parameters. Depending on the name, these parameters will be used when constructing the VectorStoreQuery object or when performing querying of the underlying vector store.

Returns:

• tuple[list[list[float]], list[float], list[str]]: the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    def query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n        kwargs: extra query parameters. Depending on the name, these parameters\n            will be used when constructing the VectorStoreQuery object or when\n            performing querying of the underlying vector store.\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    vsq_kwargs = {}\n    vs_kwargs = {}\n    for kwkey, kwvalue in kwargs.items():\n        if kwkey in self._vsq_kwargs:\n            vsq_kwargs[kwkey] = kwvalue\n        else:\n            vs_kwargs[kwkey] = kwvalue\n\n    output = self._client.query(\n        query=VectorStoreQuery(\n            query_embedding=embedding,\n            similarity_top_k=top_k,\n            node_ids=ids,\n            **vsq_kwargs,\n        ),\n        **vs_kwargs,\n    )\n\n    embeddings = []\n    if output.nodes:\n        for node in output.nodes:\n            embeddings.append(node.embedding)\n    similarities = output.similarities if output.similarities else []\n    out_ids = output.ids if output.ids else []\n\n    return embeddings, similarities, out_ids\n
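The kwargs routing can be seen in a short hedged example: a kwarg whose name matches a VectorStoreQuery field (alpha is one such field in current LlamaIndex versions) is folded into the query object, while any other kwarg is forwarded to the underlying store's query call:

# \"alpha\" matches a VectorStoreQuery field, so it is set on the query object;\n# a name the dataclass does not define would be passed to the client instead\nembeddings, scores, ids = store.query(embedding=[0.1, 0.2], top_k=5, alpha=0.5)\n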
    "},{"location":"reference/storages/vectorstores/chroma/","title":"Chroma","text":""},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

Parameters:

• ids (List[str], required): List of ids of the embeddings to be deleted
• kwargs (default: {}): meant for vectorstore-specific parameters

Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
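A minimal instantiation sketch (hedged: the path and collection name are illustrative, and the import path assumes the class is exported from kotaemon.storages.vectorstores):

from kotaemon.storages.vectorstores import ChromaVectorStore\n\n# persistent local collection backed by chromadb\nstore = ChromaVectorStore(path=\"./chroma\", collection_name=\"demo\")\nstore.add(embeddings=[[0.1, 0.2, 0.3]], ids=[\"doc-1\"])\nstore.delete([\"doc-1\"])\nstore.drop()  # removes the entire collection\n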
    "},{"location":"reference/storages/vectorstores/in_memory/","title":"In Memory","text":"

    Simple vector store index.

    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n\n        \"\"\"save a simpleVectorStore to a dictionary.\n\n        Args:\n            save_path: Path of saving vector to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n        \"\"\"Create a SimpleKVStore from a load directory.\n\n        Args:\n            load_path: Path of loading vector.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

Save a SimpleVectorStore to disk.

Parameters:

• save_path (str, required): Path of saving vector to disk.
• fs (Optional[AbstractFileSystem], default: None): An abstract super-class for pythonic file-systems

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n\n    \"\"\"save a simpleVectorStore to a dictionary.\n\n    Args:\n        save_path: Path of saving vector to disk.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

Load a SimpleVectorStore from a persist path.

Parameters:

• load_path (str, required): Path of loading vector.
• fs (Optional[AbstractFileSystem], default: None): An abstract super-class for pythonic file-systems

Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n    \"\"\"Create a SimpleKVStore from a load directory.\n\n    Args:\n        load_path: Path of loading vector.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
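Putting save and load together, a hedged round-trip sketch (the file path is illustrative):

store = InMemoryVectorStore()\nstore.add(embeddings=[[0.1, 0.2]], ids=[\"n1\"])\nstore.save(\"./vectors.json\")  # persist the store to disk\n\nrestored = InMemoryVectorStore()\nrestored.load(\"./vectors.json\")  # reload the persisted data\n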
    "},{"location":"reference/storages/vectorstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
    "},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

Parameters:

• ids (List[str], required): List of ids of the embeddings to be deleted
• kwargs (default: {}): meant for vectorstore-specific parameters

Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/vectorstores/milvus/","title":"Milvus","text":""},{"location":"reference/storages/vectorstores/milvus/#storages.vectorstores.milvus.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
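Because of the lazy initialization, no Milvus collection exists until the first call that needs one; a hedged sketch (the local file-based uri and the 4-dimensional embedding are illustrative):

store = MilvusVectorStore(uri=\"./milvus.db\", collection_name=\"demo\")\n\n# the first add infers dim=4 from the embedding and creates the collection\nstore.add(embeddings=[[0.1, 0.2, 0.3, 0.4]], ids=[\"n1\"])\nprint(store.count())  # 1\n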
    "},{"location":"reference/storages/vectorstores/qdrant/","title":"Qdrant","text":""},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

Parameters:

• ids (List[str], required): List of ids of the embeddings to be deleted
• kwargs (default: {}): meant for vectorstore-specific parameters

Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
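A minimal connection sketch (hedged: the URL assumes a Qdrant server running locally on its default port, and the collection name is illustrative):

store = QdrantVectorStore(\n    collection_name=\"demo\",\n    url=\"http://localhost:6333\",  # assumes a local Qdrant instance\n)\nprint(store.count())  # exact point count of the collection\n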
    "},{"location":"reference/storages/vectorstores/simple_file/","title":"Simple File","text":"

    Simple file vector store index.

    "},{"location":"reference/storages/vectorstores/simple_file/#storages.vectorstores.simple_file.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Similar to InMemoryVectorStore but is backed by file by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Quick Start","text":""},{"location":"#getting-started-with-kotaemon","title":"Getting Started with Kotaemon","text":"

This page is intended for end users who want to use the kotaemon tool for Question Answering on local documents. If you are a developer who wants to contribute to the project, please visit the development page.

    "},{"location":"#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"

    Visit this guide.

    "},{"location":"#installation-offline","title":"Installation (Offline)","text":""},{"location":"#download","title":"Download","text":"

    Download the kotaemon-app.zip file from the latest release.

    "},{"location":"#run-setup-script","title":"Run setup script","text":"
    1. Unzip the downloaded file.
    2. Navigate to the scripts folder and start an installer that matches your OS:
      • Windows: run_windows.bat. Just double click the file.
      • macOS: run_macos.sh
        1. Right click on your file and select Open with and Other.
        2. Enable All Applications and choose Terminal.
        3. NOTE: If you always want to open that file with Terminal, then check Always Open With.
        4. From now on, double click on your file and it should work.
      • Linux: run_linux.sh. Please run the script using bash run_linux.sh in your terminal.
3. After the installation, the installer will ask to launch the ktem UI; answer to continue.
4. Once launched, the application will open automatically in your browser.
    "},{"location":"#launch","title":"Launch","text":"

    To launch the app after initial setup or any change, simply run the run_* script again.

A browser window will open and greet you with this screen:

    "},{"location":"#usage","title":"Usage","text":"

    For how to use the application, see Usage. This page will also be available to you within the application.

    "},{"location":"#feedback","title":"Feedback","text":"

    Feel free to create a bug report or a feature request on our repo.

    "},{"location":"about/","title":"About Kotaemon","text":""},{"location":"about/#about-kotaemon","title":"About Kotaemon","text":"

    An open-source tool for chatting with your documents. Built with both end users and developers in mind.

    Source Code | Live Demo

    User Guide | Developer Guide | Feedback

    Dark Mode | Light Mode

    "},{"location":"local_model/","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#setup-local-llms-embedding-models","title":"Setup local LLMs & Embedding models","text":""},{"location":"local_model/#prepare-local-models","title":"Prepare local models","text":""},{"location":"local_model/#note","title":"NOTE","text":"

When using the Docker image, please replace http://localhost with http://host.docker.internal to correctly communicate with services on the host machine. See more details.

    "},{"location":"local_model/#ollama-openai-compatible-server-recommended","title":"Ollama OpenAI compatible server (recommended)","text":"

    Install ollama and start the application.

Pull your models (e.g.):

    ollama pull llama3.1:8b\nollama pull nomic-embed-text\n

Set up the LLM and the embedding model on the Resources tab with type OpenAI. Use these model parameters to connect to Ollama:

api_key: ollama\nbase_url: http://localhost:11434/v1/\nmodel: llama3.1:8b (for llm) | nomic-embed-text (for embedding)\n
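To verify the connection outside kotaemon, you can run a quick sanity check with the openai Python client (a hedged sketch, assuming the openai package is installed and the models above have been pulled):

from openai import OpenAI\n\nclient = OpenAI(api_key=\"ollama\", base_url=\"http://localhost:11434/v1/\")\nresp = client.chat.completions.create(\n    model=\"llama3.1:8b\",\n    messages=[{\"role\": \"user\", \"content\": \"Say hello\"}],\n)\nprint(resp.choices[0].message.content)\n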

    "},{"location":"local_model/#oobaboogatext-generation-webui-openai-compatible-server","title":"oobabooga/text-generation-webui OpenAI compatible server","text":"

    Install oobabooga/text-generation-webui.

Follow the setup guide to download your models (GGUF, HF). Also take a look at OpenAI compatible server for detailed instructions.

Here is a short version:

    # install sentence-transformer for embeddings creation\npip install sentence_transformers\n# change to text-generation-webui src dir\npython server.py --api\n

Use the Models tab to download a new model and press Load.

Set up the LLM and the embedding model on the Resources tab with type OpenAI. Use these model parameters to connect to text-generation-webui:

    api_key: dummy\nbase_url: http://localhost:5000/v1/\nmodel: any\n
    "},{"location":"local_model/#llama-cpp-python-server-llm-only","title":"llama-cpp-python server (LLM only)","text":"

    See llama-cpp-python OpenAI server.

    Download any GGUF model weight on HuggingFace or other source. Place it somewhere on your local machine.

    Run

    LOCAL_MODEL=<path/to/GGUF> python scripts/serve_local.py\n

Set up the LLM model on the Resources tab with type OpenAI. Use these model parameters to connect to llama-cpp-python:

    api_key: dummy\nbase_url: http://localhost:8000/v1/\nmodel: model_name\n
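The same openai-client sanity check shown in the Ollama section works here as well, pointed at the llama-cpp-python server (a hedged sketch; the server typically accepts any model name):

from openai import OpenAI\n\nclient = OpenAI(api_key=\"dummy\", base_url=\"http://localhost:8000/v1/\")\nresp = client.chat.completions.create(\n    model=\"model_name\",  # llama-cpp-python generally accepts any name here\n    messages=[{\"role\": \"user\", \"content\": \"Say hello\"}],\n)\nprint(resp.choices[0].message.content)\n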
    "},{"location":"local_model/#use-local-models-for-rag","title":"Use local models for RAG","text":"

    You are set! Start a new conversation to test your local RAG pipeline.

    "},{"location":"online_install/","title":"Online install","text":""},{"location":"online_install/#installation-online-huggingface-space","title":"Installation (Online HuggingFace Space)","text":"
1. Go to kotaemon_template
2. Use the Duplicate function to create your own space
3. Wait for the build to complete and start up (approximately 10 minutes).
4. Follow the first setup instructions (and register for a Cohere API key if needed)
5. Complete the setup and use your own private space!
    "},{"location":"usage/","title":"Basic Usage","text":""},{"location":"usage/#1-add-your-ai-models","title":"1. Add your AI models","text":"

    To add a model:

    1. Navigate to the Resources tab.
    2. Select the LLMs sub-tab.
    3. Select the Add sub-tab.
4. Configure the model to add:
      • Give it a name.
      • Pick a vendor/provider (e.g. ChatOpenAI).
      • Provide the specifications.
      • (Optional) Set the model as default.
    5. Click Add to add the model.
6. Select the Embedding Models sub-tab and repeat steps 3 to 5 to add an embedding model.
    (Optional) Configure model via the .env file

    Alternatively, you can configure the models via the .env file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.

    Currently, the following providers are supported:

    "},{"location":"usage/#openai","title":"OpenAI","text":"

In the .env file, set the OPENAI_API_KEY variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.

    OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
    "},{"location":"usage/#azure-openai","title":"Azure OpenAI","text":"

For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide the deployment names for the chat model and the embedding model, depending on how you set up your Azure deployment.

    AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview # could be different for you\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo # change to your deployment name\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 # change to your deployment name\n
    "},{"location":"usage/#local-models","title":"Local models","text":"

    Pros:

    Cons:

    "},{"location":"usage/#find-and-download-a-llm","title":"Find and download a LLM","text":"

You can search for and download an LLM to run locally from the Hugging Face Hub. Currently, these model formats are supported:

You should choose a model whose size is less than your device's available memory, leaving about 2 GB to spare. For example, if you have 16 GB of RAM in total, of which 12 GB is available, you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generations but also take more processing time.

    Here are some recommendations and their size in memory:

    "},{"location":"usage/#enable-local-models","title":"Enable local models","text":"

    To add a local model to the model pool, set the LOCAL_MODEL variable in the .env file to the path of the model file.

    LOCAL_MODEL=<full path to your model file>\n

    Here is how to get the full path of your model file:

    "},{"location":"usage/#2-upload-your-documents","title":"2. Upload your documents","text":"

    In order to do QA on your documents, you need to upload them to the application first. Navigate to the File Index tab and you will see 2 sections:

    1. File upload:
      • Drag and drop your file to the UI or select it from your file system. Then click Upload and Index.
      • The application will take some time to process the file and show a message once it is done.
    2. File list:
      • This section shows the list of files that have been uploaded to the application and allows users to delete them.
    "},{"location":"usage/#3-chat-with-your-documents","title":"3. Chat with your documents","text":"

    Now navigate back to the Chat tab. The chat tab is divided into 3 regions:

    1. Conversation Settings Panel
      • Here you can select, create, rename, and delete conversations.
        • By default, a new conversation is created automatically if no conversation is selected.
      • Below that you have the file index, where you can choose whether to disable, select all files, or select which files to retrieve references from.
        • If you choose \"Disabled\", no files will be considered as context during chat.
        • If you choose \"Search All\", all files will be considered during chat.
        • If you choose \"Select\", a dropdown will appear for you to select the files to be considered during chat. If no files are selected, then no files will be considered during chat.
    2. Chat Panel
      • This is where you can chat with the chatbot.
    3. Information Panel

Generally, the score quality ranks as LLM relevance score > reranking score > vector score. By default, the overall relevance score is taken directly from the LLM relevance score. Evidence is sorted by its overall relevance score and by whether it has a citation.

    "},{"location":"development/","title":"Development","text":""},{"location":"development/#kotaemon","title":"kotaemon","text":"

    An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind.

    Live Demo | Source Code

    User Guide | Developer Guide | Feedback

    "},{"location":"development/#introduction","title":"Introduction","text":"

    This project serves as a functional RAG UI for both end users who want to do QA on their documents and developers who want to build their own RAG pipeline.

    +----------------------------------------------------------------------------+\n| End users: Those who use apps built with `kotaemon`.                       |\n| (You use an app like the one in the demo above)                            |\n|     +----------------------------------------------------------------+     |\n|     | Developers: Those who built with `kotaemon`.                   |     |\n|     | (You have `import kotaemon` somewhere in your project)         |     |\n|     |     +----------------------------------------------------+     |     |\n|     |     | Contributors: Those who make `kotaemon` better.    |     |     |\n|     |     | (You make PR to this repo)                         |     |     |\n|     |     +----------------------------------------------------+     |     |\n|     +----------------------------------------------------------------+     |\n+----------------------------------------------------------------------------+\n

    This repository is under active development. Feedback, issues, and PRs are highly appreciated.

    "},{"location":"development/#key-features","title":"Key Features","text":" "},{"location":"development/#installation","title":"Installation","text":""},{"location":"development/#for-end-users","title":"For end users","text":"

This document is intended for developers. If you just want to install and use the app as it is, please follow the non-technical User Guide. Use the most recent release .zip to get the latest features and bug fixes.

    "},{"location":"development/#for-developers","title":"For developers","text":""},{"location":"development/#with-docker-recommended","title":"With Docker (recommended)","text":"

We provide both lite and full versions of the Docker image. The full version additionally installs the extra packages of unstructured, which adds support for more file types (.doc, .docx, ...) at the cost of a larger image size. The lite image should work well for most users.

    docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-lite\n
    docker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-full\n

Currently, two platforms are provided and tested: linux/amd64 and linux/arm64 (for newer Macs). You can specify the platform by passing --platform to the docker run command. For example:

    # To run docker with platform linux/arm64\ndocker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-p 7860:7860 -it --rm \\\n--platform linux/arm64 \\\nghcr.io/cinnamon/kotaemon:main-lite\n
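Since all application data lives in the ./ktem_app_data folder (see below), you may want to bind-mount that folder so your data survives container restarts; a hedged sketch, assuming the image's working directory is /app:

# illustrative: mount the app data folder to persist it across runs\ndocker run \\\n-e GRADIO_SERVER_NAME=0.0.0.0 \\\n-e GRADIO_SERVER_PORT=7860 \\\n-v ./ktem_app_data:/app/ktem_app_data \\\n-p 7860:7860 -it --rm \\\nghcr.io/cinnamon/kotaemon:main-lite\n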

If everything is set up correctly, navigate to http://localhost:7860/ to access the web UI.

We use GHCR to store Docker images; all images can be found here.

    "},{"location":"development/#without-docker","title":"Without Docker","text":"
    # optional (setup env)\nconda create -n kotaemon python=3.10\nconda activate kotaemon\n\n# clone this repo\ngit clone https://github.com/Cinnamon/kotaemon\ncd kotaemon\n\npip install -e \"libs/kotaemon[all]\"\npip install -e \"libs/ktem\"\n
    python app.py\n

    The app will be automatically launched in your browser.

The default username / password is admin / admin. You can set up additional users directly in the UI.

    "},{"location":"development/#setup-local-models-for-local-private-rag","title":"Setup local models (for local / private RAG)","text":"

    See Local model setup.

    "},{"location":"development/#customize-your-application","title":"Customize your application","text":"

By default, all application data is stored in the ./ktem_app_data folder. You can back up or copy this folder to move your installation to a new machine.

For advanced users or specific use cases, you can customize these files:

    "},{"location":"development/#flowsettingspy","title":"flowsettings.py","text":"

    This file contains the configuration of your application. You can use the example here as the starting point.

    Notable settings
# setup your preferred document store (with full-text search capabilities)\nKH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)\n\n# setup your preferred vectorstore (for vector-based search)\nKH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)\n\n# Enable / disable multimodal QA\nKH_REASONINGS_USE_MULTIMODAL=True\n\n# Setup your new reasoning pipeline or modify existing one.\nKH_REASONINGS = [\n    \"ktem.reasoning.simple.FullQAPipeline\",\n    \"ktem.reasoning.simple.FullDecomposeQAPipeline\",\n    \"ktem.reasoning.react.ReactAgentPipeline\",\n    \"ktem.reasoning.rewoo.RewooAgentPipeline\",\n]\n
    "},{"location":"development/#env","title":".env","text":"

    This file provides another way to configure your models and credentials.

    Configure model via the .env file

    Alternatively, you can configure the models via the .env file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.

    Currently, the following providers are supported:

    "},{"location":"development/#openai","title":"OpenAI","text":"

In the .env file, set the OPENAI_API_KEY variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.

    OPENAI_API_BASE=https://api.openai.com/v1\nOPENAI_API_KEY=<your OpenAI API key here>\nOPENAI_CHAT_MODEL=gpt-3.5-turbo\nOPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002\n
    "},{"location":"development/#azure-openai","title":"Azure OpenAI","text":"

For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide the deployment names for the chat model and the embedding model, depending on how you set up your Azure deployment.

    AZURE_OPENAI_ENDPOINT=\nAZURE_OPENAI_API_KEY=\nOPENAI_API_VERSION=2024-02-15-preview\nAZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo\nAZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002\n
    "},{"location":"development/#local-models","title":"Local models","text":""},{"location":"development/#using-ollama-openai-compatible-server","title":"Using ollama OpenAI compatible server","text":"

    Install ollama and start the application.

    Pull your model (e.g):

    ollama pull llama3.1:8b\nollama pull nomic-embed-text\n

Set the model names in the web UI and make them the default.

    "},{"location":"development/#using-gguf-with-llama-cpp-python","title":"Using GGUF with llama-cpp-python","text":"

You can search for and download an LLM to run locally from the Hugging Face Hub. Currently, these model formats are supported:

You should choose a model whose size is less than your device's available memory, leaving about 2 GB to spare. For example, if you have 16 GB of RAM in total, of which 12 GB is available, you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generations but also take more processing time.

    Here are some recommendations and their size in memory:

Add a new LlamaCpp model with the provided model name on the web UI.

    "},{"location":"development/#adding-your-own-rag-pipeline","title":"Adding your own RAG pipeline","text":""},{"location":"development/#custom-reasoning-pipeline","title":"Custom reasoning pipeline","text":"

First, check the default pipeline implementation here. You can make quick adjustments to how the default QA pipeline works.

Next, if you feel comfortable adding a new pipeline, add a new .py implementation in libs/ktem/ktem/reasoning/ and later include it in flowsettings.py to enable it on the UI.

    "},{"location":"development/#custom-indexing-pipeline","title":"Custom indexing pipeline","text":"

Check the sample implementation in libs/ktem/ktem/index/file/graph

(more instructions WIP).

    "},{"location":"development/#developer-guide","title":"Developer guide","text":"

    Please refer to the Developer Guide for more details.

    "},{"location":"development/#star-history","title":"Star History","text":""},{"location":"development/contributing/","title":"Contributing","text":""},{"location":"development/contributing/#contributing","title":"Contributing","text":""},{"location":"development/contributing/#setting-up","title":"Setting up","text":" "},{"location":"development/contributing/#package-overview","title":"Package overview","text":"

The kotaemon library focuses on the AI building blocks needed to implement a RAG-based QA application. It consists of base interfaces, core components, and a list of utilities:

    mindmap\n  root((kotaemon))\n    Base Interfaces\n      Document\n      LLMInterface\n      RetrievedDocument\n      BaseEmbeddings\n      BaseChat\n      BaseCompletion\n      ...\n    Core Components\n      LLMs\n        AzureOpenAI\n        OpenAI\n      Embeddings\n        AzureOpenAI\n        OpenAI\n        HuggingFaceEmbedding\n      VectorStore\n        InMemoryVectorstore\n        ChromaVectorstore\n      Agent\n      Tool\n      DocumentStore\n      ...\n    Utilities\n      Scaffold project\n      PromptUI\n      Documentation Support
    "},{"location":"development/contributing/#common-conventions","title":"Common conventions","text":""},{"location":"development/contributing/#environment-caching-on-pr","title":"Environment caching on PR","text":""},{"location":"development/contributing/#merge-pr-guideline","title":"Merge PR guideline","text":""},{"location":"development/create-a-component/","title":"Creating a Component","text":""},{"location":"development/create-a-component/#creating-a-component","title":"Creating a component","text":"

    A fundamental concept in kotaemon is \"component\".

Anything that isn't data or a data structure is a "component". A component can be thought of as a step within a pipeline: it takes in some input, processes it, and returns an output, just like a Python function. The output then becomes the input for the next component in the pipeline. In fact, a pipeline is just a component; more precisely, a nested component, one that makes use of one or more other components in its processing step. So in reality there is no difference between a pipeline and a component, and kotaemon treats them both as "component".

    To define a component, you will:

    1. Create a class that subclasses from kotaemon.base.BaseComponent
2. Declare init params with type annotations
3. Declare nodes (nodes are just other components!) with type annotations
    4. Implement the processing logic in run.

    The syntax of a component is as follow:

from kotaemon.base import BaseComponent\nfrom kotaemon.llms import LCAzureChatOpenAI\nfrom kotaemon.parsers import RegexExtractor\n\n\nclass FancyPipeline(BaseComponent):\n    param1: str = \"This is param1\"\n    param2: int = 10\n    param3: float\n\n    node1: BaseComponent    # this is a node because of BaseComponent type annotation\n    node2: LCAzureChatOpenAI  # this is also a node because LCAzureChatOpenAI subclasses BaseComponent\n    node3: RegexExtractor   # this is also a node because RegexExtractor subclasses BaseComponent\n\n    def run(self, some_text: str):\n        prompt = (self.param1 + some_text) * int(self.param2 + self.param3)\n        llm_pred = self.node2(prompt).text\n        matches = self.node3(llm_pred)\n        return matches\n

    Then this component can be used as follows:

    llm = LCAzureChatOpenAI(endpoint=\"some-endpoint\")\nextractor = RegexExtractor(pattern=[\"yes\", \"Yes\"])\n\ncomponent = FancyPipeline(\n    param1=\"Hello\",\n    param3=1.5,\n    node1=llm,\n    node2=llm,\n    node3=extractor,\n)\ncomponent(\"goodbye\")\n

    This way, we can define each operation as a reusable component, and use them to compose larger reusable components!

    "},{"location":"development/create-a-component/#benefits-of-component","title":"Benefits of component","text":"

    By defining a component as above, we formally encapsulate all the necessary information inside a single class. This introduces several benefits:

    1. Allow tools like promptui to inspect the inner workings of a component in order to automatically generate the promptui.
    2. Allow visualizing a pipeline for debugging purposes.
    "},{"location":"development/data-components/","title":"Data & Data Structure Components","text":""},{"location":"development/data-components/#data-data-structure-components","title":"Data & Data Structure Components","text":"

    The data & data structure components include:

    "},{"location":"development/data-components/#data-loader","title":"Data Loader","text":""},{"location":"development/data-components/#document-store","title":"Document Store","text":""},{"location":"development/data-components/#vector-store","title":"Vector Store","text":""},{"location":"development/utilities/","title":"Utilities","text":""},{"location":"development/utilities/#utilities","title":"Utilities","text":""},{"location":"development/utilities/#prompt-engineering-ui","title":"Prompt engineering UI","text":"

    Important: despite the name prompt engineering UI, this tool allows testers to test any kind of parameter that developers expose. A prompt is one kind of param; there can be other types of params that testers can tweak (e.g. top_k, temperature...).

    In the development process, developers typically build the pipeline. However, for use cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more effective. To facilitate this, kotaemon offers a user-friendly prompt engineering UI that developers integrate into their pipelines. This enables non-technical members to adjust prompts and parameters, run experiments, and export results for optimization.

    As of Sept 2023, there are 2 kinds of prompt engineering UI:

    "},{"location":"development/utilities/#simple-pipeline","title":"Simple pipeline","text":"

    For a simple pipeline, the supported client project workflow looks as follows:

    1. [tech] Build pipeline
    2. [tech] Export pipeline to config: $ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>
    3. [tech] Customize the config
    4. [tech] Spin up prompt engineering UI: $ kotaemon promptui run <path/to/config/file.yml>
    5. [non-tech] Change params, run inference
    6. [non-tech] Export to Excel
    7. [non-tech] Select the set of params that achieve the best output

    The prompt engineering UI is mainly involved from step 2 to step 6 (step 1 is normally done by the developers, while step 7 happens exclusively in the exported Excel file).

    "},{"location":"development/utilities/#step-2-export-pipeline-to-config","title":"Step 2 - Export pipeline to config","text":"

    Command:

    $ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>\n

    where:

    By default, all params in a pipeline (including nested params) will be exported to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those params with ignore_ui=True, and they will be ignored in the config generation process. Example:

    from kotaemon.base import BaseComponent, Param\n\n\nclass Pipeline(BaseComponent):\n    param1: str = Param(default=\"hello\")\n    param2: str = Param(default=\"goodbye\", ignore_ui=True)\n

    Declared as above, param1 will show up in the config YAML file, while param2 will not.
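
    For illustration, the exported YAML for the class above might contain an entry like this (a sketch; the exact layout follows the schema described in the next step):

    <module.path.Pipeline>:\n  params:\n    param1:\n      component: text\n      params:\n        value: hello\n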

    "},{"location":"development/utilities/#step-3-customize-the-config","title":"Step 3 - Customize the config","text":"

    Developers can further edit the config file in this step to get the UI (step 4) best suited to their tasks. The exported config will have this overall schema:

    <module.path.pipelineclass1>:\n  params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.)\n  inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.)\n  outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.)\n  logs: ... (Detail what information should show up in the log.)\n
    "},{"location":"development/utilities/#input-and-params","title":"Input and params","text":"

    The inputs section has the following overall schema:

    inputs:\n  <input-variable-name-1>:\n    component: <supported-UI-component>\n    params: # this section is optional\n      value: <default-value>\n  <input-variable-name-2>: ... # similar to above\nparams:\n  <param-variable-name-1>: ... # similar to those in the inputs\n
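
    A filled-in example, assuming a pipeline with a text prompt input and a top_k param (the variable names are hypothetical):

    inputs:\n  prompt:\n    component: text\n    params:\n      value: \"Tell me a joke\"\nparams:\n  top_k:\n    component: number\n    params:\n      value: 10\n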

    The list of supported prompt UI components and their corresponding Gradio UI components:

    COMPONENTS_CLASS = {\n    \"text\": gr.components.Textbox,\n    \"checkbox\": gr.components.CheckboxGroup,\n    \"dropdown\": gr.components.Dropdown,\n    \"file\": gr.components.File,\n    \"image\": gr.components.Image,\n    \"number\": gr.components.Number,\n    \"radio\": gr.components.Radio,\n    \"slider\": gr.components.Slider,\n}\n
    "},{"location":"development/utilities/#outputs","title":"Outputs","text":"

    The outputs are a list of variables that we wish to show in the UI. Since a function output in Python doesn't have a variable name, the output declaration is a little different from the input and param declarations:

    outputs:\n  - component: <supported-UI-component>\n    step: <name-of-pipeline-step>\n    item: <jsonpath way to retrieve the info>\n  - ... # similar to above\n

    where:
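
    For example, to render the final text output of a step named llm in a textbox (the step and item names are hypothetical):

    outputs:\n  - component: text\n    step: llm\n    item: output\n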

    "},{"location":"development/utilities/#logs","title":"Logs","text":"

    The logs section lists sheet names and how to retrieve the desired information.

    logs:\n  <logname>:\n    inputs:\n      - name: <column name>\n        step: <the pipeline step that we would wish to see the input>\n        variable: <the variable in the step>\n      - ...\n    outputs:\n      - name: <column name>\n        step: <the pipeline step that we would wish to see the output>\n        item: <how to retrieve the output of that step>\n
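
    A filled-in sketch, again assuming a step named llm (the names are hypothetical):

    logs:\n  default:\n    inputs:\n      - name: Prompt\n        step: llm\n        variable: prompt\n    outputs:\n      - name: Answer\n        step: llm\n        item: output\n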
    "},{"location":"development/utilities/#step-4-5-spin-up-prompt-engineering-ui-perform-prompt-engineering","title":"Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering","text":"

    Command:

    $ kotaemon promptui run <path/to/config/file.yml>\n

    This will generate a UI as follows:

    where:

    "},{"location":"development/utilities/#step-6-export-to-excel","title":"Step 6 - Export to Excel","text":"

    Upon clicking export, users can download an Excel file.

    "},{"location":"development/utilities/#chat-pipeline","title":"Chat pipeline","text":"

    The chat pipeline workflow differs from the simple pipeline workflow. In a simple pipeline, each Run creates a set of outputs, inputs, and params for users to compare. In a chat pipeline, each Run is not a one-off run but a long interactive session. Hence, the workflow is as follows:

    1. Set the desired parameters.
    2. Click \"New chat\" to start a chat session with the supplied parameters. This set of parameters will persist until the end of the chat session. During an ongoing chat session, changing the parameters will not take effect.
    3. Chat and interact with the chat bot on the right panel. You can add additional inputs (if any), and they will be supplied to the chatbot.
    4. During the chat, the chat log will show up in the \"Output\" tab. This is empty by default, so if you want to show the log here, ask the AI developers to configure the UI settings.
    5. When finishing the chat, select your preference in the radio box and click \"End chat\". This will save the chat log and the preference to disk.
    6. To compare the results of different runs, click \"Export\" to get an Excel spreadsheet summarizing the different runs.
    "},{"location":"pages/app/customize-flows/","title":"Customize flow logic","text":""},{"location":"pages/app/customize-flows/#add-new-indexing-and-reasoning-pipeline-to-the-application","title":"Add new indexing and reasoning pipeline to the application","text":"

    @trducng

    At a high level, to add a new indexing or reasoning pipeline:

    1. You define your indexing or reasoning pipeline as a class that subclasses BaseComponent.
    2. You declare that class in the settings file flowsettings.py.

    Then, when you run python app.py, the application will dynamically load those pipelines.

    The sections below describe in more detail how the pipelines should be constructed.

    "},{"location":"pages/app/customize-flows/#define-a-pipeline-as-a-class","title":"Define a pipeline as a class","text":"

    In essence, a pipeline will subclass from kotaemon.base.BaseComponent. Each pipeline has 2 main parts:

    An example pipeline:

    from kotaemon.base import BaseComponent\n\n\nclass SoSimple(BaseComponent):\n    arg1: int\n    arg2: str\n\n    def run(self, arg3: str):\n        return self.arg1 * self.arg2 + arg3\n

    This pipeline is simple for demonstration purposes, but we can imagine pipelines with many more arguments, which can take other pipelines as arguments and have more complicated logic in the run method.
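
    A minimal usage sketch of the pipeline above:

    pipeline = SoSimple(arg1=2, arg2=\"hello \")\npipeline(\"world\")  # -> \"hello hello world\"\n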

    An indexing or reasoning pipeline is just a class that subclasses BaseComponent as above.

    For more detail on this topic, please refer to Creating a Component.

    "},{"location":"pages/app/customize-flows/#run-signatures","title":"Run signatures","text":"

    Note: this section is tentative at the moment. We will finalize the def run function signature by early April at the latest.

    The indexing pipeline:

        def run(\n        self,\n        file_paths: str | Path | list[str | Path],\n        reindex: bool = False,\n        **kwargs,\n    ):\n        \"\"\"Index files to intermediate representation (e.g. vector, database...)\n\n        Args:\n            file_paths: the list of paths to files\n            reindex: if True, files in `file_paths` that already exist in the database\n                should be reindexed.\n        \"\"\"\n
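
    A minimal skeleton that satisfies this signature might look like the following (a sketch; the body is left to your indexing logic):

    from pathlib import Path\n\nfrom kotaemon.base import BaseComponent\n\n\nclass MyIndexingPipeline(BaseComponent):\n    def run(\n        self,\n        file_paths: str | Path | list[str | Path],\n        reindex: bool = False,\n        **kwargs,\n    ):\n        if not isinstance(file_paths, list):\n            file_paths = [file_paths]\n        for path in file_paths:\n            ...  # parse, chunk, embed and store each file\n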

    The reasoning pipeline:

        def run(self, question: str, history: list, **kwargs) -> Document:\n        \"\"\"Answer the question\n\n        Args:\n            question: the user input\n            history: the chat history [(user_msg1, bot_msg1), (user_msg2, bot_msg2)...]\n\n        Returns:\n            kotaemon.base.Document: the final answer\n        \"\"\"\n
    "},{"location":"pages/app/customize-flows/#register-your-pipeline-to-ktem","title":"Register your pipeline to ktem","text":"

    To register your pipelines to ktem, you declare them in the flowsettings.py file. This file is located in the current working directory where you start ktem. In most use cases, it is this one.

    KH_REASONING = [\"<python.module.path.to.the.reasoning.class>\"]\n\nKH_INDEX = \"<python.module.path.to.the.indexing.class>\"\n

    You can register multiple reasoning pipelines to ktem by populating the KH_REASONING list. The user can select which reasoning pipeline to use in their Settings page.

    For now, there's only one supported index option for KH_INDEX.

    Make sure that your class is discoverable by Python.

    "},{"location":"pages/app/customize-flows/#allow-users-to-customize-your-pipeline-in-the-app-settings","title":"Allow users to customize your pipeline in the app settings","text":"

    To allow users to configure your pipeline, you need to declare the user-configurable settings as a dictionary. ktem will include them in the application settings.

    In your pipeline class, add a classmethod get_user_settings that returns a settings dictionary, and a classmethod get_info that returns an info dictionary. Example:

    class SoSimple(BaseComponent):\n\n    ... # as above\n\n    @classmethod\n    def get_user_settings(cls) -> dict:\n        \"\"\"The settings to the user\"\"\"\n        return {\n            \"setting_1\": {\n                \"name\": \"Human-friendly name\",\n                \"value\": \"Default value\",\n                \"choices\": [(\"Human-friendly Choice 1\", \"choice1-id\"), (\"HFC 2\", \"choice2-id\")], # optional\n                \"component\": \"Which Gradio UI component to render, can be: text, number, checkbox, dropdown, radio, checkboxgroup\"\n            },\n            \"setting_2\": {\n                # follow the same rule as above\n            }\n        }\n\n    @classmethod\n    def get_info(cls) -> dict:\n        \"\"\"Pipeline information for bookkeeping purpose\"\"\"\n        return {\n            \"id\": \"a unique id to differentiate this pipeline from other pipeline\",\n            \"name\": \"Human-friendly name of the pipeline\",\n            \"description\": \"Can be a short description of this pipeline\"\n        }\n

    Once you add these methods to your pipeline class, ktem will automatically extract them and add them to the settings.

    "},{"location":"pages/app/customize-flows/#construct-to-pipeline-object","title":"Construct to pipeline object","text":"

    Once ktem runs your pipeline, it will call your classmethod get_pipeline with the full user settings and expects to obtain the pipeline object. Within this get_pipeline method, you implement all the necessary logic to initialize the pipeline object. Example:

    class SoSimple(BaseComponent):\n    ... # as above\n\n    @classmethod\n    def get_pipeline(cls, setting):\n        obj = cls(arg1=setting[\"reasoning.id.setting1\"])\n        return obj\n
    "},{"location":"pages/app/customize-flows/#reasoning-stream-output-to-ui","title":"Reasoning: Stream output to UI","text":"

    For a fast user experience, you can stream the output directly to the UI. This way, the user can start observing the output as soon as the LLM generates the first token, rather than having to wait for the pipeline to finish before reading the whole message.

    To stream the output, you need to:

    1. Turn the run function into an async function.
    2. Pass the output to a special queue with self.report_output.
        async def run(self, question: str, history: list, **kwargs) -> Document:\n        for char in \"This is a long message\":\n            self.report_output({\"output\": char})\n

    The argument to self.report_output is a dictionary that contains one or both of these 2 keys: \"output\" and \"evidence\". The \"output\" string will be streamed to the chat message, and the \"evidence\" string will be streamed to the information panel.
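
    For example, a reasoning pipeline might stream the answer token by token and push supporting evidence to the information panel (a sketch; the token list stands in for real LLM output):

        async def run(self, question: str, history: list, **kwargs) -> Document:\n        for token in [\"The\", \" answer\", \" is\", \" 42.\"]:  # stand-in for streamed LLM tokens\n            self.report_output({\"output\": token})\n        self.report_output({\"evidence\": \"Relevant passage to show in the information panel\"})\n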

    "},{"location":"pages/app/customize-flows/#access-application-llms-embeddings","title":"Access application LLMs, Embeddings","text":"

    You can access users' collections of LLMs and embedding models with:

    from ktem.embeddings.manager import embeddings\nfrom ktem.llms.manager import llms\n\n\nllm = llms.get_default()\nembedding_model = embeddings.get_default()\n

    You can also allow users to select which LLMs or embedding models they want to use through the settings.

        @classmethod\n    def get_user_settings(cls) -> dict:\n        from ktem.llms.manager import llms\n\n        return {\n            \"citation_llm\": {\n                \"name\": \"LLM for citation\",\n                \"value\": llms.get_default(),\n                \"component\": \"dropdown\",\n                \"choices\": list(llms.options().keys()),\n            },\n            ...\n        }\n
    "},{"location":"pages/app/customize-flows/#optional-access-application-data","title":"Optional: Access application data","text":"

    You can access the user's application database and vector store as follows:

    # get the database that contains the source files\nfrom ktem.db.models import Source, Index, Conversation, User\n\n# get the vector store\n
    "},{"location":"pages/app/features/","title":"Features","text":""},{"location":"pages/app/features/#chat","title":"Chat","text":"

    kotaemon focuses on question answering over a corpus of data. Below is a gentle introduction to the chat functionality.

    "},{"location":"pages/app/functional-description/","title":"Functional description","text":""},{"location":"pages/app/functional-description/#user-group-tenant-management","title":"User group / tenant management","text":""},{"location":"pages/app/functional-description/#create-new-user-group","title":"Create new user group","text":"

    (6 man-days)

    Description: each client has a dedicated user group. Each user group has an admin user who can do administrative tasks (e.g. creating user accounts in that user group...). The workflow for creating a new user group is as follows:

    1. Cinnamon accesses the user group management UI.
    2. On \"Create user group\" panel, we supply: a. Client name: e.g. Apple. b. Sub-domain name: e.g. apple. c. Admin email, username & password.
    3. The system will: a. An Aurora Platform deployment with the specified sub-domain. b. Send an email to the admin, with the username & password.

    Expectation:

    Condition:

    "},{"location":"pages/app/functional-description/#delete-user-group","title":"Delete user group","text":"

    (2 man-days)

    Description: in the tenant management page, we can delete the selected user group. The user flow is as follows:

    1. Cinnamon accesses the user group management UI.
    2. View list of user groups.
    3. Next to target user group, click delete.
    4. Confirm whether to delete.
    5. If Yes, delete the user group. If No, cancel the operation.

    Expectation: when a user group is deleted, we expect to delete everything related to the user group: domain, files, databases, caches, deployments.

    "},{"location":"pages/app/functional-description/#user-management","title":"User management","text":""},{"location":"pages/app/functional-description/#create-user-account-for-admin-user","title":"Create user account (for admin user)","text":"

    (1 man-day)

    Description: the admin user in the client's account can create user accounts for that user group. To create a new user, the client admin:

    1. Navigate to \"Admin\" > \"Users\"
    2. In the \"Create user\" panel, supply:
      • Username
      • Password
      • Confirm password
    3. Click \"Create\"

    Expectation:

    "},{"location":"pages/app/functional-description/#delete-user-account-for-admin-user","title":"Delete user account (for admin user)","text":"

    Description: the admin user in the client's account can delete user accounts. Once a user account is deleted, the user can no longer log in to Aurora Platform.

    1. The admin user navigates to \"Admin\" > \"Users\".
    2. In the user list panel, next to the username, the admin clicks on the \"Delete\" button. The confirmation dialog appears.
    3. If \"Delete\", the user account is deleted. If \"Cancel\", do nothing. The confirmation dialog disappears.

    Expectation:

    "},{"location":"pages/app/functional-description/#edit-user-account-for-admin-user","title":"Edit user account (for admin user)","text":"

    Description: the admin user can change any information about a user account, including the password. To change user information:

    1. The admin user navigates to \"Admin\" > \"Users\".
    2. In the user list panel, next to the username, the admin clicks on the \"Edit\" button.
    3. The user list disappears and the user detail appears, with the following information shown:
      • Username: (prefilled the username)
      • Password: (blank)
      • Confirm password: (blank)
    4. The admin can edit any of the information, and click \"Save\" or \"Cancel\".
      • If \"Save\": the information will be updated to the database, or show error per Expectation below.
      • If \"Cancel\": skip.
    5. On Save success or Cancel, transfer back to the user list UI, where the user information is updated accordingly.

    Expectation:

    "},{"location":"pages/app/functional-description/#sign-in","title":"Sign-in","text":"

    (3 man-days)

    Description: the users can sign in to Aurora Platform as follows:

    1. User navigates to the URL.
    2. If the user is not logged in, the UI just shows the login screen.
    3. User types username & password.
    4. If correct, the user proceeds to the normal working UI.
    5. If incorrect, the login screen shows an error message.
    "},{"location":"pages/app/functional-description/#sign-out","title":"Sign-out","text":"

    (1 man-day)

    Description: the user can sign out of Aurora Platform as follows:

    1. User navigates to the Settings > User page.
    2. User clicks on logout.
    3. The user is signed out and returned to the login screen.

    Expectation: the user is completely signed out. Next time he/she uses the Aurora Platform, he/she has to log in again.

    "},{"location":"pages/app/functional-description/#change-password","title":"Change password","text":"

    Description: the user can change their password as follows:

    1. User navigates to the Settings > User page.
    2. In the change password section, the user provides the following info and clicks Change:
      • Current password
      • New password
      • Confirm new password
    3. If the change succeeds, the password is updated. Otherwise, the error is shown on the UI.

    Expectation:

    "},{"location":"pages/app/functional-description/#chat","title":"Chat","text":""},{"location":"pages/app/functional-description/#chat-to-the-bot","title":"Chat to the bot","text":"

    Description: the Aurora Platform focuses on question answering over the uploaded data. Each chat has the following components:

    The chat workflow looks as follows:

    1. [Optional] User selects files to scope the context for the bot. If the user doesn't select any files, then all files on the Aurora Platform serve as the bot's context.
      • The user can type multi-line messages, using \"Shift + Enter\" for line-break.
    2. User sends the message (either clicking the Send button or hitting the Enter key).
    3. The bot in the chat conversation will return \"Thinking...\" while it processes the message.
    4. The information panel on the right begins to show data related to the user message.
    5. The bot begins to generate the answer. The \"Thinking...\" placeholder disappears.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-switch","title":"Conversation - switch","text":"

    Description: users can jump between different conversations. They can see the list of all conversations, select an old conversation, and continue the chat in the context of that old conversation. The switching workflow is as follows:

    1. Users click on the conversation dropdown. It will show a list of conversations.
    2. Within that dropdown, the user selects one conversation.
    3. The chat messages, information panel, and selected data will show the content in that old chat.
    4. The user can continue chatting as normal under the context of this old chat.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-create","title":"Conversation - create","text":"

    Description: the user can explicitly start a new conversation with the chatbot:

    1. User click on the \"New\" button.
    2. The new conversation is automatically created.

    Expectation:

    "},{"location":"pages/app/functional-description/#conversation-rename","title":"Conversation - rename","text":"

    Description: user can rename the conversation by typing the new name and clicking on the Rename button next to it.

    Condition:

    "},{"location":"pages/app/functional-description/#conversation-delete","title":"Conversation - delete","text":"

    Description: user can delete the existing conversation as follow:

    1. Click on Delete button.
    2. The UI shows a confirmation with 2 buttons:
      • Delete
      • Cancel.
    3. If Delete, delete the conversation, switch to the next oldest conversation, and close the confirmation panel.
    4. If Cancel, just close the confirmation panel.
    "},{"location":"pages/app/functional-description/#file-management","title":"File management","text":"

    File management allows users to upload, list, and delete the files that they upload to the Aurora Platform.

    "},{"location":"pages/app/functional-description/#upload-file","title":"Upload file","text":"

    Description: the user can upload files to the Aurora Platform. The uploaded files serve as context for our chatbot to refer to when it converses with the user. To upload files, the user:

    1. Navigate to the File tab.
    2. Within the File tab, there is an Upload section.
    3. User can add files to the Upload section by drag & drop or by clicking on the file browser.
    4. User can select some options related to uploading and indexing. Depending on the project, these options can differ; nevertheless, they are discussed below.
    5. User clicks on the \"Upload and Index\" button.
    6. The app shows notifications in the top right corner when indexing starts and finishes, and when errors happen.

    Options:

    Condition:

    "},{"location":"pages/app/functional-description/#list-all-files","title":"List all files","text":"

    Description: the user can see which files are on the system as follows:

    1. Navigate to the File tab.
    2. By default, it shows all the uploaded files, each with the following information: file name, file size, number of pages, uploaded date.
    3. The UI also shows the total number of pages and the total size in MB.
    "},{"location":"pages/app/functional-description/#delete-file","title":"Delete file","text":"

    Description: users can delete files from this UI to free up space or to remove outdated information. To remove files:

    1. User navigates to the File tab.
    2. In the list of files, next to each file, there is a Delete button.
    3. The user clicks on the Delete button. A confirmation dialog appears.
    4. If Delete, delete the file. If Cancel, close the confirmation dialog.

    Expectation: once the file is deleted:

    "},{"location":"pages/app/ext/user-management/","title":"User management","text":"

    ktem provides user management as an extension. To enable user management, in your flowsettings.py, set the following variables:

    Once enabled, you have access to the following features:

    "},{"location":"pages/app/index/file/","title":"File index","text":"

    The file index stores files in a local folder and indexes them for retrieval. This file index provides the following infrastructure to support the indexing:

    The indexing and retrieval pipelines are encouraged to use the above software infrastructure.

    "},{"location":"pages/app/index/file/#indexing-pipeline","title":"Indexing pipeline","text":"

    ktem has a default indexing pipeline: ktem.index.file.pipelines.IndexDocumentPipeline.

    This default pipeline works as follows:

    You can customize this default pipeline if your indexing process is close to the default one, or create your own indexing pipeline if the logic differs too much.

    "},{"location":"pages/app/index/file/#customize-the-default-pipeline","title":"Customize the default pipeline","text":"

    The default pipeline provides the contact points in flowsettings.py.

    1. FILE_INDEX_PIPELINE_FILE_EXTRACTORS. Supply overriding file extractor, based on file extension. Example: {\".pdf\": \"path.to.PDFReader\", \".xlsx\": \"path.to.ExcelReader\"}
    2. FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE. The expected number of characters of each text segment. Example: 1024.
    3. FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP. The expected number of characters that consecutive text segments should overlap with each other. Example: 256.
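
    Put together, these contact points might look like this in flowsettings.py (a sketch; the reader paths are the placeholder examples from above):

    FILE_INDEX_PIPELINE_FILE_EXTRACTORS = {\".pdf\": \"path.to.PDFReader\", \".xlsx\": \"path.to.ExcelReader\"}\nFILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE = 1024\nFILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP = 256\n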
    "},{"location":"pages/app/index/file/#create-your-own-indexing-pipeline","title":"Create your own indexing pipeline","text":"

    Your indexing pipeline will subclass BaseFileIndexIndexing.

    You should define the following methods:

    By subclassing BaseFileIndexIndexing, you will have access to the following resources:

    Once you have prepared your pipeline, register it in flowsettings.py: FILE_INDEX_PIPELINE = \"<python.path.to.your.pipeline>\".
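
    A minimal sketch of this flow, assuming the import path below (the path and the method set are assumptions; consult the actual base class for the full interface):

    from ktem.index.file.base import BaseFileIndexIndexing  # import path assumed\n\n\nclass MyIndexPipeline(BaseFileIndexIndexing):\n    def run(self, file_paths, reindex: bool = False, **kwargs):\n        ...  # parse, chunk, embed and store each file\n\n\n# then, in flowsettings.py:\n# FILE_INDEX_PIPELINE = \"my_module.MyIndexPipeline\"\n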

    "},{"location":"pages/app/index/file/#retrieval-pipeline","title":"Retrieval pipeline","text":"

    ktem has a default retrieval pipeline: ktem.index.file.pipelines.DocumentRetrievalPipeline. This pipeline works as follows:

    "},{"location":"pages/app/index/file/#create-your-own-retrieval-pipeline","title":"Create your own retrieval pipeline","text":"

    Your retrieval pipeline will subclass BaseFileIndexRetriever. The retriever has the same database, vector store, and docstore access as the indexing pipeline.

    You should define the following methods:

    Once you build the retrieval pipeline class, you can register it in flowsettings.py: FILE_INDEXING_RETRIEVER_PIPELINES = [\"path.to.retrieval.pipeline\"]. Because there can be multiple parallel pipelines within an index, this variable takes a list of strings rather than a single string.

    "},{"location":"pages/app/index/file/#software-infrastructure","title":"Software infrastructure","text":"Infra Access Schema Ref SQL table Source self._Source - id (int): id of the source (auto)- name (str): the name of the file- path (str): the path of the file- size (int): the file size in bytes- note (dict): allow extra optional information about the file- date_created (datetime): the time the file is created (auto) This is SQLALchemy ORM class. Can consult SQL table Index self._Index - id (int): id of the index entry (auto)- source_id (int): the id of a file in the Source table- target_id: the id of the segment in docstore or vector store- relation_type (str): if the link is \"document\" or \"vector\" This is SQLAlchemy ORM class Vector store self._VS - self._VS.add: add the list of embeddings to the vector store (optionally associate metadata and ids)- self._VS.delete: delete vector entries based on ids- self._VS.query: get embeddings based on embeddings. kotaemon > storages > vectorstores > BaseVectorStore Doc store self._DS - self._DS.add: add the segments to document stores- self._DS.get: get the segments based on id- self._DS.get_all: get all segments- self._DS.delete: delete segments based on id kotaemon > storages > docstores > base > BaseDocumentStore"},{"location":"pages/app/settings/overview/","title":"Settings","text":""},{"location":"pages/app/settings/overview/#overview","title":"Overview","text":"

    There are 3 kinds of settings in ktem, geared towards different stakeholders for different use cases:

    "},{"location":"pages/app/settings/user-settings/","title":"User settings","text":""},{"location":"pages/app/settings/user-settings/#user-settings","title":"User settings","text":"

    ktem allows developers to extend the index and the reasoning pipeline. In many cases, these components can have settings that should be modified by users at run-time (e.g. topk, chunksize...). These are the user settings.

    ktem allows developers to declare such user settings in their code. Once declared, ktem will render them in a Settings page.

    There are 2 places that ktem looks for declared user settings. You can refer to the respective pages.

    "},{"location":"pages/app/settings/user-settings/#syntax-of-a-settings","title":"Syntax of a settings","text":"

    A collection of settings is a dictionary of type dict[str, dict], where the key is a setting id, and the value is the description of the setting.

    settings = {\n    \"topk\": {\n        \"name\": \"Top-k chunks\",\n        \"value\": 10,\n        \"component\": \"number\",\n    },\n    \"lang\": {\n        \"name\": \"Languages\",\n        \"value\": \"en\",\n        \"component\": \"dropdown\",\n        \"choices\": [(\"en\", \"English\"), (\"cn\", \"Chinese\")],\n    }\n}\n

    Each setting description must have:

    "},{"location":"pages/app/settings/user-settings/#settings-page-structure","title":"Settings page structure","text":""},{"location":"reference/Summary/","title":"Summary","text":""},{"location":"reference/cli/","title":"CLI","text":""},{"location":"reference/cli/#cli.export","title":"export","text":"
    export(export_path, output)\n

    Export a pipeline to a config file

    Source code in libs/kotaemon/kotaemon/cli.py
    @promptui.command()\n@click.argument(\"export_path\", nargs=1)\n@click.option(\"--output\", default=\"promptui.yml\", show_default=True, required=False)\ndef export(export_path, output):\n    \"\"\"Export a pipeline to a config file\"\"\"\n    import sys\n\n    from theflow.utils.modules import import_dotted_string\n\n    from kotaemon.contribs.promptui.config import export_pipeline_to_config\n\n    sys.path.append(os.getcwd())\n    cls = import_dotted_string(export_path, safe=False)\n    export_pipeline_to_config(cls, output)\n    check_config_format(output)\n
    "},{"location":"reference/cli/#cli.run","title":"run","text":"
    run(run_path, share, username, password, appname, port)\n

    Run the UI from a config file

    Examples:

    # Run with default config file\n$ kh promptui run\n\n\n# Run with username and password supplied\n$ kh promptui run --username admin --password password\n\n\n# Run with username and prompted password\n$ kh promptui run --username admin\n\n# Run and share to promptui\n# kh promptui run --username admin --password password --share --appname hey --port 7861\n
    Source code in libs/kotaemon/kotaemon/cli.py
    @promptui.command()\n@click.argument(\"run_path\", required=False, default=\"promptui.yml\")\n@click.option(\n    \"--share\",\n    is_flag=True,\n    show_default=True,\n    default=False,\n    help=\"Share the app through Gradio. Requires --username to enable authentication.\",\n)\n@click.option(\n    \"--username\",\n    required=False,\n    help=(\n        \"Username for the user. If not provided, the promptui will not have \"\n        \"authentication.\"\n    ),\n)\n@click.option(\n    \"--password\",\n    required=False,\n    help=\"Password for the user. If not provided, will be prompted.\",\n)\n@click.option(\n    \"--appname\",\n    required=False,\n    help=\"The share app subdomain. Requires --share and --username\",\n)\n@click.option(\n    \"--port\",\n    required=False,\n    help=\"Port to run the app. If not provided, will $GRADIO_SERVER_PORT (7860)\",\n)\ndef run(run_path, share, username, password, appname, port):\n    \"\"\"Run the UI from a config file\n\n    Examples:\n\n        \\b\n        # Run with default config file\n        $ kh promptui run\n\n        \\b\n        # Run with username and password supplied\n        $ kh promptui run --username admin --password password\n\n        \\b\n        # Run with username and prompted password\n        $ kh promptui run --username admin\n\n        # Run and share to promptui\n        # kh promptui run --username admin --password password --share --appname hey \\\n                --port 7861\n    \"\"\"\n    import sys\n\n    from kotaemon.contribs.promptui.ui import build_from_dict\n\n    sys.path.append(os.getcwd())\n\n    check_config_format(run_path)\n    demo = build_from_dict(run_path)\n\n    params: dict = {}\n    if username is not None:\n        if password is not None:\n            auth = (username, password)\n        else:\n            auth = (username, click.prompt(\"Password\", hide_input=True))\n        params[\"auth\"] = auth\n\n    port = int(port) if port else int(os.getenv(\"GRADIO_SERVER_PORT\", \"7860\"))\n    params[\"server_port\"] = port\n\n    if share:\n        if username is None:\n            raise ValueError(\n                \"Username must be provided to enable authentication for sharing\"\n            )\n        if appname:\n            from kotaemon.contribs.promptui.tunnel import Tunnel\n\n            tunnel = Tunnel(\n                appname=str(appname), username=str(username), local_port=port\n            )\n            url = tunnel.run()\n            print(f\"App is shared at {url}\")\n        else:\n            params[\"share\"] = True\n            print(\"App is shared at Gradio\")\n\n    demo.launch(**params)\n
    "},{"location":"reference/cli/#cli.makedoc","title":"makedoc","text":"
    makedoc(module, output, separation_level)\n

    Make documentation for module `module`

    Example:

    \n# Make component documentation for kotaemon library\n$ kh makedoc kotaemon\n
    Source code in libs/kotaemon/kotaemon/cli.py
    @main.command()\n@click.argument(\"module\", required=True)\n@click.option(\n    \"--output\", default=\"docs.md\", required=False, help=\"The output markdown file\"\n)\n@click.option(\n    \"--separation-level\", required=False, default=1, help=\"Organize markdown layout\"\n)\ndef makedoc(module, output, separation_level):\n    \"\"\"Make documentation for module `module`\n\n    Example:\n\n        \\b\n        # Make component documentation for kotaemon library\n        $ kh makedoc kotaemon\n    \"\"\"\n    from kotaemon.contribs.docs import make_doc\n\n    make_doc(module, output, separation_level)\n    print(f\"Documentation exported to {output}\")\n
    "},{"location":"reference/cli/#cli.start_project","title":"start_project","text":"
    start_project(template)\n

    Start a project from a template.

    Important: the value for --template corresponds to the name of the template folder, which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates. The default value is \"project-default\", which should work when you are starting a client project.

    Source code in libs/kotaemon/kotaemon/cli.py
    @main.command()\n@click.option(\n    \"--template\",\n    default=\"project-default\",\n    required=False,\n    help=\"Template name\",\n    show_default=True,\n)\ndef start_project(template):\n    \"\"\"Start a project from a template.\n\n    Important: the value for --template corresponds to the name of the template folder,\n    which is located at https://github.com/Cinnamon/kotaemon/tree/main/templates\n    The default value is \"project-default\", which should work when you are starting a\n    client project.\n    \"\"\"\n\n    print(\"Retrieving template...\")\n    os.system(\n        \"cookiecutter git@github.com:Cinnamon/kotaemon.git \"\n        f\"--directory='templates/{template}'\"\n    )\n
    "},{"location":"reference/agents/","title":"Agents","text":""},{"location":"reference/agents/#agents.BaseAgent","title":"BaseAgent","text":"

    Bases: BaseComponent

    Define base agent interface

    Source code in libs/kotaemon/kotaemon/agents/base.py
    class BaseAgent(BaseComponent):\n    \"\"\"Define base agent interface\"\"\"\n\n    name: str = Param(help=\"Name of the agent.\")\n    agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n    description: str = Param(\n        help=(\n            \"Description used to tell the model how/when/why to use the agent. You can\"\n            \" provide few-shot examples as a part of the description. This will be\"\n            \" input to the prompt of LLM.\"\n        )\n    )\n    llm: Optional[BaseLLM] = Node(\n        help=(\n            \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n            \" interface.\"\n        )\n    )\n    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n        help=\"A prompt template or a dict to supply different prompt to the agent\"\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [],\n        help=\"List of plugins / tools to be used in the agent\",\n    )\n\n    @staticmethod\n    def safeguard_run(run_func, *args, **kwargs):\n        def wrapper(self, *args, **kwargs):\n            try:\n                return run_func(self, *args, **kwargs)\n            except Exception as e:\n                return AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"failed\",\n                    error=str(e),\n                )\n\n        return wrapper\n\n    def add_tools(self, tools: list[BaseTool]) -> None:\n        \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n        self.plugins.extend(tools)\n\n    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n        \"\"\"Run the component.\"\"\"\n        raise NotImplementedError()\n
    "},{"location":"reference/agents/#agents.BaseAgent.add_tools","title":"add_tools","text":"
    add_tools(tools)\n

    Helper method to add tools and update agent state if needed

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def add_tools(self, tools: list[BaseTool]) -> None:\n    \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n    self.plugins.extend(tools)\n
    "},{"location":"reference/agents/#agents.BaseAgent.run","title":"run","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n    \"\"\"Run the component.\"\"\"\n    raise NotImplementedError()\n
    "},{"location":"reference/agents/#agents.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | return_values | dict | The return values of the agent. | required |
    | log | str | The log message. | required |

    required Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
    "},{"location":"reference/agents/#agents.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | text | str | The text output from the agent. | required |
    | agent_type | AgentType | The type of agent. | required |
    | status | Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"] | The status after executing the agent. | required |
    | error | Optional[str] | The error message if any. | required |

    required Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
    "},{"location":"reference/agents/#agents.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/#agents.BaseScratchPad--attributes","title":"Attributes:","text":"

    logger (logging.Logger): The logger object to log messages.

    "},{"location":"reference/agents/#agents.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | item | Any | The item to log. | required |
    | title | str, optional | The title of the panel, defaults to \"Output\". | 'Output' |
    | stream | bool, optional |  | False |

    False Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/#agents.LangchainAgent","title":"LangchainAgent","text":"

    Bases: BaseAgent

    Wrapper for Langchain Agent

    Source code in libs/kotaemon/kotaemon/agents/langchain_based.py
    class LangchainAgent(BaseAgent):\n    \"\"\"Wrapper for Langchain Agent\"\"\"\n\n    name: str = \"LangchainAgent\"\n    agent_type: AgentType\n    description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n    AGENT_TYPE_MAP = {\n        AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n        AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n        AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n        AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n    }\n    agent: Optional[LCAgentExecutor] = None\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        if self.agent_type not in self.AGENT_TYPE_MAP:\n            raise NotImplementedError(\n                f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n            )\n        self.update_agent_tools()\n\n    def update_agent_tools(self):\n        assert isinstance(self.llm, (ChatLLM, LLM))\n        langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n        # a fix for search_doc tool name:\n        # use \"Intermediate Answer\" for self-ask agent\n        found_search_tool = False\n        if self.agent_type == AgentType.self_ask:\n            for plugin in langchain_plugins:\n                if plugin.name == \"search_doc\":\n                    plugin.name = \"Intermediate Answer\"\n                    langchain_plugins = [plugin]\n                    found_search_tool = True\n                    break\n\n        if self.agent_type != AgentType.self_ask or found_search_tool:\n            # reinit Langchain AgentExecutor\n            self.agent = initialize_agent(\n                langchain_plugins,\n                self.llm.to_langchain_format(),\n                agent=self.AGENT_TYPE_MAP[self.agent_type],\n                handle_parsing_errors=True,\n                verbose=True,\n            )\n\n    def add_tools(self, tools: List[BaseTool]) -> None:\n        super().add_tools(tools)\n        self.update_agent_tools()\n        return\n\n    def run(self, instruction: str) -> AgentOutput:\n        assert (\n            self.agent is not None\n        ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n        # Langchain AgentExecutor call\n        output = self.agent(instruction)[\"output\"]\n\n        return AgentOutput(\n            text=output,\n            agent_type=self.agent_type,\n            status=\"finished\",\n        )\n
    "},{"location":"reference/agents/#agents.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

    Sequential ReactAgent class inherited from BaseAgent, implementing the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
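    A hedged usage sketch (same import assumptions as above; per the comment in the source, the llm should accept a `stop` keyword when called):

    from kotaemon.agents import ReactAgent, WikipediaTool
    from kotaemon.llms import ChatOpenAI  # assumption: any suitable BaseLLM

    agent = ReactAgent(
        llm=ChatOpenAI(),          # the llm is called with stop=["Observation:"]
        plugins=[WikipediaTool()],
        max_iterations=5,          # status becomes "stopped" if this is exhausted
    )
    result = agent.run("In what year was the Eiffel Tower completed?")
    for action, observation in result.intermediate_steps:
        print(action.log, observation)
    print(result.status, result.text)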
    "},{"location":"reference/agents/#agents.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/#agents.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

    Parameters:

    instruction (required): Instruction to run the agent with.

    max_iterations (default: None): Maximum number of iterations of reasoning steps; when not given, falls back to the agent's max_iterations (5 by default).

    Returns:

    AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
    "},{"location":"reference/agents/#agents.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

    Parameters:

    instruction (required): Instruction to run the agent with.

    max_iterations (default: None): Maximum number of iterations of reasoning steps; when not given, falls back to the agent's max_iterations (5 by default).

    Yields:

    AgentOutput objects with status "thinking" for each intermediate step, then a final AgentOutput with status "finished" or "stopped".

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
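    Because stream is a generator, callers iterate over it; each yield is an AgentOutput whose status is "thinking" for intermediate steps and "finished" (or "stopped") at the end. A sketch, reusing the ReactAgent instance from the example above:

    for step in agent.stream("In what year was the Eiffel Tower completed?"):
        if step.status == "thinking":
            print("step:", step.intermediate_steps)
        else:
            print("final:", step.text)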
    "},{"location":"reference/agents/#agents.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class inherited from BaseAgent, implementing the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/#agents.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/#agents.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/#agents.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. 
Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
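    A minimal custom tool only needs name, description, and _run_tool; a toy sketch (not from the source):

    from kotaemon.agents import BaseTool

    class EchoTool(BaseTool):
        """Toy tool for illustration only."""

        name: str = "echo"
        description: str = "Returns its input unchanged. Input should be a string."

        def _run_tool(self, query: str) -> str:
            return query

    tool = EchoTool()
    print(tool.run("hello"))              # -> "hello"
    lc_tool = tool.to_langchain_format()  # usable by Langchain agents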
    "},{"location":"reference/agents/#agents.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/#agents.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description; this text is included in the LLM prompt.

    "},{"location":"reference/agents/#agents.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/#agents.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/#agents.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    Handle the content of the ToolException thrown.

    "},{"location":"reference/agents/#agents.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to the Langchain format so it can be used with a Langchain agent.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/#agents.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/#agents.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
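    A hedged round-trip sketch (assuming the standard Langchain Tool class; adjust the import to your installed langchain version, since _run signatures have varied across releases):

    from langchain.tools import Tool  # assumption
    from kotaemon.agents import BaseTool

    lc_tool = Tool(
        name="shout",
        description="Upper-cases the input string.",
        func=lambda text: text.upper(),
    )
    tool = BaseTool.from_langchain_format(lc_tool)
    print(tool.run("hello"))  # -> "HELLO", dispatched to langchain_tool._run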
    "},{"location":"reference/agents/#agents.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent to use it as a tool.

    Parameters:

    component (required): BaseComponent-based component to wrap.

    postprocessor (optional, default None per the source): Postprocessor applied to the component output.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
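    A toy sketch (assuming BaseComponent is importable from kotaemon.base and that calling a component instance dispatches to its run method):

    from kotaemon.agents import ComponentTool
    from kotaemon.base import BaseComponent  # assumption on the import path

    class Reverser(BaseComponent):
        """Toy component for illustration only."""

        def run(self, text: str) -> str:
            return text[::-1]

    tool = ComponentTool(
        name="reverse",
        description="Reverses the input string.",
        component=Reverser(),
        postprocessor=str.upper,  # optional post-processing of the output
    )
    print(tool.run("abc"))  # -> "CBA"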
    "},{"location":"reference/agents/#agents.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
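    Usage is a one-liner (network access and the wikipedia doc store are required at runtime):

    from kotaemon.agents import WikipediaTool

    tool = WikipediaTool()
    evidence = tool.run("Alan Turing")  # input validated against WikipediaArgs
    print(evidence[:200])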
    "},{"location":"reference/agents/base/","title":"Base","text":""},{"location":"reference/agents/base/#agents.base.BaseAgent","title":"BaseAgent","text":"

    Bases: BaseComponent

    Define base agent interface

    Source code in libs/kotaemon/kotaemon/agents/base.py
    class BaseAgent(BaseComponent):\n    \"\"\"Define base agent interface\"\"\"\n\n    name: str = Param(help=\"Name of the agent.\")\n    agent_type: AgentType = Param(help=\"Agent type, must be one of AgentType\")\n    description: str = Param(\n        help=(\n            \"Description used to tell the model how/when/why to use the agent. You can\"\n            \" provide few-shot examples as a part of the description. This will be\"\n            \" input to the prompt of LLM.\"\n        )\n    )\n    llm: Optional[BaseLLM] = Node(\n        help=(\n            \"LLM to be used for the agent (optional). LLM must implement BaseLLM\"\n            \" interface.\"\n        )\n    )\n    prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param(\n        help=\"A prompt template or a dict to supply different prompt to the agent\"\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [],\n        help=\"List of plugins / tools to be used in the agent\",\n    )\n\n    @staticmethod\n    def safeguard_run(run_func, *args, **kwargs):\n        def wrapper(self, *args, **kwargs):\n            try:\n                return run_func(self, *args, **kwargs)\n            except Exception as e:\n                return AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"failed\",\n                    error=str(e),\n                )\n\n        return wrapper\n\n    def add_tools(self, tools: list[BaseTool]) -> None:\n        \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n        self.plugins.extend(tools)\n\n    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n        \"\"\"Run the component.\"\"\"\n        raise NotImplementedError()\n
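    A minimal concrete agent implements run and returns an AgentOutput; a toy sketch (the AgentOutput/AgentType import path is assumed from the io reference locations below):

    from kotaemon.agents import BaseAgent
    from kotaemon.agents.io import AgentOutput, AgentType  # assumption

    class EchoAgent(BaseAgent):
        """Toy agent for illustration only."""

        name: str = "EchoAgent"
        agent_type: AgentType = AgentType.vanilla
        description: str = "Echoes the instruction back."

        def run(self, instruction: str) -> AgentOutput:
            return AgentOutput(
                text=instruction,
                agent_type=self.agent_type,
                status="finished",
            )

    print(EchoAgent().run("ping").text)  # -> "ping"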
    "},{"location":"reference/agents/base/#agents.base.BaseAgent.add_tools","title":"add_tools","text":"
    add_tools(tools)\n

    Helper method to add tools and update agent state if needed

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def add_tools(self, tools: list[BaseTool]) -> None:\n    \"\"\"Helper method to add tools and update agent state if needed\"\"\"\n    self.plugins.extend(tools)\n
    "},{"location":"reference/agents/base/#agents.base.BaseAgent.run","title":"run","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/agents/base.py
    def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]:\n    \"\"\"Run the component.\"\"\"\n    raise NotImplementedError()\n
    "},{"location":"reference/agents/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/agents/langchain_based/#agents.langchain_based.LangchainAgent","title":"LangchainAgent","text":"

    Bases: BaseAgent

    Wrapper for Langchain Agent

    Source code in libs/kotaemon/kotaemon/agents/langchain_based.py
    class LangchainAgent(BaseAgent):\n    \"\"\"Wrapper for Langchain Agent\"\"\"\n\n    name: str = \"LangchainAgent\"\n    agent_type: AgentType\n    description: str = \"LangchainAgent for answering multi-step reasoning questions\"\n    AGENT_TYPE_MAP = {\n        AgentType.openai: LCAgentType.OPENAI_FUNCTIONS,\n        AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS,\n        AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION,\n        AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH,\n    }\n    agent: Optional[LCAgentExecutor] = None\n\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        if self.agent_type not in self.AGENT_TYPE_MAP:\n            raise NotImplementedError(\n                f\"AgentType {self.agent_type } not supported by Langchain wrapper\"\n            )\n        self.update_agent_tools()\n\n    def update_agent_tools(self):\n        assert isinstance(self.llm, (ChatLLM, LLM))\n        langchain_plugins = [tool.to_langchain_format() for tool in self.plugins]\n\n        # a fix for search_doc tool name:\n        # use \"Intermediate Answer\" for self-ask agent\n        found_search_tool = False\n        if self.agent_type == AgentType.self_ask:\n            for plugin in langchain_plugins:\n                if plugin.name == \"search_doc\":\n                    plugin.name = \"Intermediate Answer\"\n                    langchain_plugins = [plugin]\n                    found_search_tool = True\n                    break\n\n        if self.agent_type != AgentType.self_ask or found_search_tool:\n            # reinit Langchain AgentExecutor\n            self.agent = initialize_agent(\n                langchain_plugins,\n                self.llm.to_langchain_format(),\n                agent=self.AGENT_TYPE_MAP[self.agent_type],\n                handle_parsing_errors=True,\n                verbose=True,\n            )\n\n    def add_tools(self, tools: List[BaseTool]) -> None:\n        super().add_tools(tools)\n        self.update_agent_tools()\n        return\n\n    def run(self, instruction: str) -> AgentOutput:\n        assert (\n            self.agent is not None\n        ), \"Lanchain AgentExecutor is not correctly initialized\"\n\n        # Langchain AgentExecutor call\n        output = self.agent(instruction)[\"output\"]\n\n        return AgentOutput(\n            text=output,\n            agent_type=self.agent_type,\n            status=\"finished\",\n        )\n
    "},{"location":"reference/agents/utils/","title":"Utils","text":""},{"location":"reference/agents/utils/#agents.utils.get_plugin_response_content","title":"get_plugin_response_content","text":"
    get_plugin_response_content(output)\n

    Wrapper that returns a plugin's output as a string: the text of a Document, or str(output) otherwise.

    Source code in libs/kotaemon/kotaemon/agents/utils.py
    def get_plugin_response_content(output) -> str:\n    \"\"\"\n    Wrapper for AgentOutput content return\n    \"\"\"\n    if isinstance(output, Document):\n        return output.text\n    else:\n        return str(output)\n
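    A quick sketch (assuming Document is importable from kotaemon.base):

    from kotaemon.agents.utils import get_plugin_response_content
    from kotaemon.base import Document  # assumption on the import path

    print(get_plugin_response_content(Document(text="hello")))  # -> "hello"
    print(get_plugin_response_content(42))                      # -> "42"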
    "},{"location":"reference/agents/utils/#agents.utils.calculate_cost","title":"calculate_cost","text":"
    calculate_cost(model_name, prompt_token, completion_token)\n

    Calculate the cost of a prompt and completion.

    Returns:

    float: Cost of the provided model name with the provided token information.

    Source code in libs/kotaemon/kotaemon/agents/utils.py
    def calculate_cost(model_name: str, prompt_token: int, completion_token: int) -> float:\n    \"\"\"\n    Calculate the cost of a prompt and completion.\n\n    Returns:\n        float: Cost of the provided model name with provided token information\n    \"\"\"\n    # TODO: to be implemented\n    return 0.0\n
    "},{"location":"reference/agents/io/","title":"Io","text":""},{"location":"reference/agents/io/#agents.io.AgentAction","title":"AgentAction dataclass","text":"

    Agent's action to take.

    Parameters:

    tool (str, required): The tool to invoke.

    tool_input (Union[str, dict], required): The input to the tool.

    log (str, required): The log message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    @dataclass\nclass AgentAction:\n    \"\"\"Agent's action to take.\n\n    Args:\n        tool: The tool to invoke.\n        tool_input: The input to the tool.\n        log: The log message.\n    \"\"\"\n\n    tool: str\n    tool_input: Union[str, dict]\n    log: str\n
    "},{"location":"reference/agents/io/#agents.io.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

    Parameters:

    return_values (required): The return values of the agent.

    log (required): The log message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
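AgentFinish is a NamedTuple; by convention (see the ReactAgent parser below) the final answer is stored under the "output" key of return_values.

from kotaemon.agents.io import AgentFinish

finish = AgentFinish(return_values={"output": "Paris"}, log="Final Answer: Paris")
print(finish.return_values["output"])  # "Paris"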
    "},{"location":"reference/agents/io/#agents.io.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

Parameters:

text (str, required): The text output from the agent.
agent_type (AgentType, required): The type of agent.
status (Literal["thinking", "finished", "stopped", "failed"], required): The status after executing the agent.
error (Optional[str], default None): The error message, if any.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
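A minimal construction using the declared fields (assuming the inherited LLMInterface fields all have defaults); because model_config allows extra fields, additional keyword arguments are also accepted.

from kotaemon.agents.io import AgentOutput, AgentType

out = AgentOutput(
    text="The answer is 42.",
    agent_type=AgentType.react,
    status="finished",  # "thinking" | "finished" | "stopped" | "failed"
)
print(out.type)  # defaults to "agent"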
    "},{"location":"reference/agents/io/#agents.io.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
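The enum values double as their string names, so a value coming from configuration can be resolved directly (standard Enum behavior):

from kotaemon.agents.io import AgentType

assert AgentType("react") is AgentType.react
print([t.value for t in AgentType])  # all supported agent type strings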
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad--attributes","title":"Attributes:","text":"

logger (logging.Logger): The logger object used to log messages.

    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
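Most BaseScratchPad methods are silent unless logging is enabled; per check_log (documented on the Base page below), that means the LOG_PATH environment variable must be set. A short usage sketch:

import os
os.environ["LOG_PATH"] = "./logs"  # any value enables check_log()

from kotaemon.agents.io import BaseScratchPad

pad = BaseScratchPad()
pad.thinking("planner")                      # logs "planner is thinking..."
pad.panel_print({"step": 1}, title="Trace")  # also appended to pad.log
pad.print("final answer: 42")
print(pad.log)                               # items kept by print()/panel_print()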
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

Parameters:

item (Any, required): The item to log.
title (str, optional, default "Output"): The title of the panel.
stream (bool, optional, default False): When False, the item is also appended to the scratchpad's log.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/io/#agents.io.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/","title":"Base","text":""},{"location":"reference/agents/io/base/#agents.io.base.AgentType","title":"AgentType","text":"

    Bases: Enum

    Enumerated type for agent types.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentType(Enum):\n    \"\"\"\n    Enumerated type for agent types.\n    \"\"\"\n\n    openai = \"openai\"\n    openai_multi = \"openai_multi\"\n    openai_tool = \"openai_tool\"\n    self_ask = \"self_ask\"\n    react = \"react\"\n    rewoo = \"rewoo\"\n    vanilla = \"vanilla\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad","title":"BaseScratchPad","text":"

    Base class for output handlers.

    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--attributes","title":"Attributes:","text":"

logger (logging.Logger): The logger object used to log messages.

    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad--methods","title":"Methods:","text":"

    stop(): Stop the output.

    update_status(output: str, **kwargs): Update the status of the output.

    thinking(name: str): Log that a process is thinking.

    done(_all=False): Log that the process is done.

    stream_print(item: str): Not implemented.

    json_print(item: Dict[str, Any]): Log a JSON object.

    panel_print(item: Any, title: str = \"Output\", stream: bool = False): Log a panel output.

    clear(): Not implemented.

    print(content: str, **kwargs): Log arbitrary content.

    format_json(json_obj: str): Format a JSON object.

    debug(content: str, **kwargs): Log a debug message.

    info(content: str, **kwargs): Log an informational message.

    warning(content: str, **kwargs): Log a warning message.

    error(content: str, **kwargs): Log an error message.

    critical(content: str, **kwargs): Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class BaseScratchPad:\n    \"\"\"\n    Base class for output handlers.\n\n    Attributes:\n    -----------\n    logger : logging.Logger\n        The logger object to log messages.\n\n    Methods:\n    --------\n    stop():\n        Stop the output.\n\n    update_status(output: str, **kwargs):\n        Update the status of the output.\n\n    thinking(name: str):\n        Log that a process is thinking.\n\n    done(_all=False):\n        Log that the process is done.\n\n    stream_print(item: str):\n        Not implemented.\n\n    json_print(item: Dict[str, Any]):\n        Log a JSON object.\n\n    panel_print(item: Any, title: str = \"Output\", stream: bool = False):\n        Log a panel output.\n\n    clear():\n        Not implemented.\n\n    print(content: str, **kwargs):\n        Log arbitrary content.\n\n    format_json(json_obj: str):\n        Format a JSON object.\n\n    debug(content: str, **kwargs):\n        Log a debug message.\n\n    info(content: str, **kwargs):\n        Log an informational message.\n\n    warning(content: str, **kwargs):\n        Log a warning message.\n\n    error(content: str, **kwargs):\n        Log an error message.\n\n    critical(content: str, **kwargs):\n        Log a critical message.\n    \"\"\"\n\n    def __init__(self):\n        \"\"\"\n        Initialize the BaseOutput object.\n\n        \"\"\"\n        self.logger = logging\n        self.log = []\n\n    def stop(self):\n        \"\"\"\n        Stop the output.\n        \"\"\"\n\n    def update_status(self, output: str, **kwargs):\n        \"\"\"\n        Update the status of the output.\n        \"\"\"\n        if check_log():\n            self.logger.info(output)\n\n    def thinking(self, name: str):\n        \"\"\"\n        Log that a process is thinking.\n        \"\"\"\n        if check_log():\n            self.logger.info(f\"{name} is thinking...\")\n\n    def done(self, _all=False):\n        \"\"\"\n        Log that the process is done.\n        \"\"\"\n\n        if check_log():\n            self.logger.info(\"Done\")\n\n    def stream_print(self, item: str):\n        \"\"\"\n        Stream print.\n        \"\"\"\n\n    def json_print(self, item: Dict[str, Any]):\n        \"\"\"\n        Log a JSON object.\n        \"\"\"\n        if check_log():\n            self.logger.info(json.dumps(item, indent=2))\n\n    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n        \"\"\"\n        Log a panel output.\n\n        Args:\n            item : Any\n                The item to log.\n            title : str, optional\n                The title of the panel, defaults to \"Output\".\n            stream : bool, optional\n        \"\"\"\n        if not stream:\n            self.log.append(item)\n        if check_log():\n            self.logger.info(\"-\" * 20)\n            self.logger.info(item)\n            self.logger.info(\"-\" * 20)\n\n    def clear(self):\n        \"\"\"\n        Not implemented.\n        \"\"\"\n\n    def print(self, content: str, **kwargs):\n        \"\"\"\n        Log arbitrary content.\n        \"\"\"\n        self.log.append(content)\n        if check_log():\n            self.logger.info(content)\n\n    def format_json(self, json_obj: str):\n        \"\"\"\n        Format a JSON object.\n        \"\"\"\n        formatted_json = json.dumps(json_obj, indent=2)\n        return formatted_json\n\n    def debug(self, content: str, **kwargs):\n        \"\"\"\n        Log a debug message.\n        \"\"\"\n        if check_log():\n            
self.logger.debug(content, **kwargs)\n\n    def info(self, content: str, **kwargs):\n        \"\"\"\n        Log an informational message.\n        \"\"\"\n        if check_log():\n            self.logger.info(content, **kwargs)\n\n    def warning(self, content: str, **kwargs):\n        \"\"\"\n        Log a warning message.\n        \"\"\"\n        if check_log():\n            self.logger.warning(content, **kwargs)\n\n    def error(self, content: str, **kwargs):\n        \"\"\"\n        Log an error message.\n        \"\"\"\n        if check_log():\n            self.logger.error(content, **kwargs)\n\n    def critical(self, content: str, **kwargs):\n        \"\"\"\n        Log a critical message.\n        \"\"\"\n        if check_log():\n            self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stop","title":"stop","text":"
    stop()\n

    Stop the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stop(self):\n    \"\"\"\n    Stop the output.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.update_status","title":"update_status","text":"
    update_status(output, **kwargs)\n

    Update the status of the output.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def update_status(self, output: str, **kwargs):\n    \"\"\"\n    Update the status of the output.\n    \"\"\"\n    if check_log():\n        self.logger.info(output)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.thinking","title":"thinking","text":"
    thinking(name)\n

    Log that a process is thinking.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def thinking(self, name: str):\n    \"\"\"\n    Log that a process is thinking.\n    \"\"\"\n    if check_log():\n        self.logger.info(f\"{name} is thinking...\")\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.done","title":"done","text":"
    done(_all=False)\n

    Log that the process is done.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def done(self, _all=False):\n    \"\"\"\n    Log that the process is done.\n    \"\"\"\n\n    if check_log():\n        self.logger.info(\"Done\")\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.stream_print","title":"stream_print","text":"
    stream_print(item)\n

    Stream print.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def stream_print(self, item: str):\n    \"\"\"\n    Stream print.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.json_print","title":"json_print","text":"
    json_print(item)\n

    Log a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def json_print(self, item: Dict[str, Any]):\n    \"\"\"\n    Log a JSON object.\n    \"\"\"\n    if check_log():\n        self.logger.info(json.dumps(item, indent=2))\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.panel_print","title":"panel_print","text":"
    panel_print(item, title='Output', stream=False)\n

    Log a panel output.

Parameters:

item (Any, required): The item to log.
title (str, optional, default "Output"): The title of the panel.
stream (bool, optional, default False): When False, the item is also appended to the scratchpad's log.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def panel_print(self, item: Any, title: str = \"Output\", stream: bool = False):\n    \"\"\"\n    Log a panel output.\n\n    Args:\n        item : Any\n            The item to log.\n        title : str, optional\n            The title of the panel, defaults to \"Output\".\n        stream : bool, optional\n    \"\"\"\n    if not stream:\n        self.log.append(item)\n    if check_log():\n        self.logger.info(\"-\" * 20)\n        self.logger.info(item)\n        self.logger.info(\"-\" * 20)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.clear","title":"clear","text":"
    clear()\n

    Not implemented.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def clear(self):\n    \"\"\"\n    Not implemented.\n    \"\"\"\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.print","title":"print","text":"
    print(content, **kwargs)\n

    Log arbitrary content.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def print(self, content: str, **kwargs):\n    \"\"\"\n    Log arbitrary content.\n    \"\"\"\n    self.log.append(content)\n    if check_log():\n        self.logger.info(content)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.format_json","title":"format_json","text":"
    format_json(json_obj)\n

    Format a JSON object.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def format_json(self, json_obj: str):\n    \"\"\"\n    Format a JSON object.\n    \"\"\"\n    formatted_json = json.dumps(json_obj, indent=2)\n    return formatted_json\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.debug","title":"debug","text":"
    debug(content, **kwargs)\n

    Log a debug message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def debug(self, content: str, **kwargs):\n    \"\"\"\n    Log a debug message.\n    \"\"\"\n    if check_log():\n        self.logger.debug(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.info","title":"info","text":"
    info(content, **kwargs)\n

    Log an informational message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def info(self, content: str, **kwargs):\n    \"\"\"\n    Log an informational message.\n    \"\"\"\n    if check_log():\n        self.logger.info(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.warning","title":"warning","text":"
    warning(content, **kwargs)\n

    Log a warning message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def warning(self, content: str, **kwargs):\n    \"\"\"\n    Log a warning message.\n    \"\"\"\n    if check_log():\n        self.logger.warning(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.error","title":"error","text":"
    error(content, **kwargs)\n

    Log an error message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def error(self, content: str, **kwargs):\n    \"\"\"\n    Log an error message.\n    \"\"\"\n    if check_log():\n        self.logger.error(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.BaseScratchPad.critical","title":"critical","text":"
    critical(content, **kwargs)\n

    Log a critical message.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def critical(self, content: str, **kwargs):\n    \"\"\"\n    Log a critical message.\n    \"\"\"\n    if check_log():\n        self.logger.critical(content, **kwargs)\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentAction","title":"AgentAction dataclass","text":"

    Agent's action to take.

Parameters:

tool (str, required): The tool to invoke.
tool_input (Union[str, dict], required): The input to the tool.
log (str, required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    @dataclass\nclass AgentAction:\n    \"\"\"Agent's action to take.\n\n    Args:\n        tool: The tool to invoke.\n        tool_input: The input to the tool.\n        log: The log message.\n    \"\"\"\n\n    tool: str\n    tool_input: Union[str, dict]\n    log: str\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentFinish","title":"AgentFinish","text":"

    Bases: NamedTuple

    Agent's return value when finishing execution.

Parameters:

return_values (dict, required): The return values of the agent.
log (str, required): The log message.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentFinish(NamedTuple):\n    \"\"\"Agent's return value when finishing execution.\n\n    Args:\n        return_values: The return values of the agent.\n        log: The log message.\n    \"\"\"\n\n    return_values: dict\n    log: str\n
    "},{"location":"reference/agents/io/base/#agents.io.base.AgentOutput","title":"AgentOutput","text":"

    Bases: LLMInterface

    Output from an agent.

Parameters:

text (str, required): The text output from the agent.
agent_type (AgentType, required): The type of agent.
status (Literal["thinking", "finished", "stopped", "failed"], required): The status after executing the agent.
error (Optional[str], default None): The error message, if any.

Source code in libs/kotaemon/kotaemon/agents/io/base.py
    class AgentOutput(LLMInterface):\n    \"\"\"Output from an agent.\n\n    Args:\n        text: The text output from the agent.\n        agent_type: The type of agent.\n        status: The status after executing the agent.\n        error: The error message if any.\n    \"\"\"\n\n    model_config = ConfigDict(extra=\"allow\")\n\n    text: str\n    type: str = \"agent\"\n    agent_type: AgentType\n    status: Literal[\"thinking\", \"finished\", \"stopped\", \"failed\"]\n    error: Optional[str] = None\n    intermediate_steps: Optional[list] = None\n
    "},{"location":"reference/agents/io/base/#agents.io.base.check_log","title":"check_log","text":"
    check_log()\n

Checks if logging has been enabled.

Returns:

bool: True if logging has been enabled, False otherwise.

    Source code in libs/kotaemon/kotaemon/agents/io/base.py
    def check_log():\n    \"\"\"\n    Checks if logging has been enabled.\n    :return: True if logging has been enabled, False otherwise.\n    :rtype: bool\n    \"\"\"\n    return os.environ.get(\"LOG_PATH\", None) is not None\n
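A self-contained check of the behavior above (it manipulates the process environment, so run it in isolation):

import os
from kotaemon.agents.io.base import check_log

os.environ.pop("LOG_PATH", None)
assert check_log() is False        # unset: logging disabled
os.environ["LOG_PATH"] = "./logs"  # any value enables it
assert check_log() is True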
    "},{"location":"reference/agents/react/","title":"React","text":""},{"location":"reference/agents/react/#agents.react.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

Sequential ReactAgent class, inheriting from BaseAgent. Implements the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
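A minimal end-to-end sketch, with the same caveats as the LangchainAgent example above: ChatOpenAI and WikipediaTool are assumptions standing in for any BaseLLM and BaseTool.

from kotaemon.agents import ReactAgent
from kotaemon.agents.tools import WikipediaTool
from kotaemon.llms import ChatOpenAI

agent = ReactAgent(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    plugins=[WikipediaTool()],
    max_iterations=5,
)
result = agent.run("In which year was the Eiffel Tower completed?")
print(result.status)  # "finished", or "stopped" if the iteration cap was hit
print(result.text)
for action, observation in result.intermediate_steps:
    print(type(action).__name__, str(observation)[:80])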
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

Parameters:

instruction (required): Instruction to run the agent with.
max_iterations (optional, default None): Maximum number of reasoning iterations; when None, the agent's own max_iterations (default 5) is used.

Returns:

AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
    "},{"location":"reference/agents/react/#agents.react.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

Parameters:

instruction (required): Instruction to run the agent with.
max_iterations (optional, default None): Maximum number of reasoning iterations; when None, the agent's own max_iterations (default 5) is used.

Returns:

AgentOutput objects, yielded as the agent progresses.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
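Consuming the generator (continuing the ReactAgent sketch above): intermediate chunks carry status "thinking"; the last chunk is "finished", or "stopped" when the iteration cap is reached.

for chunk in agent.stream("Summarize the ReAct paradigm in one sentence."):
    if chunk.status == "thinking":
        print("step:", chunk.intermediate_steps)
    else:
        print("final:", chunk.text)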
    "},{"location":"reference/agents/react/agent/","title":"Agent","text":""},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent","title":"ReactAgent","text":"

    Bases: BaseAgent

Sequential ReactAgent class, inheriting from BaseAgent. Implements the ReAct agent paradigm (https://arxiv.org/pdf/2210.03629.pdf).

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    class ReactAgent(BaseAgent):\n    \"\"\"\n    Sequential ReactAgent class inherited from BaseAgent.\n    Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf\n    \"\"\"\n\n    name: str = \"ReactAgent\"\n    agent_type: AgentType = AgentType.react\n    description: str = \"ReactAgent for answering multi-step reasoning questions\"\n    llm: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    output_lang: str = \"English\"\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"List of tools to be used in the agent. \"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent. \"\n    )\n    intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param(\n        default_callback=lambda _: [],\n        help=\"List of AgentAction and observation (tool) output\",\n    )\n    max_iterations: int = 5\n    strict_decode: bool = False\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    def _compose_plugin_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for plugin in self.plugins:\n                prompt += f\"{plugin.name}[input]: {plugin.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _construct_scratchpad(\n        self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = []\n    ) -> str:\n        \"\"\"Construct the scratchpad that lets the agent continue its thought process.\"\"\"\n        thoughts = \"\"\n        for action, observation in intermediate_steps:\n            thoughts += action.log\n            thoughts += f\"\\nObservation: {observation}\\nThought:\"\n        return thoughts\n\n    def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]:\n        \"\"\"\n        Parse text output from LLM for the next Action or Final Answer\n        Using Regex to parse \"Action:\\n Action Input:\\n\" for the next Action\n        Using FINAL_ANSWER_ACTION to parse Final Answer\n\n        Args:\n            text[str]: input text to parse\n        \"\"\"\n        includes_answer = FINAL_ANSWER_ACTION in text\n        regex = (\n            r\"Action\\s*\\d*\\s*:[\\s]*(.*?)[\\s]*Action\\s*\\d*\\s*Input\\s*\\d*\\s*:[\\s]*(.*)\"\n        )\n        action_match = re.search(regex, text, re.DOTALL)\n        action_output: Optional[AgentAction | AgentFinish] = None\n        if action_match:\n            if includes_answer:\n                raise Exception(\n                    \"Parsing LLM output produced both a final answer \"\n                    f\"and a parse-able action: {text}\"\n                )\n            action = action_match.group(1).strip()\n            action_input = action_match.group(2)\n            tool_input = action_input.strip(\" \")\n            # ensure if its a well formed SQL query we don't remove any trailing \" chars\n            if tool_input.startswith(\"SELECT \") is False:\n                tool_input = tool_input.strip('\"')\n\n            action_output = AgentAction(action, tool_input, text)\n\n        elif includes_answer:\n            
action_output = AgentFinish(\n                {\"output\": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text\n            )\n        else:\n            if self.strict_decode:\n                raise Exception(f\"Could not parse LLM output: `{text}`\")\n            else:\n                action_output = AgentFinish({\"output\": text}, text)\n\n        return action_output\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        agent_scratchpad = self._construct_scratchpad(self.intermediate_steps)\n        tool_description = self._compose_plugin_description()\n        tool_names = \", \".join([plugin.name for plugin in self.plugins])\n        if self.prompt_template is None:\n            from .prompt import zero_shot_react_prompt\n\n            self.prompt_template = zero_shot_react_prompt\n        return self.prompt_template.populate(\n            instruction=instruction,\n            agent_scratchpad=agent_scratchpad,\n            tool_description=tool_description,\n            tool_names=tool_names,\n            lang=self.output_lang,\n        )\n\n    def _format_function_map(self) -> dict[str, BaseTool]:\n        \"\"\"Format the function map for the open AI function API.\n\n        Return:\n            Dict[str, Callable]: The function map.\n        \"\"\"\n        # Map the function name to the real function object.\n        function_map = {}\n        for plugin in self.plugins:\n            function_map[plugin.name] = plugin\n        return function_map\n\n    def _trim(self, text: str | Document) -> str:\n        \"\"\"\n        Trim the text to the maximum token length.\n        \"\"\"\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if isinstance(text, str):\n            texts = evidence_trim_func([Document(text=text)])\n        elif isinstance(text, Document):\n            texts = evidence_trim_func([text])\n        else:\n            raise ValueError(\"Invalid text type to trim\")\n        trim_text = texts[0].text\n        logging.info(f\"len (trimmed): {len(trim_text)}\")\n        return trim_text\n\n    def clear(self):\n        \"\"\"\n        Clear and reset the agent.\n        \"\"\"\n        self.intermediate_steps = []\n\n    def run(self, instruction, max_iterations=None) -> AgentOutput:\n        \"\"\"\n        Run the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n         
   prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n                result = \"\"\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                break\n        else:\n            status = \"stopped\"\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n\n    def stream(self, instruction, max_iterations=None):\n        \"\"\"\n        Stream the agent with the given instruction.\n\n        Args:\n            instruction: Instruction to run the agent with.\n            max_iterations: Maximum number of iterations\n                of reasoning steps, defaults to 10.\n\n        Return:\n            AgentOutput object.\n        \"\"\"\n        if not max_iterations:\n            max_iterations = self.max_iterations\n        assert max_iterations > 0\n\n        self.clear()\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        print(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n        status = \"failed\"\n        response_text = None\n\n        for step_count in range(1, max_iterations + 1):\n            prompt = self._compose_prompt(instruction)\n            logging.info(f\"Prompt: {prompt}\")\n            print(f\"Prompt: {prompt}\")\n            response = self.llm(\n                prompt, stop=[\"Observation:\"]\n            )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n            response_text = response.text\n            logging.info(f\"Response: {response_text}\")\n            print(f\"Response: {response_text}\")\n            action_step = self._parse_output(response_text)\n            if action_step is None:\n                raise ValueError(\"Invalid action\")\n            is_finished_chain = isinstance(action_step, AgentFinish)\n            if is_finished_chain:\n               
 result = response_text\n                if \"Final Answer:\" in response_text:\n                    result = response_text.split(\"Final Answer:\")[-1].strip()\n            else:\n                assert isinstance(action_step, AgentAction)\n                action_name = action_step.tool\n                tool_input = action_step.tool_input\n                logging.info(f\"Action: {action_name}\")\n                print(f\"Action: {action_name}\")\n                logging.info(f\"Tool Input: {tool_input}\")\n                print(f\"Tool Input: {tool_input}\")\n                result = self._format_function_map()[action_name](tool_input)\n\n                # trim the worker output to 1000 tokens, as we are appending\n                # all workers' logs and it can exceed the token limit if we\n                # don't limit each. Fix this number regarding to the LLM capacity.\n                result = self._trim(result)\n                logging.info(f\"Result: {result}\")\n                print(f\"Result: {result}\")\n\n            self.intermediate_steps.append((action_step, result))\n            if is_finished_chain:\n                logging.info(f\"Finished after {step_count} steps.\")\n                status = \"finished\"\n                yield AgentOutput(\n                    text=result,\n                    agent_type=self.agent_type,\n                    status=status,\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n                break\n            else:\n                yield AgentOutput(\n                    text=\"\",\n                    agent_type=self.agent_type,\n                    status=\"thinking\",\n                    intermediate_steps=self.intermediate_steps[-1],\n                )\n\n        else:\n            status = \"stopped\"\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n        return AgentOutput(\n            text=response_text,\n            agent_type=self.agent_type,\n            status=status,\n            total_tokens=total_token,\n            total_cost=total_cost,\n            intermediate_steps=self.intermediate_steps,\n            max_iterations=max_iterations,\n        )\n
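    For orientation, a minimal construction sketch follows. The import paths and the ChatOpenAI/WikipediaTool names are illustrative assumptions and may differ across kotaemon versions; any BaseLLM-compatible wrapper and any BaseTool subclass should work the same way.

    # Hypothetical setup; adjust imports, model names and credentials to your environment.
    from kotaemon.agents import ReactAgent, WikipediaTool
    from kotaemon.llms import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4o-mini")   # any BaseLLM-compatible wrapper
    agent = ReactAgent(
        llm=llm,
        plugins=[WikipediaTool()],          # tools the agent may call by name
        max_iterations=5,                   # cap on the Thought/Action/Observation loop
    )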
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.clear","title":"clear","text":"
    clear()\n

    Clear and reset the agent.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def clear(self):\n    \"\"\"\n    Clear and reset the agent.\n    \"\"\"\n    self.intermediate_steps = []\n
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.run","title":"run","text":"
    run(instruction, max_iterations=None)\n

    Run the agent with the given instruction.

    Parameters:

    instruction (required): Instruction to run the agent with.

    max_iterations (default: None): Maximum number of iterations of reasoning steps, defaults to 10.

    Return:

    AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def run(self, instruction, max_iterations=None) -> AgentOutput:\n    \"\"\"\n    Run the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = \"\"\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            break\n    else:\n        status = \"stopped\"\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
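    A short usage sketch, continuing the hypothetical agent constructed above:

    # Run to completion and inspect the result.
    output = agent.run("Who won the 2018 FIFA World Cup?")
    print(output.status)                    # "finished" or "stopped"
    print(output.text)                      # the final LLM response text
    for action, observation in output.intermediate_steps:
        print(action, observation)          # the ReAct trace, one step at a time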
    "},{"location":"reference/agents/react/agent/#agents.react.agent.ReactAgent.stream","title":"stream","text":"
    stream(instruction, max_iterations=None)\n

    Stream the agent with the given instruction.

    Parameters:

    instruction (required): Instruction to run the agent with.

    max_iterations (default: None): Maximum number of iterations of reasoning steps, defaults to 10.

    Return:

    AgentOutput object.

    Source code in libs/kotaemon/kotaemon/agents/react/agent.py
    def stream(self, instruction, max_iterations=None):\n    \"\"\"\n    Stream the agent with the given instruction.\n\n    Args:\n        instruction: Instruction to run the agent with.\n        max_iterations: Maximum number of iterations\n            of reasoning steps, defaults to 10.\n\n    Return:\n        AgentOutput object.\n    \"\"\"\n    if not max_iterations:\n        max_iterations = self.max_iterations\n    assert max_iterations > 0\n\n    self.clear()\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    print(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n    status = \"failed\"\n    response_text = None\n\n    for step_count in range(1, max_iterations + 1):\n        prompt = self._compose_prompt(instruction)\n        logging.info(f\"Prompt: {prompt}\")\n        print(f\"Prompt: {prompt}\")\n        response = self.llm(\n            prompt, stop=[\"Observation:\"]\n        )  # TODO: could cause bugs if llm doesn't have `stop` as a parameter\n        response_text = response.text\n        logging.info(f\"Response: {response_text}\")\n        print(f\"Response: {response_text}\")\n        action_step = self._parse_output(response_text)\n        if action_step is None:\n            raise ValueError(\"Invalid action\")\n        is_finished_chain = isinstance(action_step, AgentFinish)\n        if is_finished_chain:\n            result = response_text\n            if \"Final Answer:\" in response_text:\n                result = response_text.split(\"Final Answer:\")[-1].strip()\n        else:\n            assert isinstance(action_step, AgentAction)\n            action_name = action_step.tool\n            tool_input = action_step.tool_input\n            logging.info(f\"Action: {action_name}\")\n            print(f\"Action: {action_name}\")\n            logging.info(f\"Tool Input: {tool_input}\")\n            print(f\"Tool Input: {tool_input}\")\n            result = self._format_function_map()[action_name](tool_input)\n\n            # trim the worker output to 1000 tokens, as we are appending\n            # all workers' logs and it can exceed the token limit if we\n            # don't limit each. 
Fix this number regarding to the LLM capacity.\n            result = self._trim(result)\n            logging.info(f\"Result: {result}\")\n            print(f\"Result: {result}\")\n\n        self.intermediate_steps.append((action_step, result))\n        if is_finished_chain:\n            logging.info(f\"Finished after {step_count} steps.\")\n            status = \"finished\"\n            yield AgentOutput(\n                text=result,\n                agent_type=self.agent_type,\n                status=status,\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n            break\n        else:\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=self.intermediate_steps[-1],\n            )\n\n    else:\n        status = \"stopped\"\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=status,\n            intermediate_steps=self.intermediate_steps[-1],\n        )\n\n    return AgentOutput(\n        text=response_text,\n        agent_type=self.agent_type,\n        status=status,\n        total_tokens=total_token,\n        total_cost=total_cost,\n        intermediate_steps=self.intermediate_steps,\n        max_iterations=max_iterations,\n    )\n
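    Unlike run(), stream() is a generator that yields an AgentOutput per reasoning step. A sketch, again using the hypothetical agent from above:

    # Consume the stream; "thinking" steps carry intermediate progress only.
    for step in agent.stream("Summarize the main idea of the ReAct paper"):
        if step.status == "finished":
            print(step.text)                # final answer (text after "Final Answer:")
        else:
            ...                             # e.g. render step.intermediate_steps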
    "},{"location":"reference/agents/react/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/","title":"Rewoo","text":""},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class, inheriting from BaseAgent, that implements the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
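    A usage sketch for the blocking call, continuing the hypothetical agent above:

    output = agent.run(
        "Compare the populations of Canada and Australia", use_citation=True
    )
    print(output.text)                      # the solver's final answer
    print(output.metadata["worker_log"])    # the executed plan/evidence log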
    "},{"location":"reference/agents/rewoo/#agents.rewoo.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
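    The streaming variant first yields the planner log, then per-plan worker progress, then the solver's tokens. A sketch:

    for step in agent.stream("Compare the populations of Canada and Australia"):
        if step.intermediate_steps:         # planner/worker progress updates
            print(step.intermediate_steps[-1])
        elif step.text:                     # streamed solver output
            print(step.text, end="")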
    "},{"location":"reference/agents/rewoo/agent/","title":"Agent","text":""},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent","title":"RewooAgent","text":"

    Bases: BaseAgent

    Distributive RewooAgent class, inheriting from BaseAgent, that implements the ReWOO paradigm (https://arxiv.org/pdf/2305.18323.pdf).

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    class RewooAgent(BaseAgent):\n    \"\"\"Distributive RewooAgent class inherited from BaseAgent.\n    Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf\"\"\"\n\n    name: str = \"RewooAgent\"\n    agent_type: AgentType = AgentType.rewoo\n    description: str = \"RewooAgent for answering multi-step reasoning questions\"\n    output_lang: str = \"English\"\n    planner_llm: BaseLLM\n    solver_llm: BaseLLM\n    prompt_template: dict[str, PromptTemplate] = Param(\n        default_callback=lambda _: {},\n        help=\"A dict to supply different prompt to the agent.\",\n    )\n    plugins: list[BaseTool] = Param(\n        default_callback=lambda _: [], help=\"A list of plugins to be used in the model.\"\n    )\n    examples: dict[str, str | list[str]] = Param(\n        default_callback=lambda _: {}, help=\"Examples to be used in the agent.\"\n    )\n    max_context_length: int = Param(\n        default=3000,\n        help=\"Max context length for each tool output.\",\n    )\n    trim_func: TokenSplitter | None = None\n\n    @Node.auto(depends_on=[\"planner_llm\", \"plugins\", \"prompt_template\", \"examples\"])\n    def planner(self):\n        return Planner(\n            model=self.planner_llm,\n            plugins=self.plugins,\n            prompt_template=self.prompt_template.get(\"Planner\", None),\n            examples=self.examples.get(\"Planner\", None),\n        )\n\n    @Node.auto(depends_on=[\"solver_llm\", \"prompt_template\", \"examples\"])\n    def solver(self):\n        return Solver(\n            model=self.solver_llm,\n            prompt_template=self.prompt_template.get(\"Solver\", None),\n            examples=self.examples.get(\"Solver\", None),\n            output_lang=self.output_lang,\n        )\n\n    def _parse_plan_map(\n        self, planner_response: str\n    ) -> tuple[dict[str, list[str]], dict[str, str]]:\n        \"\"\"\n        Parse planner output. It should be an n-to-n mapping from Plans to #Es.\n        This is because sometimes LLM cannot follow the strict output format.\n        Example:\n            #Plan1\n            #E1\n            #E2\n        should result in: {\"#Plan1\": [\"#E1\", \"#E2\"]}\n        Or:\n            #Plan1\n            #Plan2\n            #E1\n        should result in: {\"#Plan1\": [], \"#Plan2\": [\"#E1\"]}\n        This function should also return a plan map.\n\n        Returns:\n            tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map\n        \"\"\"\n        valid_chunk = [\n            line\n            for line in planner_response.splitlines()\n            if line.startswith(\"#Plan\") or line.startswith(\"#E\")\n        ]\n\n        plan_to_es: dict[str, list[str]] = dict()\n        plans: dict[str, str] = dict()\n        prev_key = \"\"\n        for line in valid_chunk:\n            key, description = line.split(\":\", 1)\n            key = key.strip()\n            if key.startswith(\"#Plan\"):\n                plans[key] = description.strip()\n                plan_to_es[key] = []\n                prev_key = key\n            elif key.startswith(\"#E\"):\n                plan_to_es[prev_key].append(key)\n\n        return plan_to_es, plans\n\n    def _parse_planner_evidences(\n        self, planner_response: str\n    ) -> tuple[dict[str, str], list[list[str]]]:\n        \"\"\"\n        Parse planner output. 
This should return a mapping from #E to tool call.\n        It should also identify the level of each #E in dependency map.\n        Example:\n            {\n            \"#E1\": \"Tool1\", \"#E2\": \"Tool2\",\n            \"#E3\": \"Tool3\", \"#E4\": \"Tool4\"\n            }, [[#E1, #E2], [#E3, #E4]]\n\n        Returns:\n            tuple[dict[str, str], List[List[str]]]:\n            A mapping from #E to tool call and a list of levels.\n        \"\"\"\n        evidences: dict[str, str] = dict()\n        dependence: dict[str, list[str]] = dict()\n        for line in planner_response.splitlines():\n            if line.startswith(\"#E\") and line[2].isdigit():\n                e, tool_call = line.split(\":\", 1)\n                e, tool_call = e.strip(), tool_call.strip()\n                if len(e) == 3:\n                    dependence[e] = []\n                    evidences[e] = tool_call\n                    for var in re.findall(r\"#E\\d+\", tool_call):\n                        if var in evidences:\n                            dependence[e].append(var)\n                else:\n                    evidences[e] = \"No evidence found\"\n        level = []\n        while dependence:\n            select = [i for i in dependence if not dependence[i]]\n            if len(select) == 0:\n                raise ValueError(\"Circular dependency detected.\")\n            level.append(select)\n            for item in select:\n                dependence.pop(item)\n            for item in dependence:\n                for i in select:\n                    if i in dependence[item]:\n                        dependence[item].remove(i)\n\n        return evidences, level\n\n    def _run_plugin(\n        self,\n        e: str,\n        planner_evidences: dict[str, str],\n        worker_evidences: dict[str, str],\n        output=BaseScratchPad(),\n    ):\n        \"\"\"\n        Run a plugin for a given evidence.\n        This function should also cumulate the cost and tokens.\n        \"\"\"\n        result = dict(e=e, plugin_cost=0, plugin_token=0, evidence=\"\")\n        tool_call = planner_evidences[e]\n        if \"[\" not in tool_call:\n            result[\"evidence\"] = tool_call\n        else:\n            tool, tool_input = tool_call.split(\"[\", 1)\n            tool_input = tool_input[:-1]\n            # find variables in input and replace with previous evidences\n            for var in re.findall(r\"#E\\d+\", tool_input):\n                print(\"Tool input: \", tool_input)\n                print(\"Var: \", var)\n                print(\"Worker evidences: \", worker_evidences)\n                if var in worker_evidences:\n                    tool_input = tool_input.replace(\n                        var, worker_evidences.get(var, \"\") or \"\"\n                    )\n            try:\n                selected_plugin = self._find_plugin(tool)\n                if selected_plugin is None:\n                    raise ValueError(\"Invalid plugin detected\")\n                tool_response = selected_plugin(tool_input)\n                result[\"evidence\"] = get_plugin_response_content(tool_response)\n            except ValueError:\n                result[\"evidence\"] = \"No evidence found.\"\n            finally:\n                output.panel_print(\n                    result[\"evidence\"], f\"[green] Function Response of [blue]{tool}: \"\n                )\n        return result\n\n    def _get_worker_evidence(\n        self,\n        planner_evidences: dict[str, str],\n        evidences_level: 
list[list[str]],\n        output=BaseScratchPad(),\n    ) -> Any:\n        \"\"\"\n        Parallel execution of plugins in DAG for speedup.\n        This is one of core benefits of ReWOO agents.\n\n        Args:\n            planner_evidences: A mapping from #E to tool call.\n            evidences_level: A list of levels of evidences.\n                Calculated from DAG of plugin calls.\n            output: Output object, defaults to BaseOutput().\n        Returns:\n            A mapping from #E to tool call.\n        \"\"\"\n        worker_evidences: dict[str, str] = dict()\n        plugin_cost, plugin_token = 0.0, 0.0\n        with ThreadPoolExecutor() as pool:\n            for level in evidences_level:\n                results = []\n                for e in level:\n                    results.append(\n                        pool.submit(\n                            self._run_plugin,\n                            e,\n                            planner_evidences,\n                            worker_evidences,\n                            output,\n                        )\n                    )\n                if len(results) > 1:\n                    output.update_status(f\"Running tasks {level} in parallel.\")\n                else:\n                    output.update_status(f\"Running task {level[0]}.\")\n                for r in results:\n                    resp = r.result()\n                    plugin_cost += resp[\"plugin_cost\"]\n                    plugin_token += resp[\"plugin_token\"]\n                    worker_evidences[resp[\"e\"]] = self._trim_evidence(resp[\"evidence\"])\n                output.done()\n\n        return worker_evidences, plugin_cost, plugin_token\n\n    def _find_plugin(self, name: str):\n        for p in self.plugins:\n            if p.name == name:\n                return p\n\n    def _trim_evidence(self, evidence: str):\n        evidence_trim_func = (\n            self.trim_func\n            if self.trim_func\n            else TokenSplitter(\n                chunk_size=self.max_context_length,\n                chunk_overlap=0,\n                separator=\" \",\n                tokenizer=partial(\n                    tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n                    allowed_special=set(),\n                    disallowed_special=\"all\",\n                ),\n            )\n        )\n        if evidence:\n            texts = evidence_trim_func([Document(text=evidence)])\n            evidence = texts[0].text\n            logging.info(f\"len (trimmed): {len(evidence)}\")\n            return evidence\n\n    @BaseAgent.safeguard_run\n    def run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n        \"\"\"\n        Run the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Running {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            for e in 
plan_to_es[plan]:\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n        # Solve\n        solver_output = self.solver(instruction, worker_log)\n        solver_output_text = solver_output.text\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline(context=worker_log, question=instruction)\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, \"worker_log\": worker_log},\n        )\n\n    def stream(self, instruction: str, use_citation: bool = False):\n        \"\"\"\n        Stream the agent with a given instruction.\n        \"\"\"\n        logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n        total_cost = 0.0\n        total_token = 0\n\n        # Plan\n        planner_output = self.planner(instruction)\n        planner_text_output = planner_output.text\n        plan_to_es, plans = self._parse_plan_map(planner_text_output)\n        planner_evidences, evidence_level = self._parse_planner_evidences(\n            planner_text_output\n        )\n\n        print(\"Planner output:\", planner_text_output)\n        # output planner to info panel\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"planner_log\": planner_text_output}],\n        )\n\n        # Work\n        worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n            planner_evidences, evidence_level\n        )\n        worker_log = \"\"\n        for plan in plan_to_es:\n            worker_log += f\"{plan}: {plans[plan]}\\n\"\n            current_progress = f\"{plan}: {plans[plan]}\\n\"\n            for e in plan_to_es[plan]:\n                worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n                current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n                current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n            yield AgentOutput(\n                text=\"\",\n                agent_type=self.agent_type,\n                status=\"thinking\",\n                intermediate_steps=[{\"worker_log\": current_progress}],\n            )\n\n        # Solve\n        solver_response = \"\"\n        for solver_output in self.solver.stream(instruction, worker_log):\n            solver_output_text = solver_output.text\n            solver_response += solver_output_text\n            yield AgentOutput(\n                text=solver_output_text,\n                agent_type=self.agent_type,\n                status=\"thinking\",\n            )\n        if use_citation:\n            citation_pipeline = CitationPipeline(llm=self.solver_llm)\n            citation = citation_pipeline.invoke(\n                context=worker_log, question=instruction\n            )\n        else:\n            citation = None\n\n        return AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"finished\",\n            total_tokens=total_token,\n            total_cost=total_cost,\n            citation=citation,\n            metadata={\"citation\": citation, 
\"worker_log\": worker_log},\n        )\n
    "},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.run","title":"run","text":"
    run(instruction, use_citation=False)\n

    Run the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    @BaseAgent.safeguard_run\ndef run(self, instruction: str, use_citation: bool = False) -> AgentOutput:\n    \"\"\"\n    Run the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Running {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n\n    # Solve\n    solver_output = self.solver(instruction, worker_log)\n    solver_output_text = solver_output.text\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline(context=worker_log, question=instruction)\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=solver_output_text,\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/rewoo/agent/#agents.rewoo.agent.RewooAgent.stream","title":"stream","text":"
    stream(instruction, use_citation=False)\n

    Stream the agent with a given instruction.

    Source code in libs/kotaemon/kotaemon/agents/rewoo/agent.py
    def stream(self, instruction: str, use_citation: bool = False):\n    \"\"\"\n    Stream the agent with a given instruction.\n    \"\"\"\n    logging.info(f\"Streaming {self.name} with instruction: {instruction}\")\n    total_cost = 0.0\n    total_token = 0\n\n    # Plan\n    planner_output = self.planner(instruction)\n    planner_text_output = planner_output.text\n    plan_to_es, plans = self._parse_plan_map(planner_text_output)\n    planner_evidences, evidence_level = self._parse_planner_evidences(\n        planner_text_output\n    )\n\n    print(\"Planner output:\", planner_text_output)\n    # output planner to info panel\n    yield AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"thinking\",\n        intermediate_steps=[{\"planner_log\": planner_text_output}],\n    )\n\n    # Work\n    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(\n        planner_evidences, evidence_level\n    )\n    worker_log = \"\"\n    for plan in plan_to_es:\n        worker_log += f\"{plan}: {plans[plan]}\\n\"\n        current_progress = f\"{plan}: {plans[plan]}\\n\"\n        for e in plan_to_es[plan]:\n            worker_log += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            worker_log += f\"{e}: {worker_evidences[e]}\\n\"\n            current_progress += f\"#Action: {planner_evidences.get(e, None)}\\n\"\n            current_progress += f\"{e}: {worker_evidences[e]}\\n\"\n\n        yield AgentOutput(\n            text=\"\",\n            agent_type=self.agent_type,\n            status=\"thinking\",\n            intermediate_steps=[{\"worker_log\": current_progress}],\n        )\n\n    # Solve\n    solver_response = \"\"\n    for solver_output in self.solver.stream(instruction, worker_log):\n        solver_output_text = solver_output.text\n        solver_response += solver_output_text\n        yield AgentOutput(\n            text=solver_output_text,\n            agent_type=self.agent_type,\n            status=\"thinking\",\n        )\n    if use_citation:\n        citation_pipeline = CitationPipeline(llm=self.solver_llm)\n        citation = citation_pipeline.invoke(\n            context=worker_log, question=instruction\n        )\n    else:\n        citation = None\n\n    return AgentOutput(\n        text=\"\",\n        agent_type=self.agent_type,\n        status=\"finished\",\n        total_tokens=total_token,\n        total_cost=total_cost,\n        citation=citation,\n        metadata={\"citation\": citation, \"worker_log\": worker_log},\n    )\n
    "},{"location":"reference/agents/rewoo/planner/","title":"Planner","text":""},{"location":"reference/agents/rewoo/planner/#agents.rewoo.planner.Planner","title":"Planner","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/rewoo/planner.py
    class Planner(BaseComponent):\n    model: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    examples: Optional[Union[str, List[str]]] = None\n    plugins: List[BaseTool]\n\n    def _compose_worker_description(self) -> str:\n        \"\"\"\n        Compose the worker prompt from the workers.\n\n        Example:\n        toolname1[input]: tool1 description\n        toolname2[input]: tool2 description\n        \"\"\"\n        prompt = \"\"\n        try:\n            for worker in self.plugins:\n                prompt += f\"{worker.name}[input]: {worker.description}\\n\"\n        except Exception:\n            raise ValueError(\"Worker must have a name and description.\")\n        return prompt\n\n    def _compose_fewshot_prompt(self) -> str:\n        if self.examples is None:\n            return \"\"\n        if isinstance(self.examples, str):\n            return self.examples\n        else:\n            return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n    def _compose_prompt(self, instruction) -> str:\n        \"\"\"\n        Compose the prompt from template, worker description, examples and instruction.\n        \"\"\"\n        worker_desctription = self._compose_worker_description()\n        fewshot = self._compose_fewshot_prompt()\n        if self.prompt_template is not None:\n            if \"fewshot\" in self.prompt_template.placeholders:\n                return self.prompt_template.populate(\n                    tool_description=worker_desctription,\n                    fewshot=fewshot,\n                    task=instruction,\n                )\n            else:\n                return self.prompt_template.populate(\n                    tool_description=worker_desctription, task=instruction\n                )\n        else:\n            if self.examples is not None:\n                return few_shot_planner_prompt.populate(\n                    tool_description=worker_desctription,\n                    fewshot=fewshot,\n                    task=instruction,\n                )\n            else:\n                return zero_shot_planner_prompt.populate(\n                    tool_description=worker_desctription, task=instruction\n                )\n\n    def run(self, instruction: str, output: BaseScratchPad = BaseScratchPad()) -> Any:\n        response = None\n        output.info(\"Running Planner\")\n        prompt = self._compose_prompt(instruction)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            response = self.model(prompt)\n            self.log_progress(\".planner\", response=response)\n            output.info(\"Planner run successful.\")\n        except ValueError as e:\n            output.error(\"Planner failed to retrieve response from LLM\")\n            raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n        return response\n\n    def stream(self, instruction: str, output: BaseScratchPad = BaseScratchPad()):\n        response = None\n        output.info(\"Running Planner\")\n        prompt = self._compose_prompt(instruction)\n        output.debug(f\"Prompt: {prompt}\")\n\n        response = \"\"\n        try:\n            for text in self.model.stream(prompt):\n                response += text\n                yield text\n            self.log_progress(\".planner\", response=response)\n            output.info(\"Planner run successful.\")\n        except NotImplementedError:\n            print(\"Streaming is not supported, falling back to normal run\")\n            response = 
self.model(prompt)\n            yield response\n        except ValueError as e:\n            output.error(\"Planner failed to retrieve response from LLM\")\n            raise ValueError(\"Planner failed to retrieve response from LLM\") from e\n\n        return response\n
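    The Planner is normally wired up by RewooAgent itself, but it can be exercised standalone. A sketch under the same illustrative assumptions as above (ChatOpenAI and WikipediaTool are stand-ins):

    planner = Planner(
        model=ChatOpenAI(model="gpt-4o-mini"),
        plugins=[WikipediaTool()],
    )
    plan = planner.run("In what year was the Eiffel Tower completed?")
    print(plan.text)    # expected to contain "#Plan1: ..." and "#E1: Tool[...]" lines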
    "},{"location":"reference/agents/rewoo/prompt/","title":"Prompt","text":""},{"location":"reference/agents/rewoo/solver/","title":"Solver","text":""},{"location":"reference/agents/rewoo/solver/#agents.rewoo.solver.Solver","title":"Solver","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/rewoo/solver.py
    class Solver(BaseComponent):\n    model: BaseLLM\n    prompt_template: Optional[PromptTemplate] = None\n    examples: Optional[Union[str, List[str]]] = None\n    output_lang: str = \"English\"\n\n    def _compose_fewshot_prompt(self) -> str:\n        if self.examples is None:\n            return \"\"\n        if isinstance(self.examples, str):\n            return self.examples\n        else:\n            return \"\\n\\n\".join([e.strip(\"\\n\") for e in self.examples])\n\n    def _compose_prompt(self, instruction, plan_evidence, output_lang) -> str:\n        \"\"\"\n        Compose the prompt from template, plan&evidence, examples and instruction.\n        \"\"\"\n        fewshot = self._compose_fewshot_prompt()\n        if self.prompt_template is not None:\n            if \"fewshot\" in self.prompt_template.placeholders:\n                return self.prompt_template.populate(\n                    plan_evidence=plan_evidence,\n                    fewshot=fewshot,\n                    task=instruction,\n                    lang=output_lang,\n                )\n            else:\n                return self.prompt_template.populate(\n                    plan_evidence=plan_evidence, task=instruction, lang=output_lang\n                )\n        else:\n            if self.examples is not None:\n                return few_shot_solver_prompt.populate(\n                    plan_evidence=plan_evidence,\n                    fewshot=fewshot,\n                    task=instruction,\n                    lang=output_lang,\n                )\n            else:\n                return zero_shot_solver_prompt.populate(\n                    plan_evidence=plan_evidence,\n                    task=instruction,\n                    lang=output_lang,\n                )\n\n    def run(\n        self,\n        instruction: str,\n        plan_evidence: str,\n        output: BaseScratchPad = BaseScratchPad(),\n    ) -> Any:\n        response = None\n        output.info(\"Running Solver\")\n        output.debug(f\"Instruction: {instruction}\")\n        output.debug(f\"Plan Evidence: {plan_evidence}\")\n        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            response = self.model(prompt)\n            output.info(\"Solver run successful.\")\n        except ValueError:\n            output.error(\"Solver failed to retrieve response from LLM\")\n\n        return response\n\n    def stream(\n        self,\n        instruction: str,\n        plan_evidence: str,\n        output: BaseScratchPad = BaseScratchPad(),\n    ) -> Any:\n        response = \"\"\n        output.info(\"Running Solver\")\n        output.debug(f\"Instruction: {instruction}\")\n        output.debug(f\"Plan Evidence: {plan_evidence}\")\n        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)\n        output.debug(f\"Prompt: {prompt}\")\n        try:\n            for text in self.model.stream(prompt):\n                response += text.text\n                yield text\n            output.info(\"Planner run successful.\")\n        except NotImplementedError:\n            response = self.model(prompt).text\n            output.info(\"Solver run successful.\")\n        except ValueError:\n            output.error(\"Solver failed to retrieve response from LLM\")\n\n        return response\n
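    A standalone sketch of the Solver with a hand-written plan/evidence log (illustrative names as above):

    solver = Solver(model=ChatOpenAI(model="gpt-4o-mini"), output_lang="English")
    answer = solver.run(
        instruction="In what year was the Eiffel Tower completed?",
        plan_evidence="#Plan1: Look up the Eiffel Tower\n#E1: Completed in 1889.",
    )
    print(answer.text)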
    "},{"location":"reference/agents/tools/","title":"Tools","text":""},{"location":"reference/agents/tools/#agents.tools.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
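    To illustrate the subclassing contract, here is a hypothetical minimal tool; only `_run_tool` must be overridden. The class below is an example, not part of kotaemon.

    # Hypothetical example of a custom tool.
    class EchoTool(BaseTool):
        name: str = "echo"
        description: str = "Returns its input unchanged. Input should be a string."

        def _run_tool(self, query: str) -> str:
            # String inputs arrive as a single positional argument
            # (see _to_args_and_kwargs above).
            return query

    print(EchoTool().run("hello"))  # -> hello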
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description; this text is included in the LLM prompt.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    How to handle the content of a raised ToolException.

    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to Langchain format to use with its agent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/tools/#agents.tools.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
    "},{"location":"reference/agents/tools/#agents.tools.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent so it can be used as a tool

    Parameters:

    - component: BaseComponent-based component to wrap (required)
    - postprocessor: optional postprocessor for the component output

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
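    As a sketch of the intended use (the names below are hypothetical; `retriever` stands in for any existing BaseComponent):

    # Hypothetical sketch: expose a pipeline component to an agent as a tool.
    retrieval_tool = ComponentTool(
        name="docsearch",
        description="Searches indexed documents. Input should be a query string.",
        component=retriever,
        # Optionally squash the component output into plain text for the agent.
        postprocessor=lambda docs: "\n".join(str(d) for d in docs),
    )
    evidence = retrieval_tool.run("how does kotaemon compose pipelines?")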
    "},{"location":"reference/agents/tools/#agents.tools.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
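    A short usage sketch (assumes `pip install wikipedia` and network access):

    wiki_tool = WikipediaTool()
    evidence = wiki_tool.run("Python (programming language)")
    # On success `evidence` is a Document with the page text and URL;
    # otherwise it is a string listing similar page titles.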
    "},{"location":"reference/agents/tools/base/","title":"Base","text":""},{"location":"reference/agents/tools/base/#agents.tools.base.ToolException","title":"ToolException","text":"

    Bases: Exception

    An optional exception that a tool throws when an execution error occurs.

    When this exception is thrown, the agent does not stop working; it handles the exception according to the tool's handle_tool_error setting, and the processing result is returned to the agent as an observation and printed in red on the console.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ToolException(Exception):\n    \"\"\"An optional exception that tool throws when execution error occurs.\n\n    When this exception is thrown, the agent will not stop working,\n    but will handle the exception according to the handle_tool_error\n    variable of the tool, and the processing result will be returned\n    to the agent as observation, and printed in red on the console.\n    \"\"\"\n
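    The sketch below shows how ToolException interacts with handle_tool_error; the tool itself is hypothetical, not part of the library.

    # Hypothetical tool that signals failure via ToolException.
    class StrictTool(BaseTool):
        name: str = "strict"
        description: str = "Uppercases its input; fails on empty input."
        handle_tool_error: bool = True  # convert errors into observations

        def _run_tool(self, query: str) -> str:
            if not query:
                raise ToolException("empty input")
            return query.upper()

    # With handle_tool_error=True the agent receives the error message as an
    # observation instead of the exception propagating:
    observation = StrictTool().run("")  # -> "empty input"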
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool","title":"BaseTool","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class BaseTool(BaseComponent):\n    name: str\n    \"\"\"The unique name of the tool that clearly communicates its purpose.\"\"\"\n    description: str\n    \"\"\"Description used to tell the model how/when/why to use the tool.\n    You can provide few-shot examples as a part of the description. This will be\n    input to the prompt of LLM.\n    \"\"\"\n    args_schema: Optional[Type[BaseModel]] = None\n    \"\"\"Pydantic model class to validate and parse the tool's input arguments.\"\"\"\n    verbose: bool = False\n    \"\"\"Whether to log the tool's progress.\"\"\"\n    handle_tool_error: Optional[\n        Union[bool, str, Callable[[ToolException], str]]\n    ] = False\n    \"\"\"Handle the content of the ToolException thrown.\"\"\"\n\n    def _parse_input(\n        self,\n        tool_input: Union[str, Dict],\n    ) -> Union[str, Dict[str, Any]]:\n        \"\"\"Convert tool input to pydantic model.\"\"\"\n        args_schema = self.args_schema\n        if isinstance(tool_input, str):\n            if args_schema is not None:\n                key_ = next(iter(args_schema.model_fields.keys()))\n                args_schema.validate({key_: tool_input})\n            return tool_input\n        else:\n            if args_schema is not None:\n                result = args_schema.parse_obj(tool_input)\n                return {k: v for k, v in result.dict().items() if k in tool_input}\n        return tool_input\n\n    def _run_tool(\n        self,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Call tool.\"\"\"\n        raise NotImplementedError(f\"_run_tool is not implemented for {self.name}\")\n\n    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:\n        # For backwards compatibility, if run_input is a string,\n        # pass as a positional argument.\n        if isinstance(tool_input, str):\n            return (tool_input,), {}\n        else:\n            return (), tool_input\n\n    def _handle_tool_error(self, e: ToolException) -> Any:\n        \"\"\"Handle the content of the ToolException thrown.\"\"\"\n        observation = None\n        if not self.handle_tool_error:\n            raise e\n        elif isinstance(self.handle_tool_error, bool):\n            if e.args:\n                observation = e.args[0]\n            else:\n                observation = \"Tool execution error\"\n        elif isinstance(self.handle_tool_error, str):\n            observation = self.handle_tool_error\n        elif callable(self.handle_tool_error):\n            observation = self.handle_tool_error(e)\n        else:\n            raise ValueError(\n                f\"Got unexpected type of `handle_tool_error`. Expected bool, str \"\n                f\"or callable. Received: {self.handle_tool_error}\"\n            )\n        return observation\n\n    def to_langchain_format(self) -> LCTool:\n        \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n        return LCTool(name=self.name, description=self.description, func=self.run)\n\n    def run(\n        self,\n        tool_input: Union[str, Dict],\n        verbose: Optional[bool] = None,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Run the tool.\"\"\"\n        parsed_input = self._parse_input(tool_input)\n        # TODO (verbose_): Add logging\n        try:\n            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n            call_kwargs = {**kwargs, **tool_kwargs}\n            observation = self._run_tool(*tool_args, **call_kwargs)\n        except ToolException as e:\n            observation = self._handle_tool_error(e)\n            return observation\n        else:\n            return observation\n\n    @classmethod\n    def from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n        \"\"\"Wrapper for Langchain Tool\"\"\"\n        new_tool = BaseTool(\n            name=langchain_tool.name, description=langchain_tool.description\n        )\n        new_tool._run_tool = langchain_tool._run  # type: ignore\n        return new_tool\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.name","title":"name instance-attribute","text":"
    name\n

    The unique name of the tool that clearly communicates its purpose.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.description","title":"description instance-attribute","text":"
    description\n

    Description used to tell the model how/when/why to use the tool. You can provide few-shot examples as part of the description; this text is included in the LLM prompt.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.args_schema","title":"args_schema class-attribute instance-attribute","text":"
    args_schema = None\n

    Pydantic model class to validate and parse the tool's input arguments.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.verbose","title":"verbose class-attribute instance-attribute","text":"
    verbose = False\n

    Whether to log the tool's progress.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.handle_tool_error","title":"handle_tool_error class-attribute instance-attribute","text":"
    handle_tool_error = False\n

    How to handle the content of a raised ToolException.

    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.to_langchain_format","title":"to_langchain_format","text":"
    to_langchain_format()\n

    Convert this tool to Langchain format to use with its agent

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def to_langchain_format(self) -> LCTool:\n    \"\"\"Convert this tool to Langchain format to use with its agent\"\"\"\n    return LCTool(name=self.name, description=self.description, func=self.run)\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.run","title":"run","text":"
    run(tool_input, verbose=None, **kwargs)\n

    Run the tool.

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    def run(\n    self,\n    tool_input: Union[str, Dict],\n    verbose: Optional[bool] = None,\n    **kwargs: Any,\n) -> Any:\n    \"\"\"Run the tool.\"\"\"\n    parsed_input = self._parse_input(tool_input)\n    # TODO (verbose_): Add logging\n    try:\n        tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)\n        call_kwargs = {**kwargs, **tool_kwargs}\n        observation = self._run_tool(*tool_args, **call_kwargs)\n    except ToolException as e:\n        observation = self._handle_tool_error(e)\n        return observation\n    else:\n        return observation\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.BaseTool.from_langchain_format","title":"from_langchain_format classmethod","text":"
    from_langchain_format(langchain_tool)\n

    Wrapper for Langchain Tool

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    @classmethod\ndef from_langchain_format(cls, langchain_tool: LCTool) -> \"BaseTool\":\n    \"\"\"Wrapper for Langchain Tool\"\"\"\n    new_tool = BaseTool(\n        name=langchain_tool.name, description=langchain_tool.description\n    )\n    new_tool._run_tool = langchain_tool._run  # type: ignore\n    return new_tool\n
    "},{"location":"reference/agents/tools/base/#agents.tools.base.ComponentTool","title":"ComponentTool","text":"

    Bases: BaseTool

    Wrapper around another BaseComponent so it can be used as a tool

    Parameters:

    - component: BaseComponent-based component to wrap (required)
    - postprocessor: optional postprocessor for the component output

    Source code in libs/kotaemon/kotaemon/agents/tools/base.py
    class ComponentTool(BaseTool):\n    \"\"\"Wrapper around other BaseComponent to use it as a tool\n\n    Args:\n        component: BaseComponent-based component to wrap\n        postprocessor: Optional postprocessor for the component output\n    \"\"\"\n\n    component: BaseComponent\n    postprocessor: Optional[Callable] = None\n\n    def _run_tool(self, *args: Any, **kwargs: Any) -> Any:\n        output = self.component(*args, **kwargs)\n        if self.postprocessor:\n            output = self.postprocessor(output)\n\n        return output\n
    "},{"location":"reference/agents/tools/google/","title":"Google","text":""},{"location":"reference/agents/tools/llm/","title":"Llm","text":""},{"location":"reference/agents/tools/wikipedia/","title":"Wikipedia","text":""},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki","title":"Wiki","text":"

    Wrapper around the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class Wiki:\n    \"\"\"Wrapper around wikipedia API.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"Check that wikipedia package is installed.\"\"\"\n        try:\n            import wikipedia  # noqa: F401\n        except ImportError:\n            raise ValueError(\n                \"Could not import wikipedia python package. \"\n                \"Please install it with `pip install wikipedia`.\"\n            )\n\n    def search(self, search: str) -> Union[str, Document]:\n        \"\"\"Try to search for wiki page.\n\n        If page exists, return the page summary, and a PageWithLookups object.\n        If page does not exist, return similar entries.\n        \"\"\"\n        import wikipedia\n\n        try:\n            page_content = wikipedia.page(search).content\n            url = wikipedia.page(search).url\n            result: Union[str, Document] = Document(\n                text=page_content, metadata={\"page\": url}\n            )\n        except wikipedia.PageError:\n            result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n        except wikipedia.DisambiguationError:\n            result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n        return result\n
    "},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.Wiki.search","title":"search","text":"
    search(search)\n

    Try to search for a wiki page.

    If the page exists, return a Document containing the page content and URL; if it does not exist (or is ambiguous), return a string listing similar entries.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    def search(self, search: str) -> Union[str, Document]:\n    \"\"\"Try to search for wiki page.\n\n    If page exists, return the page summary, and a PageWithLookups object.\n    If page does not exist, return similar entries.\n    \"\"\"\n    import wikipedia\n\n    try:\n        page_content = wikipedia.page(search).content\n        url = wikipedia.page(search).url\n        result: Union[str, Document] = Document(\n            text=page_content, metadata={\"page\": url}\n        )\n    except wikipedia.PageError:\n        result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n    except wikipedia.DisambiguationError:\n        result = f\"Could not find [{search}]. Similar: {wikipedia.search(search)}\"\n    return result\n
    "},{"location":"reference/agents/tools/wikipedia/#agents.tools.wikipedia.WikipediaTool","title":"WikipediaTool","text":"

    Bases: BaseTool

    Tool that adds the capability to query the Wikipedia API.

    Source code in libs/kotaemon/kotaemon/agents/tools/wikipedia.py
    class WikipediaTool(BaseTool):\n    \"\"\"Tool that adds the capability to query the Wikipedia API.\"\"\"\n\n    name: str = \"wikipedia\"\n    description: str = (\n        \"Search engine from Wikipedia, retrieving relevant wiki page. \"\n        \"Useful when you need to get holistic knowledge about people, \"\n        \"places, companies, historical events, or other subjects. \"\n        \"Input should be a search query.\"\n    )\n    args_schema: Optional[Type[BaseModel]] = WikipediaArgs\n    doc_store: Any = None\n\n    def _run_tool(self, query: AnyStr) -> AnyStr:\n        if not self.doc_store:\n            self.doc_store = Wiki()\n        tool = self.doc_store\n        evidence = tool.search(query)\n        return evidence\n
    "},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#base.BaseComponent","title":"BaseComponent","text":"

    Bases: Function

    A component is a class that can be used to compose a pipeline.

    Benefits of component:

    - Auto caching, logging
    - Allow deployment

    For each component, the spirit is:

    - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
    - Enforce single output type. Hence, the output type of a component should be as generic as possible.

    Source code in libs/kotaemon/kotaemon/base/component.py
    class BaseComponent(Function):\n    \"\"\"A component is a class that can be used to compose a pipeline.\n\n    !!! tip \"Benefits of component\"\n        - Auto caching, logging\n        - Allow deployment\n\n    !!! tip \"For each component, the spirit is\"\n        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n        - Enforce single output type. Hence, the output type of a component should be\n    as generic as possible.\n    \"\"\"\n\n    inflow = None\n\n    def flow(self):\n        if self.inflow is None:\n            raise ValueError(\"No inflow provided.\")\n\n        if not isinstance(self.inflow, BaseComponent):\n            raise ValueError(\n                f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n            )\n\n        return self.__call__(self.inflow.flow())\n\n    def set_output_queue(self, queue):\n        self._queue = queue\n        for name in self._ff_nodes:\n            node = getattr(self, name)\n            if isinstance(node, BaseComponent):\n                node.set_output_queue(queue)\n\n    def report_output(self, output: Optional[Document]):\n        if self._queue is not None:\n            self._queue.put_nowait(output)\n\n    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n        ...\n\n    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n        ...\n\n    @abstractmethod\n    def run(\n        self, *args, **kwargs\n    ) -> Document | list[Document] | Iterator[Document] | None | Any:\n        \"\"\"Run the component.\"\"\"\n        ...\n
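    For a concrete feel, a minimal hypothetical component only needs to implement run():

    # Hypothetical sketch: the smallest useful component.
    class UpperCase(BaseComponent):
        def run(self, text: str) -> Document:
            return Document(text.upper())

    step = UpperCase()
    print(step.run("hello"))  # -> HELLO (Document.__str__ returns the content)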
    "},{"location":"reference/base/#base.BaseComponent.run","title":"run abstractmethod","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/base/component.py
    @abstractmethod\ndef run(\n    self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n    \"\"\"Run the component.\"\"\"\n    ...\n
    "},{"location":"reference/base/#base.Document","title":"Document","text":"

    Bases: Document

    Base document class, mostly inherited from the Document class of llama-index.

    This class accepts one positional argument content of an arbitrary type, which will store the raw content of the document. If specified, the class will use content to initialize the base llama_index class.

    Attributes:

    - content (Any): raw content of the document, can be anything
    - source (Optional[str]): id of the source of the Document. Optional.
    - channel (Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]): the channel to show the document. Optional.
        - chat: show in chat message
        - info: show in information panel
        - index: show in index panel
        - debug: show in debug panel

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class Document(BaseDocument):\n    \"\"\"\n    Base document class, mostly inherited from Document class from llama-index.\n\n    This class accept one positional argument `content` of an arbitrary type, which will\n        store the raw content of the document. If specified, the class will use\n        `content` to initialize the base llama_index class.\n\n    Attributes:\n        content: raw content of the document, can be anything\n        source: id of the source of the Document. Optional.\n        channel: the channel to show the document. Optional.:\n            - chat: show in chat message\n            - info: show in information panel\n            - index: show in index panel\n            - debug: show in debug panel\n    \"\"\"\n\n    content: Any = None\n    source: Optional[str] = None\n    channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n    def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n        if content is None:\n            if kwargs.get(\"text\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"text\"]\n            elif kwargs.get(\"embedding\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"embedding\"]\n                # default text indicating this document only contains embedding\n                kwargs[\"text\"] = \"<EMBEDDING>\"\n        elif isinstance(content, Document):\n            # TODO: simplify the Document class\n            temp_ = content.dict()\n            temp_.update(kwargs)\n            kwargs = temp_\n        else:\n            kwargs[\"content\"] = content\n            if content:\n                kwargs[\"text\"] = str(content)\n            else:\n                kwargs[\"text\"] = \"\"\n        super().__init__(*args, **kwargs)\n\n    def __bool__(self):\n        return bool(self.content)\n\n    @classmethod\n    def example(cls) -> \"Document\":\n        document = Document(\n            text=SAMPLE_TEXT,\n            metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n        )\n        return document\n\n    def to_haystack_format(self) -> \"HaystackDocument\":\n        \"\"\"Convert struct to Haystack document format.\"\"\"\n        from haystack.schema import Document as HaystackDocument\n\n        metadata = self.metadata or {}\n        text = self.text\n        return HaystackDocument(content=text, meta=metadata)\n\n    def __str__(self):\n        return str(self.content)\n
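    A brief construction sketch (illustrative only, not part of the source):

    # Positional content populates both `content` and the llama-index `text` field.
    doc = Document("hello world")
    assert doc.text == "hello world" and bool(doc)

    # Passing text= only also works; `content` is then copied from it.
    doc2 = Document(text="only text", metadata={"filename": "README.md"})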
    "},{"location":"reference/base/#base.Document.to_haystack_format","title":"to_haystack_format","text":"
    to_haystack_format()\n

    Convert struct to Haystack document format.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    def to_haystack_format(self) -> \"HaystackDocument\":\n    \"\"\"Convert struct to Haystack document format.\"\"\"\n    from haystack.schema import Document as HaystackDocument\n\n    metadata = self.metadata or {}\n    text = self.text\n    return HaystackDocument(content=text, meta=metadata)\n
    "},{"location":"reference/base/#base.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":"

    Bases: Document

    Subclass of Document which must contain an embedding

    Use this if you want to enforce that a component's inputs/outputs contain an embedding.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contains embedding\n\n    Use this if you want to enforce component's IOs to must contain embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
    "},{"location":"reference/base/#base.ExtractorOutput","title":"ExtractorOutput","text":"

    Bases: Document

    Represents the output of an extractor.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class ExtractorOutput(Document):\n    \"\"\"\n    Represents the output of an extractor.\n    \"\"\"\n\n    matches: list[str]\n
    "},{"location":"reference/base/#base.RetrievedDocument","title":"RetrievedDocument","text":"

    Bases: Document

    Subclass of Document with retrieval-related information

    Attributes:

    - score (float): score of the document (from 0.0 to 1.0)
    - retrieval_metadata (dict): metadata from the retrieval process, which different components in a retrieval pipeline can use to communicate with each other

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class RetrievedDocument(Document):\n    \"\"\"Subclass of Document with retrieval-related information\n\n    Attributes:\n        score (float): score of the document (from 0.0 to 1.0)\n        retrieval_metadata (dict): metadata from the retrieval process, can be used\n            by different components in a retrieved pipeline to communicate with each\n            other\n    \"\"\"\n\n    score: float = Field(default=0.0)\n    retrieval_metadata: dict = Field(default={})\n
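    A short sketch of the schema subclasses above (illustrative only):

    # An embedding-only document gets the "<EMBEDDING>" placeholder text.
    emb = DocumentWithEmbedding(embedding=[0.1, 0.2, 0.3])
    assert emb.text == "<EMBEDDING>"

    # Retrieval results carry a score usable for ranking.
    hit = RetrievedDocument(text="relevant passage", score=0.87)
    top = sorted([hit], key=lambda d: d.score, reverse=True)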
    "},{"location":"reference/base/component/","title":"Component","text":""},{"location":"reference/base/component/#base.component.BaseComponent","title":"BaseComponent","text":"

    Bases: Function

    A component is a class that can be used to compose a pipeline.

    Benefits of component:

    - Auto caching, logging
    - Allow deployment

    For each component, the spirit is:

    - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
    - Enforce single output type. Hence, the output type of a component should be as generic as possible.

    Source code in libs/kotaemon/kotaemon/base/component.py
    class BaseComponent(Function):\n    \"\"\"A component is a class that can be used to compose a pipeline.\n\n    !!! tip \"Benefits of component\"\n        - Auto caching, logging\n        - Allow deployment\n\n    !!! tip \"For each component, the spirit is\"\n        - Tolerate multiple input types, e.g. str, Document, List[str], List[Document]\n        - Enforce single output type. Hence, the output type of a component should be\n    as generic as possible.\n    \"\"\"\n\n    inflow = None\n\n    def flow(self):\n        if self.inflow is None:\n            raise ValueError(\"No inflow provided.\")\n\n        if not isinstance(self.inflow, BaseComponent):\n            raise ValueError(\n                f\"inflow must be a BaseComponent, found {type(self.inflow)}\"\n            )\n\n        return self.__call__(self.inflow.flow())\n\n    def set_output_queue(self, queue):\n        self._queue = queue\n        for name in self._ff_nodes:\n            node = getattr(self, name)\n            if isinstance(node, BaseComponent):\n                node.set_output_queue(queue)\n\n    def report_output(self, output: Optional[Document]):\n        if self._queue is not None:\n            self._queue.put_nowait(output)\n\n    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:\n        ...\n\n    def stream(self, *args, **kwargs) -> Iterator[Document] | None:\n        ...\n\n    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:\n        ...\n\n    @abstractmethod\n    def run(\n        self, *args, **kwargs\n    ) -> Document | list[Document] | Iterator[Document] | None | Any:\n        \"\"\"Run the component.\"\"\"\n        ...\n
    "},{"location":"reference/base/component/#base.component.BaseComponent.run","title":"run abstractmethod","text":"
    run(*args, **kwargs)\n

    Run the component.

    Source code in libs/kotaemon/kotaemon/base/component.py
    @abstractmethod\ndef run(\n    self, *args, **kwargs\n) -> Document | list[Document] | Iterator[Document] | None | Any:\n    \"\"\"Run the component.\"\"\"\n    ...\n
    "},{"location":"reference/base/schema/","title":"Schema","text":""},{"location":"reference/base/schema/#base.schema.Document","title":"Document","text":"

    Bases: Document

    Base document class, mostly inherited from the Document class of llama-index.

    This class accepts one positional argument content of an arbitrary type, which will store the raw content of the document. If specified, the class will use content to initialize the base llama_index class.

    Attributes:

    - content (Any): raw content of the document, can be anything
    - source (Optional[str]): id of the source of the Document. Optional.
    - channel (Optional[Literal['chat', 'info', 'index', 'debug', 'plot']]): the channel to show the document. Optional.
        - chat: show in chat message
        - info: show in information panel
        - index: show in index panel
        - debug: show in debug panel

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class Document(BaseDocument):\n    \"\"\"\n    Base document class, mostly inherited from Document class from llama-index.\n\n    This class accept one positional argument `content` of an arbitrary type, which will\n        store the raw content of the document. If specified, the class will use\n        `content` to initialize the base llama_index class.\n\n    Attributes:\n        content: raw content of the document, can be anything\n        source: id of the source of the Document. Optional.\n        channel: the channel to show the document. Optional.:\n            - chat: show in chat message\n            - info: show in information panel\n            - index: show in index panel\n            - debug: show in debug panel\n    \"\"\"\n\n    content: Any = None\n    source: Optional[str] = None\n    channel: Optional[Literal[\"chat\", \"info\", \"index\", \"debug\", \"plot\"]] = None\n\n    def __init__(self, content: Optional[Any] = None, *args, **kwargs):\n        if content is None:\n            if kwargs.get(\"text\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"text\"]\n            elif kwargs.get(\"embedding\", None) is not None:\n                kwargs[\"content\"] = kwargs[\"embedding\"]\n                # default text indicating this document only contains embedding\n                kwargs[\"text\"] = \"<EMBEDDING>\"\n        elif isinstance(content, Document):\n            # TODO: simplify the Document class\n            temp_ = content.dict()\n            temp_.update(kwargs)\n            kwargs = temp_\n        else:\n            kwargs[\"content\"] = content\n            if content:\n                kwargs[\"text\"] = str(content)\n            else:\n                kwargs[\"text\"] = \"\"\n        super().__init__(*args, **kwargs)\n\n    def __bool__(self):\n        return bool(self.content)\n\n    @classmethod\n    def example(cls) -> \"Document\":\n        document = Document(\n            text=SAMPLE_TEXT,\n            metadata={\"filename\": \"README.md\", \"category\": \"codebase\"},\n        )\n        return document\n\n    def to_haystack_format(self) -> \"HaystackDocument\":\n        \"\"\"Convert struct to Haystack document format.\"\"\"\n        from haystack.schema import Document as HaystackDocument\n\n        metadata = self.metadata or {}\n        text = self.text\n        return HaystackDocument(content=text, meta=metadata)\n\n    def __str__(self):\n        return str(self.content)\n
    "},{"location":"reference/base/schema/#base.schema.Document.to_haystack_format","title":"to_haystack_format","text":"
    to_haystack_format()\n

    Convert struct to Haystack document format.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    def to_haystack_format(self) -> \"HaystackDocument\":\n    \"\"\"Convert struct to Haystack document format.\"\"\"\n    from haystack.schema import Document as HaystackDocument\n\n    metadata = self.metadata or {}\n    text = self.text\n    return HaystackDocument(content=text, meta=metadata)\n
    "},{"location":"reference/base/schema/#base.schema.DocumentWithEmbedding","title":"DocumentWithEmbedding","text":"

    Bases: Document

    Subclass of Document which must contain an embedding

    Use this if you want to enforce that a component's inputs/outputs contain an embedding.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class DocumentWithEmbedding(Document):\n    \"\"\"Subclass of Document which must contains embedding\n\n    Use this if you want to enforce component's IOs to must contain embedding.\n    \"\"\"\n\n    def __init__(self, embedding: list[float], *args, **kwargs):\n        kwargs[\"embedding\"] = embedding\n        super().__init__(*args, **kwargs)\n
    "},{"location":"reference/base/schema/#base.schema.RetrievedDocument","title":"RetrievedDocument","text":"

    Bases: Document

    Subclass of Document with retrieval-related information

    Attributes:

    - score (float): score of the document (from 0.0 to 1.0)
    - retrieval_metadata (dict): metadata from the retrieval process, which different components in a retrieval pipeline can use to communicate with each other

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class RetrievedDocument(Document):\n    \"\"\"Subclass of Document with retrieval-related information\n\n    Attributes:\n        score (float): score of the document (from 0.0 to 1.0)\n        retrieval_metadata (dict): metadata from the retrieval process, can be used\n            by different components in a retrieved pipeline to communicate with each\n            other\n    \"\"\"\n\n    score: float = Field(default=0.0)\n    retrieval_metadata: dict = Field(default={})\n
    "},{"location":"reference/base/schema/#base.schema.ExtractorOutput","title":"ExtractorOutput","text":"

    Bases: Document

    Represents the output of an extractor.

    Source code in libs/kotaemon/kotaemon/base/schema.py
    class ExtractorOutput(Document):\n    \"\"\"\n    Represents the output of an extractor.\n    \"\"\"\n\n    matches: list[str]\n
    "},{"location":"reference/chatbot/","title":"Chatbot","text":""},{"location":"reference/chatbot/#chatbot.ChatConversation","title":"ChatConversation","text":"

    Bases: SessionFunction

    Base implementation of a chat bot component

    A chatbot component should:

    - handle internal state, including history messages
    - return output for a given input

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    class ChatConversation(SessionFunction):\n    \"\"\"Base implementation of a chat bot component\n\n    A chatbot component should:\n        - handle internal state, including history messages\n        - return output for a given input\n    \"\"\"\n\n    class Config:\n        store_result = session_chat_storage\n\n    system_message: str = \"\"\n    bot: BaseChatBot\n\n    def __init__(self, *args, **kwargs):\n        self._history: List[BaseMessage] = []\n        self._store_result = (\n            f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n        )\n        super().__init__(*args, **kwargs)\n\n    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n        \"\"\"Chat, given a message, return a response\n\n        Args:\n            message: The message to respond to\n\n        Returns:\n            The response to the message. If None, no response is sent.\n        \"\"\"\n        user_message = (\n            HumanMessage(content=message) if isinstance(message, str) else message\n        )\n        self.history.append(user_message)\n\n        output = self.bot(self.history).text\n        output_message = None\n        if output is not None:\n            output_message = AIMessage(content=output)\n            self.history.append(output_message)\n\n        return output_message\n\n    def start_session(self):\n        self._store_result = self.bot.config.store_result\n        super().start_session()\n        if not self.history and self.system_message:\n            system_message = SystemMessage(content=self.system_message)\n            self.history.append(system_message)\n\n    def end_session(self):\n        super().end_session()\n        self._history = []\n\n    def check_end(\n        self,\n        history: Optional[List[BaseMessage]] = None,\n        user_message: Optional[HumanMessage] = None,\n        bot_message: Optional[AIMessage] = None,\n    ) -> bool:\n        \"\"\"Check if a conversation should end\"\"\"\n        if user_message is not None and user_message.content == \"\":\n            return True\n\n        return False\n\n    def terminal_session(self):\n        \"\"\"Create a terminal session\"\"\"\n        self.start_session()\n        print(\">> Start chat:\")\n\n        while True:\n            human = HumanMessage(content=input(\"Human: \"))\n            if self.check_end(history=self.history, user_message=human):\n                break\n\n            output = self(human)\n            if output is None:\n                print(\"AI: <No response>\")\n            else:\n                print(\"AI:\", output.content)\n\n            if self.check_end(history=self.history, bot_message=output):\n                break\n\n        self.end_session()\n\n    @property\n    def history(self):\n        return self._history\n\n    @history.setter\n    def history(self, value):\n        self._history = value\n        self._variablex()\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.run","title":"run","text":"
    run(message)\n

    Chat, given a message, return a response

    Parameters:

    - message (HumanMessage): The message to respond to (required)

    Returns:

    - Optional[BaseMessage]: The response to the message. If None, no response is sent.

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n    \"\"\"Chat, given a message, return a response\n\n    Args:\n        message: The message to respond to\n\n    Returns:\n        The response to the message. If None, no response is sent.\n    \"\"\"\n    user_message = (\n        HumanMessage(content=message) if isinstance(message, str) else message\n    )\n    self.history.append(user_message)\n\n    output = self.bot(self.history).text\n    output_message = None\n    if output is not None:\n        output_message = AIMessage(content=output)\n        self.history.append(output_message)\n\n    return output_message\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.check_end","title":"check_end","text":"
    check_end(\n    history=None, user_message=None, bot_message=None\n)\n

    Check if a conversation should end

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def check_end(\n    self,\n    history: Optional[List[BaseMessage]] = None,\n    user_message: Optional[HumanMessage] = None,\n    bot_message: Optional[AIMessage] = None,\n) -> bool:\n    \"\"\"Check if a conversation should end\"\"\"\n    if user_message is not None and user_message.content == \"\":\n        return True\n\n    return False\n
    "},{"location":"reference/chatbot/#chatbot.ChatConversation.terminal_session","title":"terminal_session","text":"
    terminal_session()\n

    Create a terminal session

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def terminal_session(self):\n    \"\"\"Create a terminal session\"\"\"\n    self.start_session()\n    print(\">> Start chat:\")\n\n    while True:\n        human = HumanMessage(content=input(\"Human: \"))\n        if self.check_end(history=self.history, user_message=human):\n            break\n\n        output = self(human)\n        if output is None:\n            print(\"AI: <No response>\")\n        else:\n            print(\"AI:\", output.content)\n\n        if self.check_end(history=self.history, bot_message=output):\n            break\n\n    self.end_session()\n
    "},{"location":"reference/chatbot/#chatbot.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":"

    Bases: BaseChatBot

    Simple text respondent chatbot that essentially wraps around a chat LLM

    Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
    class SimpleRespondentChatbot(BaseChatBot):\n    \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n    llm: ChatLLM\n\n    def _get_message(self) -> str:\n        return self.llm(self.history).text\n
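    Putting the two classes together, a hypothetical terminal chat could look like the sketch below; `chat_llm` stands in for any configured ChatLLM instance.

    # Hypothetical wiring of a chat LLM into a terminal conversation.
    bot = SimpleRespondentChatbot(llm=chat_llm)
    conversation = ChatConversation(bot=bot, system_message="You are concise.")
    conversation.terminal_session()  # an empty user message ends the chat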
    "},{"location":"reference/chatbot/base/","title":"Base","text":""},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation","title":"ChatConversation","text":"

    Bases: SessionFunction

    Base implementation of a chat bot component

    A chatbot component should:

    - handle internal state, including history messages
    - return output for a given input

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    class ChatConversation(SessionFunction):\n    \"\"\"Base implementation of a chat bot component\n\n    A chatbot component should:\n        - handle internal state, including history messages\n        - return output for a given input\n    \"\"\"\n\n    class Config:\n        store_result = session_chat_storage\n\n    system_message: str = \"\"\n    bot: BaseChatBot\n\n    def __init__(self, *args, **kwargs):\n        self._history: List[BaseMessage] = []\n        self._store_result = (\n            f\"{self.__module__}.{self.__class__.__name__},uninitiated_bot\"\n        )\n        super().__init__(*args, **kwargs)\n\n    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n        \"\"\"Chat, given a message, return a response\n\n        Args:\n            message: The message to respond to\n\n        Returns:\n            The response to the message. If None, no response is sent.\n        \"\"\"\n        user_message = (\n            HumanMessage(content=message) if isinstance(message, str) else message\n        )\n        self.history.append(user_message)\n\n        output = self.bot(self.history).text\n        output_message = None\n        if output is not None:\n            output_message = AIMessage(content=output)\n            self.history.append(output_message)\n\n        return output_message\n\n    def start_session(self):\n        self._store_result = self.bot.config.store_result\n        super().start_session()\n        if not self.history and self.system_message:\n            system_message = SystemMessage(content=self.system_message)\n            self.history.append(system_message)\n\n    def end_session(self):\n        super().end_session()\n        self._history = []\n\n    def check_end(\n        self,\n        history: Optional[List[BaseMessage]] = None,\n        user_message: Optional[HumanMessage] = None,\n        bot_message: Optional[AIMessage] = None,\n    ) -> bool:\n        \"\"\"Check if a conversation should end\"\"\"\n        if user_message is not None and user_message.content == \"\":\n            return True\n\n        return False\n\n    def terminal_session(self):\n        \"\"\"Create a terminal session\"\"\"\n        self.start_session()\n        print(\">> Start chat:\")\n\n        while True:\n            human = HumanMessage(content=input(\"Human: \"))\n            if self.check_end(history=self.history, user_message=human):\n                break\n\n            output = self(human)\n            if output is None:\n                print(\"AI: <No response>\")\n            else:\n                print(\"AI:\", output.content)\n\n            if self.check_end(history=self.history, bot_message=output):\n                break\n\n        self.end_session()\n\n    @property\n    def history(self):\n        return self._history\n\n    @history.setter\n    def history(self, value):\n        self._history = value\n        self._variablex()\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.run","title":"run","text":"
    run(message)\n

    Chat, given a message, return a response

    Parameters:

    - message (HumanMessage): The message to respond to (required)

    Returns:

    - Optional[BaseMessage]: The response to the message. If None, no response is sent.

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def run(self, message: HumanMessage) -> Optional[BaseMessage]:\n    \"\"\"Chat, given a message, return a response\n\n    Args:\n        message: The message to respond to\n\n    Returns:\n        The response to the message. If None, no response is sent.\n    \"\"\"\n    user_message = (\n        HumanMessage(content=message) if isinstance(message, str) else message\n    )\n    self.history.append(user_message)\n\n    output = self.bot(self.history).text\n    output_message = None\n    if output is not None:\n        output_message = AIMessage(content=output)\n        self.history.append(output_message)\n\n    return output_message\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.check_end","title":"check_end","text":"
    check_end(\n    history=None, user_message=None, bot_message=None\n)\n

    Check if a conversation should end

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def check_end(\n    self,\n    history: Optional[List[BaseMessage]] = None,\n    user_message: Optional[HumanMessage] = None,\n    bot_message: Optional[AIMessage] = None,\n) -> bool:\n    \"\"\"Check if a conversation should end\"\"\"\n    if user_message is not None and user_message.content == \"\":\n        return True\n\n    return False\n
    "},{"location":"reference/chatbot/base/#chatbot.base.ChatConversation.terminal_session","title":"terminal_session","text":"
    terminal_session()\n

    Create a terminal session

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def terminal_session(self):\n    \"\"\"Create a terminal session\"\"\"\n    self.start_session()\n    print(\">> Start chat:\")\n\n    while True:\n        human = HumanMessage(content=input(\"Human: \"))\n        if self.check_end(history=self.history, user_message=human):\n            break\n\n        output = self(human)\n        if output is None:\n            print(\"AI: <No response>\")\n        else:\n            print(\"AI:\", output.content)\n\n        if self.check_end(history=self.history, bot_message=output):\n            break\n\n    self.end_session()\n
    "},{"location":"reference/chatbot/base/#chatbot.base.session_chat_storage","title":"session_chat_storage","text":"
    session_chat_storage(obj)\n

    Store using the bot location rather than the session location

    Source code in libs/kotaemon/kotaemon/chatbot/base.py
    def session_chat_storage(obj):\n    \"\"\"Store using the bot location rather than the session location\"\"\"\n    return obj._store_result\n
    "},{"location":"reference/chatbot/simple_respondent/","title":"Simple Respondent","text":""},{"location":"reference/chatbot/simple_respondent/#chatbot.simple_respondent.SimpleRespondentChatbot","title":"SimpleRespondentChatbot","text":"

    Bases: BaseChatBot

    Simple text respondent chatbot that essentially wraps around a chat LLM

    Source code in libs/kotaemon/kotaemon/chatbot/simple_respondent.py
    class SimpleRespondentChatbot(BaseChatBot):\n    \"\"\"Simple text respondent chatbot that essentially wraps around a chat LLM\"\"\"\n\n    llm: ChatLLM\n\n    def _get_message(self) -> str:\n        return self.llm(self.history).text\n
    "},{"location":"reference/embeddings/","title":"Embeddings","text":""},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings","title":"EndpointEmbeddings","text":"

    Bases: BaseEmbeddings

    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:

    - endpoint_url (str): The url of an OpenAI API compatible endpoint.

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    class EndpointEmbeddings(BaseEmbeddings):\n    \"\"\"\n    An Embeddings component that uses an OpenAI API compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of an OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"\n        Generate embeddings from text Args:\n            text (str | list[str] | Document | list[Document]): text to generate\n            embeddings from\n        Returns:\n            list[DocumentWithEmbedding]: embeddings\n        \"\"\"\n        if not isinstance(text, list):\n            text = [text]\n\n        outputs = []\n\n        for item in text:\n            response = requests.post(\n                self.endpoint_url, json={\"input\": str(item)}\n            ).json()\n            outputs.append(\n                DocumentWithEmbedding(\n                    text=str(item),\n                    embedding=response[\"data\"][0][\"embedding\"],\n                    total_tokens=response[\"usage\"][\"total_tokens\"],\n                    prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n                )\n            )\n\n        return outputs\n
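    A usage sketch; the URL below is a placeholder for your own OpenAI-compatible endpoint.

    embedder = EndpointEmbeddings(endpoint_url="http://localhost:8000/v1/embeddings")
    docs = embedder.run(["first chunk", "second chunk"])
    vectors = [d.embedding for d in docs]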
    "},{"location":"reference/embeddings/#embeddings.EndpointEmbeddings.run","title":"run","text":"
    run(text)\n
    Generate embeddings from text.

    Parameters:

    - text (str | list[str] | Document | list[Document]): text to generate embeddings from

    Returns:

    - list[DocumentWithEmbedding]: embeddings

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n    \"\"\"\n    Generate embeddings from text Args:\n        text (str | list[str] | Document | list[Document]): text to generate\n        embeddings from\n    Returns:\n        list[DocumentWithEmbedding]: embeddings\n    \"\"\"\n    if not isinstance(text, list):\n        text = [text]\n\n    outputs = []\n\n    for item in text:\n        response = requests.post(\n            self.endpoint_url, json={\"input\": str(item)}\n        ).json()\n        outputs.append(\n            DocumentWithEmbedding(\n                text=str(item),\n                embedding=response[\"data\"][0][\"embedding\"],\n                total_tokens=response[\"usage\"][\"total_tokens\"],\n                prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n            )\n        )\n\n    return outputs\n
    "},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":"

    Bases: BaseEmbeddings

    Utilizes the fastembed library to compute embeddings locally, without a GPU.

    Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/ Code: https://github.com/qdrant/fastembed

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    class FastEmbedEmbeddings(BaseEmbeddings):\n    \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n    Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n    Code: https://github.com/qdrant/fastembed\n    \"\"\"\n\n    model_name: str = Param(\n        \"BAAI/bge-small-en-v1.5\",\n        help=(\n            \"Model name for fastembed. Please refer \"\n            \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n            \"for the list of supported models.\"\n        ),\n        required=True,\n    )\n    batch_size: int = Param(\n        256,\n        help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n    )\n    parallel: Optional[int] = Param(\n        None,\n        help=(\n            \"Number of threads to use for embeddings. \"\n            \"If > 1, data-parallel encoding will be used. \"\n            \"If 0, use all available CPUs. \"\n            \"If None, use default onnxruntime threading. \"\n            \"Defaults to None.\"\n        ),\n    )\n\n    @Param.auto()\n    def client_(self) -> \"TextEmbedding\":\n        try:\n            from fastembed import TextEmbedding\n        except ImportError:\n            raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n        return TextEmbedding(model_name=self.model_name)\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        embeddings = self.client_.embed(\n            [_.content for _ in input_],\n            batch_size=self.batch_size,\n            parallel=self.parallel,\n        )\n        return [\n            DocumentWithEmbedding(\n                content=doc,\n                embedding=list(embedding),\n            )\n            for doc, embedding in zip(input_, embeddings)\n        ]\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"Fastembed does not support async API.\"\"\"\n        return self.invoke(text, *args, **kwargs)\n
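    A usage sketch (requires `pip install fastembed`; the model weights are downloaded on first use).

    embedder = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5", batch_size=64)
    out = embedder.invoke(["hello", "world"])
    print(len(out), len(out[0].embedding))  # 2 vectors of the model's dimension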
    "},{"location":"reference/embeddings/#embeddings.FastEmbedEmbeddings.ainvoke","title":"ainvoke async","text":"
    ainvoke(text, *args, **kwargs)\n

    Fastembed does not support async API.

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    async def ainvoke(\n    self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n    \"\"\"Fastembed does not support async API.\"\"\"\n    return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/#embeddings.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        api_version: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment=deployment,\n            api_version=api_version,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import AzureOpenAIEmbeddings\n\n        return AzureOpenAIEmbeddings\n
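
    Example (a minimal sketch; the endpoint, deployment name, key, and API version are placeholders for your own Azure resource):

    from kotaemon.embeddings import LCAzureOpenAIEmbeddings

    embedder = LCAzureOpenAIEmbeddings(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        deployment="<your-embedding-deployment>",                    # placeholder
        openai_api_key="<azure-openai-key>",                         # placeholder
        api_version="2024-02-01",                                    # placeholder
    )
    docs = embedder("hello world")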
    "},{"location":"reference/embeddings/#embeddings.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's Cohere embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n    cohere_api_key: str = Param(\n        help=\"API key (https://dashboard.cohere.com/api-keys)\",\n        default=None,\n        required=True,\n    )\n    model: str = Param(\n        help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n        default=None,\n        required=True,\n    )\n    user_agent: str = Param(\n        help=\"User agent (leave default)\", default=\"default\", required=True\n    )\n\n    def __init__(\n        self,\n        model: str = \"embed-english-v2.0\",\n        cohere_api_key: Optional[str] = None,\n        truncate: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            cohere_api_key=cohere_api_key,\n            truncate=truncate,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_cohere import CohereEmbeddings\n        except ImportError:\n            from langchain.embeddings import CohereEmbeddings\n\n        return CohereEmbeddings\n
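
    Example (a minimal sketch; the API key is a placeholder):

    from kotaemon.embeddings import LCCohereEmbeddings

    embedder = LCCohereEmbeddings(
        model="embed-english-v2.0",
        cohere_api_key="<cohere-api-key>",  # placeholder
    )
    docs = embedder("hello world")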
    "},{"location":"reference/embeddings/#embeddings.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's HuggingFace embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n    model_name: str = Param(\n        help=(\n            \"Model name to use (https://huggingface.co/models?\"\n            \"pipeline_tag=sentence-similarity&sort=trending)\"\n        ),\n        default=None,\n        required=True,\n    )\n\n    def __init__(\n        self,\n        model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n        **params,\n    ):\n        super().__init__(\n            model_name=model_name,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n        except ImportError:\n            from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n        return HuggingFaceBgeEmbeddings\n
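
    Example (a minimal sketch; the default sentence-transformers model is downloaded on first use):

    from kotaemon.embeddings import LCHuggingFaceEmbeddings

    embedder = LCHuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
    )
    docs = embedder("hello world")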
    "},{"location":"reference/embeddings/#embeddings.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's OpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model: str = \"text-embedding-ada-002\",\n        openai_api_version: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        openai_api_type: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            openai_api_version=openai_api_version,\n            openai_api_base=openai_api_base,\n            openai_api_type=openai_api_type,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import OpenAIEmbeddings\n\n        return OpenAIEmbeddings\n
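
    Example (a minimal sketch; the API key is a placeholder):

    from kotaemon.embeddings import LCOpenAIEmbeddings

    embedder = LCOpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key="<openai-api-key>",  # placeholder
    )
    docs = embedder("hello world")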
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n    azure_endpoint: str = Param(\n        None,\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n    api_version: str = Param(None, help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.azure_deployment,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
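
    Example (a minimal sketch; endpoint, deployment, API version, and key are placeholders for your own Azure resource):

    from kotaemon.embeddings import AzureOpenAIEmbeddings

    embedder = AzureOpenAIEmbeddings(
        azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
        azure_deployment="<your-embedding-deployment>",              # placeholder
        api_version="2024-02-01",                                    # placeholder
        api_key="<azure-openai-key>",                                # placeholder
    )
    docs = embedder("hello world")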
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | async_version | bool | Whether to get the async version of the client | False |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/embeddings/#embeddings.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.azure_deployment,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    OpenAI embedding model

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(\n        None,\n        help=(\n            \"ID of the model to use. You can go to [Model overview](https://platform.\"\n            \"openai.com/docs/models/overview) to see the available models.\"\n        ),\n        required=True,\n    )\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.model,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
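
    Example (a minimal sketch; the API key is a placeholder, and `dimensions` only applies to text-embedding-3 and later models, per the parameter help above):

    from kotaemon.embeddings import OpenAIEmbeddings

    embedder = OpenAIEmbeddings(
        model="text-embedding-3-small",
        api_key="<openai-api-key>",  # placeholder
        dimensions=256,              # optional; text-embedding-3 and later only
    )
    docs = embedder("hello world")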
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | async_version | bool | Whether to get the async version of the client | False |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/embeddings/#embeddings.OpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.model,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/base/","title":"Base","text":""},{"location":"reference/embeddings/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings","title":"EndpointEmbeddings","text":"

    Bases: BaseEmbeddings

    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:

    | Name | Type | Description |
    | --- | --- | --- |
    | endpoint_url | str | The url of an OpenAI API compatible endpoint. |

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    class EndpointEmbeddings(BaseEmbeddings):\n    \"\"\"\n    An Embeddings component that uses an OpenAI API compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of an OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"\n        Generate embeddings from text Args:\n            text (str | list[str] | Document | list[Document]): text to generate\n            embeddings from\n        Returns:\n            list[DocumentWithEmbedding]: embeddings\n        \"\"\"\n        if not isinstance(text, list):\n            text = [text]\n\n        outputs = []\n\n        for item in text:\n            response = requests.post(\n                self.endpoint_url, json={\"input\": str(item)}\n            ).json()\n            outputs.append(\n                DocumentWithEmbedding(\n                    text=str(item),\n                    embedding=response[\"data\"][0][\"embedding\"],\n                    total_tokens=response[\"usage\"][\"total_tokens\"],\n                    prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n                )\n            )\n\n        return outputs\n
    "},{"location":"reference/embeddings/endpoint_based/#embeddings.endpoint_based.EndpointEmbeddings.run","title":"run","text":"
    run(text)\n
    Generate embeddings from text.

    text (str | list[str] | Document | list[Document]): the text to generate embeddings from

    Returns: list[DocumentWithEmbedding]: the embeddings

    Source code in libs/kotaemon/kotaemon/embeddings/endpoint_based.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[DocumentWithEmbedding]:\n    \"\"\"\n    Generate embeddings from text Args:\n        text (str | list[str] | Document | list[Document]): text to generate\n        embeddings from\n    Returns:\n        list[DocumentWithEmbedding]: embeddings\n    \"\"\"\n    if not isinstance(text, list):\n        text = [text]\n\n    outputs = []\n\n    for item in text:\n        response = requests.post(\n            self.endpoint_url, json={\"input\": str(item)}\n        ).json()\n        outputs.append(\n            DocumentWithEmbedding(\n                text=str(item),\n                embedding=response[\"data\"][0][\"embedding\"],\n                total_tokens=response[\"usage\"][\"total_tokens\"],\n                prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n            )\n        )\n\n    return outputs\n
    "},{"location":"reference/embeddings/fastembed/","title":"Fastembed","text":""},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings","title":"FastEmbedEmbeddings","text":"

    Bases: BaseEmbeddings

    Uses the fastembed library to generate embeddings locally, without a GPU.

    Supported models: https://qdrant.github.io/fastembed/examples/Supported_Models/
    Code: https://github.com/qdrant/fastembed

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    class FastEmbedEmbeddings(BaseEmbeddings):\n    \"\"\"Utilize fastembed library for embeddings locally without GPU.\n\n    Supported model: https://qdrant.github.io/fastembed/examples/Supported_Models/\n    Code: https://github.com/qdrant/fastembed\n    \"\"\"\n\n    model_name: str = Param(\n        \"BAAI/bge-small-en-v1.5\",\n        help=(\n            \"Model name for fastembed. Please refer \"\n            \"[here](https://qdrant.github.io/fastembed/examples/Supported_Models/) \"\n            \"for the list of supported models.\"\n        ),\n        required=True,\n    )\n    batch_size: int = Param(\n        256,\n        help=\"Batch size for embeddings. Higher values use more memory, but are faster\",\n    )\n    parallel: Optional[int] = Param(\n        None,\n        help=(\n            \"Number of threads to use for embeddings. \"\n            \"If > 1, data-parallel encoding will be used. \"\n            \"If 0, use all available CPUs. \"\n            \"If None, use default onnxruntime threading. \"\n            \"Defaults to None.\"\n        ),\n    )\n\n    @Param.auto()\n    def client_(self) -> \"TextEmbedding\":\n        try:\n            from fastembed import TextEmbedding\n        except ImportError:\n            raise ImportError(\"Please install FastEmbed: `pip install fastembed`\")\n\n        return TextEmbedding(model_name=self.model_name)\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        embeddings = self.client_.embed(\n            [_.content for _ in input_],\n            batch_size=self.batch_size,\n            parallel=self.parallel,\n        )\n        return [\n            DocumentWithEmbedding(\n                content=doc,\n                embedding=list(embedding),\n            )\n            for doc, embedding in zip(input_, embeddings)\n        ]\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        \"\"\"Fastembed does not support async API.\"\"\"\n        return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/fastembed/#embeddings.fastembed.FastEmbedEmbeddings.ainvoke","title":"ainvoke async","text":"
    ainvoke(text, *args, **kwargs)\n

    Fastembed does not support async API.

    Source code in libs/kotaemon/kotaemon/embeddings/fastembed.py
    async def ainvoke(\n    self, text: str | list[str] | Document | list[Document], *args, **kwargs\n) -> list[DocumentWithEmbedding]:\n    \"\"\"Fastembed does not support async API.\"\"\"\n    return self.invoke(text, *args, **kwargs)\n
    "},{"location":"reference/embeddings/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCOpenAIEmbeddings","title":"LCOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's OpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's OpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model: str = \"text-embedding-ada-002\",\n        openai_api_version: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        openai_api_type: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            openai_api_version=openai_api_version,\n            openai_api_base=openai_api_base,\n            openai_api_type=openai_api_type,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import OpenAIEmbeddings\n\n        return OpenAIEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCAzureOpenAIEmbeddings","title":"LCAzureOpenAIEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment: Optional[str] = None,\n        openai_api_key: Optional[str] = None,\n        api_version: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment=deployment,\n            api_version=api_version,\n            openai_api_key=openai_api_key,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAIEmbeddings\n        except ImportError:\n            from langchain.embeddings import AzureOpenAIEmbeddings\n\n        return AzureOpenAIEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCCohereEmbeddings","title":"LCCohereEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's Cohere embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's Cohere embedding, focusing on key parameters\"\"\"\n\n    cohere_api_key: str = Param(\n        help=\"API key (https://dashboard.cohere.com/api-keys)\",\n        default=None,\n        required=True,\n    )\n    model: str = Param(\n        help=\"Model name to use (https://docs.cohere.com/docs/models)\",\n        default=None,\n        required=True,\n    )\n    user_agent: str = Param(\n        help=\"User agent (leave default)\", default=\"default\", required=True\n    )\n\n    def __init__(\n        self,\n        model: str = \"embed-english-v2.0\",\n        cohere_api_key: Optional[str] = None,\n        truncate: Optional[str] = None,\n        request_timeout: Optional[float] = None,\n        **params,\n    ):\n        super().__init__(\n            model=model,\n            cohere_api_key=cohere_api_key,\n            truncate=truncate,\n            request_timeout=request_timeout,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_cohere import CohereEmbeddings\n        except ImportError:\n            from langchain.embeddings import CohereEmbeddings\n\n        return CohereEmbeddings\n
    "},{"location":"reference/embeddings/langchain_based/#embeddings.langchain_based.LCHuggingFaceEmbeddings","title":"LCHuggingFaceEmbeddings","text":"

    Bases: LCEmbeddingMixin, BaseEmbeddings

    Wrapper around Langchain's HuggingFace embedding, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/embeddings/langchain_based.py
    class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):\n    \"\"\"Wrapper around Langchain's HuggingFace embedding, focusing on key parameters\"\"\"\n\n    model_name: str = Param(\n        help=(\n            \"Model name to use (https://huggingface.co/models?\"\n            \"pipeline_tag=sentence-similarity&sort=trending)\"\n        ),\n        default=None,\n        required=True,\n    )\n\n    def __init__(\n        self,\n        model_name: str = \"sentence-transformers/all-mpnet-base-v2\",\n        **params,\n    ):\n        super().__init__(\n            model_name=model_name,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.embeddings import HuggingFaceBgeEmbeddings\n        except ImportError:\n            from langchain.embeddings import HuggingFaceBgeEmbeddings\n\n        return HuggingFaceBgeEmbeddings\n
    "},{"location":"reference/embeddings/openai/","title":"Openai","text":""},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings","title":"BaseOpenAIEmbeddings","text":"

    Bases: BaseEmbeddings

    Base interface for OpenAI embedding model, using the openai library.

    This class exposes the parameters in resources.Chat. To subclass this class:

    - Implement the `prepare_client` method to return the OpenAI client
    - Implement the `openai_response` method to return the OpenAI response
    - Implement the params related to the OpenAI client
    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class BaseOpenAIEmbeddings(BaseEmbeddings):\n    \"\"\"Base interface for OpenAI embedding model, using the openai library.\n\n    This class exposes the parameters in resources.Chat. To subclass this class:\n\n        - Implement the `prepare_client` method to return the OpenAI client\n        - Implement the `openai_response` method to return the OpenAI response\n        - Implement the params related to the OpenAI client\n    \"\"\"\n\n    _dependencies = [\"openai\"]\n\n    api_key: str = Param(None, help=\"API key\", required=True)\n    timeout: Optional[float] = Param(None, help=\"Timeout for the API request.\")\n    max_retries: Optional[int] = Param(\n        None, help=\"Maximum number of retries for the API request.\"\n    )\n\n    dimensions: Optional[int] = Param(\n        None,\n        help=(\n            \"The number of dimensions the resulting output embeddings should have. \"\n            \"Only supported in `text-embedding-3` and later models.\"\n        ),\n    )\n    context_length: Optional[int] = Param(\n        None, help=\"The maximum context length of the embedding model\"\n    )\n\n    @Param.auto(depends_on=[\"max_retries\"])\n    def max_retries_(self):\n        if self.max_retries is None:\n            from openai._constants import DEFAULT_MAX_RETRIES\n\n            return DEFAULT_MAX_RETRIES\n        return self.max_retries\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        raise NotImplementedError\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        raise NotImplementedError\n\n    def invoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_doc = self.prepare_input(text)\n        client = self.prepare_client(async_version=False)\n\n        input_: list[str | list[int]] = []\n        splitted_indices = {}\n        for idx, text in enumerate(input_doc):\n            if self.context_length:\n                chunks = split_text_by_chunk_size(text.text or \" \", self.context_length)\n                splitted_indices[idx] = (len(input_), len(input_) + len(chunks))\n                input_.extend(chunks)\n            else:\n                splitted_indices[idx] = (len(input_), len(input_) + 1)\n                input_.append(text.text)\n\n        resp = self.openai_response(client, input=input_, **kwargs).dict()\n        output_ = list(sorted(resp[\"data\"], key=lambda x: x[\"index\"]))\n\n        output = []\n        for idx, doc in enumerate(input_doc):\n            embs = output_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n            if len(embs) == 1:\n                output.append(\n                    DocumentWithEmbedding(embedding=embs[0][\"embedding\"], content=doc)\n                )\n                continue\n\n            chunk_lens = [\n                len(_)\n                for _ in input_[splitted_indices[idx][0] : splitted_indices[idx][1]]\n            ]\n            vs: list[list[float]] = [_[\"embedding\"] for _ in embs]\n            emb = np.average(vs, axis=0, weights=chunk_lens)\n            emb = emb / np.linalg.norm(emb)\n            output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))\n\n        return output\n\n    async def ainvoke(\n        self, text: str | list[str] | Document | list[Document], *args, **kwargs\n    ) -> list[DocumentWithEmbedding]:\n        input_ = self.prepare_input(text)\n        client = self.prepare_client(async_version=True)\n        # parenthesize the await so .dict() is called on the response, not the coroutine\n        resp = (\n            await self.openai_response(\n                client, input=[_.text if _.text else \" \" for _ in input_], **kwargs\n            )\n        ).dict()\n        output_ = sorted(resp[\"data\"], key=lambda x: x[\"index\"])\n        return [\n            DocumentWithEmbedding(embedding=o[\"embedding\"], content=i)\n            for i, o in zip(input_, output_)\n        ]\n
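
    A minimal subclassing sketch following the contract above (the server URL and model id are placeholders for a hypothetical OpenAI-compatible server):

    from kotaemon.embeddings.openai import BaseOpenAIEmbeddings

    class LocalServerEmbeddings(BaseOpenAIEmbeddings):  # hypothetical subclass
        def prepare_client(self, async_version: bool = False):
            from openai import AsyncOpenAI, OpenAI

            # base_url is a placeholder for a local OpenAI-compatible server
            params = {"api_key": self.api_key, "base_url": "http://localhost:8000/v1"}
            return AsyncOpenAI(**params) if async_version else OpenAI(**params)

        def openai_response(self, client, **kwargs):
            # "local-embed" is a placeholder model id; `invoke` supplies `input=...`
            return client.embeddings.create(model="local-embed", **kwargs)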
    "},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | async_version | bool | Whether to get the async version of the client | False |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.BaseOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings","title":"OpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    OpenAI embedding model

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class OpenAIEmbeddings(BaseOpenAIEmbeddings):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(\n        None,\n        help=(\n            \"ID of the model to use. You can go to [Model overview](https://platform.\"\n            \"openai.com/docs/models/overview) to see the available models.\"\n        ),\n        required=True,\n    )\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.model,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | async_version | bool | Whether to get the async version of the client | False |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.OpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.model,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings","title":"AzureOpenAIEmbeddings","text":"

    Bases: BaseOpenAIEmbeddings

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):\n    azure_endpoint: str = Param(\n        None,\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(None, help=\"Azure deployment name\", required=True)\n    api_version: str = Param(None, help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    @retry(\n        retry=retry_if_not_exception_type(\n            (openai.NotFoundError, openai.BadRequestError)\n        ),\n        wait=wait_random_exponential(min=1, max=40),\n        stop=stop_after_attempt(6),\n    )\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        params: dict = {\n            \"model\": self.azure_deployment,\n        }\n        if self.dimensions:\n            params[\"dimensions\"] = self.dimensions\n        params.update(kwargs)\n\n        return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | async_version | bool | Whether to get the async version of the client | False |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.AzureOpenAIEmbeddings.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    @retry(\n    retry=retry_if_not_exception_type(\n        (openai.NotFoundError, openai.BadRequestError)\n    ),\n    wait=wait_random_exponential(min=1, max=40),\n    stop=stop_after_attempt(6),\n)\ndef openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    params: dict = {\n        \"model\": self.azure_deployment,\n    }\n    if self.dimensions:\n        params[\"dimensions\"] = self.dimensions\n    params.update(kwargs)\n\n    return client.embeddings.create(**params)\n
    "},{"location":"reference/embeddings/openai/#embeddings.openai.split_text_by_chunk_size","title":"split_text_by_chunk_size","text":"
    split_text_by_chunk_size(text, chunk_size)\n

    Split the text into chunks of a given size

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | text | str | text to split | required |
    | chunk_size | int | size of each chunk | required |

    Returns:

    | Type | Description |
    | --- | --- |
    | list[list[int]] | list of chunks (as tokens) |

    Source code in libs/kotaemon/kotaemon/embeddings/openai.py
    def split_text_by_chunk_size(text: str, chunk_size: int) -> list[list[int]]:\n    \"\"\"Split the text into chunks of a given size\n\n    Args:\n        text: text to split\n        chunk_size: size of each chunk\n\n    Returns:\n        list of chunks (as tokens)\n    \"\"\"\n    encoding = tiktoken.get_encoding(\"cl100k_base\")\n    tokens = iter(encoding.encode(text))\n    result = []\n    while chunk := list(islice(tokens, chunk_size)):\n        result.append(chunk)\n    return result\n
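
    A small worked example (output lengths depend on the actual cl100k_base tokenization):

    import tiktoken

    from kotaemon.embeddings.openai import split_text_by_chunk_size

    chunks = split_text_by_chunk_size("the quick brown fox jumps over the lazy dog", 4)
    enc = tiktoken.get_encoding("cl100k_base")
    for chunk in chunks:
        # every chunk except possibly the last holds exactly 4 token ids
        print(len(chunk), repr(enc.decode(chunk)))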
    "},{"location":"reference/indices/","title":"Indices","text":""},{"location":"reference/indices/#indices.VectorIndexing","title":"VectorIndexing","text":"

    Bases: BaseIndexing

    Ingest the document, run through the embedding, and store the embedding in a vector store.

    This pipeline supports the following set of inputs:

    - List of documents
    - List of texts

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorIndexing(BaseIndexing):\n    \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n    vector store.\n\n    This pipeline supports the following set of inputs:\n        - List of documents\n        - List of texts\n    \"\"\"\n\n    cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    count_: int = 0\n\n    def to_retrieval_pipeline(self, *args, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        return VectorRetrieval(\n            vector_store=self.vector_store,\n            doc_store=self.doc_store,\n            embedding=self.embedding,\n            **kwargs,\n        )\n\n    def to_qa_pipeline(self, *args, **kwargs):\n        from .qa import CitationQAPipeline\n\n        return TextVectorQA(\n            retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n            qa_pipeline=CitationQAPipeline(**kwargs),\n        )\n\n    def write_chunk_to_file(self, docs: list[Document]):\n        # save the chunks content into markdown format\n        if self.cache_dir:\n            file_name = Path(docs[0].metadata[\"file_name\"])\n            for i in range(len(docs)):\n                markdown_content = \"\"\n                if \"page_label\" in docs[i].metadata:\n                    page_label = str(docs[i].metadata[\"page_label\"])\n                    markdown_content += f\"Page label: {page_label}\"\n                if \"file_name\" in docs[i].metadata:\n                    filename = docs[i].metadata[\"file_name\"]\n                    markdown_content += f\"\\nFile name: {filename}\"\n                if \"section\" in docs[i].metadata:\n                    section = docs[i].metadata[\"section\"]\n                    markdown_content += f\"\\nSection: {section}\"\n                if \"type\" in docs[i].metadata:\n                    if docs[i].metadata[\"type\"] == \"image\":\n                        image_origin = docs[i].metadata[\"image_origin\"]\n                        image_origin = f'<p><img src=\"{image_origin}\"></p>'\n                        markdown_content += f\"\\nImage origin: {image_origin}\"\n                if docs[i].text:\n                    markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n                with open(\n                    Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n                    \"w\",\n                    encoding=\"utf-8\",\n                ) as f:\n                    f.write(markdown_content)\n\n    def add_to_docstore(self, docs: list[Document]):\n        if self.doc_store:\n            print(\"Adding documents to doc store\")\n            self.doc_store.add(docs)\n\n    def add_to_vectorstore(self, docs: list[Document]):\n        # in case we want to skip embedding\n        if self.vector_store:\n            print(f\"Getting embeddings for {len(docs)} nodes\")\n            embeddings = self.embedding(docs)\n            print(\"Adding embeddings to vector store\")\n            self.vector_store.add(\n                embeddings=embeddings,\n                ids=[t.doc_id for t in docs],\n            )\n\n    def run(self, text: str | list[str] | Document | list[Document]):\n        input_: list[Document] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in cast(list, text):\n            if isinstance(item, str):\n                input_.append(Document(text=item, id_=str(uuid.uuid4())))\n            elif isinstance(item, Document):\n                input_.append(item)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        self.add_to_vectorstore(input_)\n        self.add_to_docstore(input_)\n        self.write_chunk_to_file(input_)\n        self.count_ += len(input_)\n
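
    A minimal end-to-end sketch (the in-memory store classes are assumed to be exported from kotaemon.storages; swap in your own vector store and doc store as needed):

    from kotaemon.embeddings import FastEmbedEmbeddings
    from kotaemon.indices import VectorIndexing
    from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore  # assumed names

    indexing = VectorIndexing(
        vector_store=InMemoryVectorStore(),
        doc_store=InMemoryDocumentStore(),
        embedding=FastEmbedEmbeddings(),
        cache_dir=None,  # skip chunk files; writing them needs file_name metadata
    )
    indexing.run(["first document", "second document"])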
    "},{"location":"reference/indices/#indices.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(*args, **kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def to_retrieval_pipeline(self, *args, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    return VectorRetrieval(\n        vector_store=self.vector_store,\n        doc_store=self.doc_store,\n        embedding=self.embedding,\n        **kwargs,\n    )\n
    "},{"location":"reference/indices/#indices.VectorRetrieval","title":"VectorRetrieval","text":"

    Bases: BaseRetrieval

    Retrieve list of documents from vector store

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorRetrieval(BaseRetrieval):\n    \"\"\"Retrieve list of documents from vector store\"\"\"\n\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    rerankers: Sequence[BaseReranking] = []\n    top_k: int = 5\n    first_round_top_k_mult: int = 10\n    retrieval_mode: str = \"hybrid\"  # vector, text, hybrid\n\n    def _filter_docs(\n        self, documents: list[RetrievedDocument], top_k: int | None = None\n    ):\n        if top_k:\n            documents = documents[:top_k]\n        return documents\n\n    def run(\n        self, text: str | Document, top_k: Optional[int] = None, **kwargs\n    ) -> list[RetrievedDocument]:\n        \"\"\"Retrieve a list of documents from vector store\n\n        Args:\n            text: the text to retrieve similar documents\n            top_k: number of top similar documents to return\n\n        Returns:\n            list[RetrievedDocument]: list of retrieved documents\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n\n        do_extend = kwargs.pop(\"do_extend\", False)\n        thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n        if do_extend:\n            top_k_first_round = top_k * self.first_round_top_k_mult\n        else:\n            top_k_first_round = top_k\n\n        if self.doc_store is None:\n            raise ValueError(\n                \"doc_store is not provided. Please provide a doc_store to \"\n                \"retrieve the documents\"\n            )\n\n        result: list[RetrievedDocument] = []\n        # TODO: should declare scope directly in the run params\n        scope = kwargs.pop(\"scope\", None)\n        emb: list[float]\n\n        if self.retrieval_mode == \"vector\":\n            emb = self.embedding(text)[0].embedding\n            _, scores, ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            docs = self.doc_store.get(ids)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(docs, scores)\n            ]\n        elif self.retrieval_mode == \"text\":\n            query = text.text if isinstance(text, Document) else text\n            docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n            result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n        elif self.retrieval_mode == \"hybrid\":\n            # similarity search section\n            emb = self.embedding(text)[0].embedding\n            vs_docs: list[RetrievedDocument] = []\n            vs_ids: list[str] = []\n            vs_scores: list[float] = []\n\n            def query_vectorstore():\n                nonlocal vs_docs\n                nonlocal vs_scores\n                nonlocal vs_ids\n\n                assert self.doc_store is not None\n                _, vs_scores, vs_ids = self.vector_store.query(\n                    embedding=emb, top_k=top_k_first_round, **kwargs\n                )\n                if vs_ids:\n                    vs_docs = self.doc_store.get(vs_ids)\n\n            # full-text search section\n            ds_docs: list[RetrievedDocument] = []\n\n            def query_docstore():\n                nonlocal ds_docs\n\n                assert self.doc_store is not None\n                query = text.text if isinstance(text, Document) else text\n                ds_docs = self.doc_store.query(\n                    query, top_k=top_k_first_round, doc_ids=scope\n                )\n\n            vs_query_thread = threading.Thread(target=query_vectorstore)\n            ds_query_thread = threading.Thread(target=query_docstore)\n\n            vs_query_thread.start()\n            ds_query_thread.start()\n\n            vs_query_thread.join()\n            ds_query_thread.join()\n\n            # skip docs already returned by the vector store (compare by doc_id)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=-1.0)\n                for doc in ds_docs\n                if doc.doc_id not in vs_ids\n            ]\n            result += [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(vs_docs, vs_scores)\n            ]\n            print(f\"Got {len(vs_docs)} from vectorstore\")\n            print(f\"Got {len(ds_docs)} from docstore\")\n\n        # use additional reranker to re-order the document list\n        if self.rerankers and text:\n            for reranker in self.rerankers:\n                # if reranker is LLMReranking, limit the document with top_k items only\n                if isinstance(reranker, LLMReranking):\n                    result = self._filter_docs(result, top_k=top_k)\n                result = reranker(documents=result, query=text)\n\n        result = self._filter_docs(result, top_k=top_k)\n        print(f\"Got raw {len(result)} retrieved documents\")\n\n        # add page thumbnails to the result if exists\n        thumbnail_doc_ids: set[str] = set()\n        # we should copy the text from retrieved text chunk\n        # to the thumbnail to get relevant LLM score correctly\n        text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n        non_thumbnail_docs = []\n        raw_thumbnail_docs = []\n        for doc in result:\n            if doc.metadata.get(\"type\") == \"thumbnail\":\n                # change type to image to display on UI\n                doc.metadata[\"type\"] = \"image\"\n                raw_thumbnail_docs.append(doc)\n                continue\n            if (\n                \"thumbnail_doc_id\" in doc.metadata\n                and len(thumbnail_doc_ids) < thumbnail_count\n            ):\n                thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n                thumbnail_doc_ids.add(thumbnail_id)\n                text_thumbnail_docs[thumbnail_id] = doc\n            else:\n                non_thumbnail_docs.append(doc)\n\n        linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n        print(\n            \"thumbnail docs\",\n            len(linked_thumbnail_docs),\n            \"non-thumbnail docs\",\n            len(non_thumbnail_docs),\n            \"raw-thumbnail docs\",\n            len(raw_thumbnail_docs),\n        )\n        additional_docs = []\n\n        for thumbnail_doc in linked_thumbnail_docs:\n            text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n            doc_dict = thumbnail_doc.to_dict()\n            doc_dict[\"_id\"] = text_doc.doc_id\n            doc_dict[\"content\"] = text_doc.content\n            doc_dict[\"metadata\"][\"type\"] = \"image\"\n            for key in text_doc.metadata:\n                if key not in doc_dict[\"metadata\"]:\n                    doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n            additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n        result = additional_docs + non_thumbnail_docs\n\n        if not result:\n            # return output from raw retrieved thumbnails\n            result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n        return result\n
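
    Continuing the VectorIndexing sketch above, a retrieval sketch (vector mode is used so the example does not depend on full-text query support in the chosen doc store):

    retrieval = indexing.to_retrieval_pipeline(retrieval_mode="vector", top_k=2)
    for doc in retrieval.run("first document"):
        print(doc.score, doc.text)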
    "},{"location":"reference/indices/#indices.VectorRetrieval.run","title":"run","text":"
    run(text, top_k=None, **kwargs)\n

    Retrieve a list of documents from vector store

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | text | str \| Document | the text to retrieve similar documents | required |
    | top_k | Optional[int] | number of top similar documents to return | None |

    Returns:

    | Type | Description |
    | --- | --- |
    | list[RetrievedDocument] | list of retrieved documents |

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def run(\n    self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n    \"\"\"Retrieve a list of documents from vector store\n\n    Args:\n        text: the text to retrieve similar documents\n        top_k: number of top similar documents to return\n\n    Returns:\n        list[RetrievedDocument]: list of retrieved documents\n    \"\"\"\n    if top_k is None:\n        top_k = self.top_k\n\n    do_extend = kwargs.pop(\"do_extend\", False)\n    thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n    if do_extend:\n        top_k_first_round = top_k * self.first_round_top_k_mult\n    else:\n        top_k_first_round = top_k\n\n    if self.doc_store is None:\n        raise ValueError(\n            \"doc_store is not provided. Please provide a doc_store to \"\n            \"retrieve the documents\"\n        )\n\n    result: list[RetrievedDocument] = []\n    # TODO: should declare scope directly in the run params\n    scope = kwargs.pop(\"scope\", None)\n    emb: list[float]\n\n    if self.retrieval_mode == \"vector\":\n        emb = self.embedding(text)[0].embedding\n        _, scores, ids = self.vector_store.query(\n            embedding=emb, top_k=top_k_first_round, **kwargs\n        )\n        docs = self.doc_store.get(ids)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(docs, scores)\n        ]\n    elif self.retrieval_mode == \"text\":\n        query = text.text if isinstance(text, Document) else text\n        docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n    elif self.retrieval_mode == \"hybrid\":\n        # similarity search section\n        emb = self.embedding(text)[0].embedding\n        vs_docs: list[RetrievedDocument] = []\n        vs_ids: list[str] = []\n        vs_scores: list[float] = []\n\n        def query_vectorstore():\n            nonlocal vs_docs\n            nonlocal vs_scores\n            nonlocal vs_ids\n\n            assert self.doc_store is not None\n            _, vs_scores, vs_ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            if vs_ids:\n                vs_docs = self.doc_store.get(vs_ids)\n\n        # full-text search section\n        ds_docs: list[RetrievedDocument] = []\n\n        def query_docstore():\n            nonlocal ds_docs\n\n            assert self.doc_store is not None\n            query = text.text if isinstance(text, Document) else text\n            ds_docs = self.doc_store.query(\n                query, top_k=top_k_first_round, doc_ids=scope\n            )\n\n        vs_query_thread = threading.Thread(target=query_vectorstore)\n        ds_query_thread = threading.Thread(target=query_docstore)\n\n        vs_query_thread.start()\n        ds_query_thread.start()\n\n        vs_query_thread.join()\n        ds_query_thread.join()\n\n        # skip docs already returned by the vector store (compare by doc_id)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=-1.0)\n            for doc in ds_docs\n            if doc.doc_id not in vs_ids\n        ]\n        result += [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(vs_docs, vs_scores)\n        ]\n        print(f\"Got {len(vs_docs)} from vectorstore\")\n        print(f\"Got {len(ds_docs)} from docstore\")\n\n    # use additional reranker to re-order the document list\n    if self.rerankers and text:\n        for reranker in self.rerankers:\n            # if reranker is LLMReranking, limit the document with top_k items only\n            if isinstance(reranker, LLMReranking):\n                result = self._filter_docs(result, top_k=top_k)\n            result = reranker(documents=result, query=text)\n\n    result = self._filter_docs(result, top_k=top_k)\n    print(f\"Got raw {len(result)} retrieved documents\")\n\n    # add page thumbnails to the result if exists\n    thumbnail_doc_ids: set[str] = set()\n    # we should copy the text from retrieved text chunk\n    # to the thumbnail to get relevant LLM score correctly\n    text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n    non_thumbnail_docs = []\n    raw_thumbnail_docs = []\n    for doc in result:\n        if doc.metadata.get(\"type\") == \"thumbnail\":\n            # change type to image to display on UI\n            doc.metadata[\"type\"] = \"image\"\n            raw_thumbnail_docs.append(doc)\n            continue\n        if (\n            \"thumbnail_doc_id\" in doc.metadata\n            and len(thumbnail_doc_ids) < thumbnail_count\n        ):\n            thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n            thumbnail_doc_ids.add(thumbnail_id)\n            text_thumbnail_docs[thumbnail_id] = doc\n        else:\n            non_thumbnail_docs.append(doc)\n\n    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n    print(\n        \"thumbnail docs\",\n        len(linked_thumbnail_docs),\n        \"non-thumbnail docs\",\n        len(non_thumbnail_docs),\n        \"raw-thumbnail docs\",\n        len(raw_thumbnail_docs),\n    )\n    additional_docs = []\n\n    for thumbnail_doc in linked_thumbnail_docs:\n        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n        doc_dict = thumbnail_doc.to_dict()\n        doc_dict[\"_id\"] = text_doc.doc_id\n        doc_dict[\"content\"] = text_doc.content\n        doc_dict[\"metadata\"][\"type\"] = \"image\"\n        for key in text_doc.metadata:\n            if key not in doc_dict[\"metadata\"]:\n                doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n    result = additional_docs + non_thumbnail_docs\n\n    if not result:\n        # return output from raw retrieved thumbnails\n        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n    return result\n
    "},{"location":"reference/indices/base/","title":"Base","text":""},{"location":"reference/indices/base/#indices.base.DocTransformer","title":"DocTransformer","text":"

    Bases: BaseComponent

    This is a base class for document transformers

A document transformer transforms a list of documents into another list of documents. Transforming can mean splitting one document into several, reducing a large list of documents to a smaller one, or adding metadata to each document in the list.

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class DocTransformer(BaseComponent):\n    \"\"\"This is a base class for document transformers\n\n    A document transformer transforms a list of documents into another list\n    of documents. Transforming can mean splitting a document into multiple documents,\n    reducing a large list of documents into a smaller list of documents, or adding\n    metadata to each document in a list of documents, etc.\n    \"\"\"\n\n    @abstractmethod\n    def run(\n        self,\n        documents: list[Document],\n        **kwargs,\n    ) -> list[Document]:\n        ...\n
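To implement a concrete transformer you only need to override run. A minimal sketch, assuming the import paths from the source listing above (the TagTransformer name and the metadata key are illustrative):

    from kotaemon.base import Document
    from kotaemon.indices.base import DocTransformer

    class TagTransformer(DocTransformer):
        """Attach a static metadata tag to every document."""

        tag: str = "ingested"  # component parameter, illustrative

        def run(self, documents: list[Document], **kwargs) -> list[Document]:
            for doc in documents:
                doc.metadata["tag"] = self.tag  # annotate in place
            return documents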
    "},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin","title":"LlamaIndexDocTransformerMixin","text":"

Allows automatically wrapping a LlamaIndex component into a kotaemon component.

Example:

    class TokenSplitter(LlamaIndexMixin, BaseSplitter):
        def _get_li_class(self):
            from llama_index.core.text_splitter import TokenTextSplitter
            return TokenTextSplitter

To use this mixin, please:

1. Use this class as the 1st parent class, so that Python will prefer to use the attributes and methods of this class whenever possible.
2. Overwrite _get_li_class to return the relevant LlamaIndex component.

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class LlamaIndexDocTransformerMixin:\n    \"\"\"Allow automatically wrapping a Llama-index component into kotaemon component\n\n    Example:\n        class TokenSplitter(LlamaIndexMixin, BaseSplitter):\n            def _get_li_class(self):\n                from llama_index.core.text_splitter import TokenTextSplitter\n                return TokenTextSplitter\n\n    To use this mixin, please:\n        1. Use this class as the 1st parent class, so that Python will prefer to use\n        the attributes and methods of this class whenever possible.\n        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.\n    \"\"\"\n\n    def _get_li_class(self) -> Type[NodeParser]:\n        raise NotImplementedError(\n            \"Please return the relevant LlamaIndex class in _get_li_class\"\n        )\n\n    def __init__(self, **params):\n        self._li_cls = self._get_li_class()\n        self._obj = self._li_cls(**params)\n        self._kwargs = params\n        super().__init__()\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\") or name in self._protected_keywords():\n            return super().__setattr__(name, value)\n\n        self._kwargs[name] = value\n        return setattr(self._obj, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def run(\n        self,\n        documents: list[Document],\n        **kwargs,\n    ) -> list[Document]:\n        \"\"\"Run Llama-index node parser and convert the output to Document from\n        kotaemon\n        \"\"\"\n        docs = self._obj(documents, **kwargs)  # type: ignore\n        return [Document.from_dict(doc.to_dict()) for doc in docs]\n
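Putting the docstring's example to work: a minimal sketch wrapping LlamaIndex's TokenTextSplitter. The BaseSplitter module path is an assumption; constructor keywords are forwarded to the wrapped LlamaIndex class by __init__:

    from kotaemon.indices.base import LlamaIndexDocTransformerMixin
    from kotaemon.indices.splitters import BaseSplitter  # assumed module path

    class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
        def _get_li_class(self):
            from llama_index.core.text_splitter import TokenTextSplitter
            return TokenTextSplitter

    splitter = TokenSplitter(chunk_size=512, chunk_overlap=64)  # forwarded to TokenTextSplitter
    chunks = splitter(documents)  # documents: list[Document] from an ingestor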
    "},{"location":"reference/indices/base/#indices.base.LlamaIndexDocTransformerMixin.run","title":"run","text":"
    run(documents, **kwargs)\n

Run the LlamaIndex node parser and convert the output into kotaemon Documents.

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def run(\n    self,\n    documents: list[Document],\n    **kwargs,\n) -> list[Document]:\n    \"\"\"Run Llama-index node parser and convert the output to Document from\n    kotaemon\n    \"\"\"\n    docs = self._obj(documents, **kwargs)  # type: ignore\n    return [Document.from_dict(doc.to_dict()) for doc in docs]\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing","title":"BaseIndexing","text":"

    Bases: BaseComponent

Define the base interface for an indexing pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class BaseIndexing(BaseComponent):\n    \"\"\"Define the base interface for indexing pipeline\"\"\"\n\n    def to_retrieval_pipeline(self, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        raise NotImplementedError\n\n    def to_qa_pipeline(self, **kwargs):\n        \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n        raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(**kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def to_retrieval_pipeline(self, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseIndexing.to_qa_pipeline","title":"to_qa_pipeline","text":"
    to_qa_pipeline(**kwargs)\n

    Convert the indexing pipeline to a QA pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    def to_qa_pipeline(self, **kwargs):\n    \"\"\"Convert the indexing pipeline to a QA pipeline\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/indices/base/#indices.base.BaseRetrieval","title":"BaseRetrieval","text":"

    Bases: BaseComponent

Define the base interface for a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/base.py
    class BaseRetrieval(BaseComponent):\n    \"\"\"Define the base interface for retrieval pipeline\"\"\"\n\n    @abstractmethod\n    def run(self, *args, **kwargs) -> list[RetrievedDocument]:\n        ...\n
    "},{"location":"reference/indices/vectorindex/","title":"Vectorindex","text":""},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing","title":"VectorIndexing","text":"

    Bases: BaseIndexing

    Ingest the document, run through the embedding, and store the embedding in a vector store.

This pipeline supports the following set of inputs:

- List of documents
- List of texts

Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorIndexing(BaseIndexing):\n    \"\"\"Ingest the document, run through the embedding, and store the embedding in a\n    vector store.\n\n    This pipeline supports the following set of inputs:\n        - List of documents\n        - List of texts\n    \"\"\"\n\n    cache_dir: Optional[str] = getattr(flowsettings, \"KH_CHUNKS_OUTPUT_DIR\", None)\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    count_: int = 0\n\n    def to_retrieval_pipeline(self, *args, **kwargs):\n        \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n        return VectorRetrieval(\n            vector_store=self.vector_store,\n            doc_store=self.doc_store,\n            embedding=self.embedding,\n            **kwargs,\n        )\n\n    def to_qa_pipeline(self, *args, **kwargs):\n        from .qa import CitationQAPipeline\n\n        return TextVectorQA(\n            retrieving_pipeline=self.to_retrieval_pipeline(**kwargs),\n            qa_pipeline=CitationQAPipeline(**kwargs),\n        )\n\n    def write_chunk_to_file(self, docs: list[Document]):\n        # save the chunks content into markdown format\n        if self.cache_dir:\n            file_name = Path(docs[0].metadata[\"file_name\"])\n            for i in range(len(docs)):\n                markdown_content = \"\"\n                if \"page_label\" in docs[i].metadata:\n                    page_label = str(docs[i].metadata[\"page_label\"])\n                    markdown_content += f\"Page label: {page_label}\"\n                if \"file_name\" in docs[i].metadata:\n                    filename = docs[i].metadata[\"file_name\"]\n                    markdown_content += f\"\\nFile name: {filename}\"\n                if \"section\" in docs[i].metadata:\n                    section = docs[i].metadata[\"section\"]\n                    markdown_content += f\"\\nSection: {section}\"\n                if \"type\" in docs[i].metadata:\n                    if docs[i].metadata[\"type\"] == \"image\":\n                        image_origin = docs[i].metadata[\"image_origin\"]\n                        image_origin = f'<p><img src=\"{image_origin}\"></p>'\n                        markdown_content += f\"\\nImage origin: {image_origin}\"\n                if docs[i].text:\n                    markdown_content += f\"\\ntext:\\n{docs[i].text}\"\n\n                with open(\n                    Path(self.cache_dir) / f\"{file_name.stem}_{self.count_+i}.md\",\n                    \"w\",\n                    encoding=\"utf-8\",\n                ) as f:\n                    f.write(markdown_content)\n\n    def add_to_docstore(self, docs: list[Document]):\n        if self.doc_store:\n            print(\"Adding documents to doc store\")\n            self.doc_store.add(docs)\n\n    def add_to_vectorstore(self, docs: list[Document]):\n        # in case we want to skip embedding\n        if self.vector_store:\n            print(f\"Getting embeddings for {len(docs)} nodes\")\n            embeddings = self.embedding(docs)\n            print(\"Adding embeddings to vector store\")\n            self.vector_store.add(\n                embeddings=embeddings,\n                ids=[t.doc_id for t in docs],\n            )\n\n    def run(self, text: str | list[str] | Document | list[Document]):\n        input_: list[Document] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in cast(list, text):\n            if isinstance(item, str):\n                
input_.append(Document(text=item, id_=str(uuid.uuid4())))\n            elif isinstance(item, Document):\n                input_.append(item)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        self.add_to_vectorstore(input_)\n        self.add_to_docstore(input_)\n        self.write_chunk_to_file(input_)\n        self.count_ += len(input_)\n
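A usage sketch; the store and embedding classes (and the embedding model name) are assumptions here, so swap in whatever your deployment actually provides:

    from kotaemon.embeddings import OpenAIEmbeddings          # assumed embedding wrapper
    from kotaemon.indices.vectorindex import VectorIndexing
    from kotaemon.storages import InMemoryDocumentStore, InMemoryVectorStore  # assumed stores

    indexing = VectorIndexing(
        vector_store=InMemoryVectorStore(),
        doc_store=InMemoryDocumentStore(),
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    )
    # run() accepts a str, a Document, or lists of either
    indexing(["first chunk of text", "second chunk of text"])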
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorIndexing.to_retrieval_pipeline","title":"to_retrieval_pipeline","text":"
    to_retrieval_pipeline(*args, **kwargs)\n

    Convert the indexing pipeline to a retrieval pipeline

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def to_retrieval_pipeline(self, *args, **kwargs):\n    \"\"\"Convert the indexing pipeline to a retrieval pipeline\"\"\"\n    return VectorRetrieval(\n        vector_store=self.vector_store,\n        doc_store=self.doc_store,\n        embedding=self.embedding,\n        **kwargs,\n    )\n
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval","title":"VectorRetrieval","text":"

    Bases: BaseRetrieval

    Retrieve list of documents from vector store

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    class VectorRetrieval(BaseRetrieval):\n    \"\"\"Retrieve list of documents from vector store\"\"\"\n\n    vector_store: BaseVectorStore\n    doc_store: Optional[BaseDocumentStore] = None\n    embedding: BaseEmbeddings\n    rerankers: Sequence[BaseReranking] = []\n    top_k: int = 5\n    first_round_top_k_mult: int = 10\n    retrieval_mode: str = \"hybrid\"  # vector, text, hybrid\n\n    def _filter_docs(\n        self, documents: list[RetrievedDocument], top_k: int | None = None\n    ):\n        if top_k:\n            documents = documents[:top_k]\n        return documents\n\n    def run(\n        self, text: str | Document, top_k: Optional[int] = None, **kwargs\n    ) -> list[RetrievedDocument]:\n        \"\"\"Retrieve a list of documents from vector store\n\n        Args:\n            text: the text to retrieve similar documents\n            top_k: number of top similar documents to return\n\n        Returns:\n            list[RetrievedDocument]: list of retrieved documents\n        \"\"\"\n        if top_k is None:\n            top_k = self.top_k\n\n        do_extend = kwargs.pop(\"do_extend\", False)\n        thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n        if do_extend:\n            top_k_first_round = top_k * self.first_round_top_k_mult\n        else:\n            top_k_first_round = top_k\n\n        if self.doc_store is None:\n            raise ValueError(\n                \"doc_store is not provided. Please provide a doc_store to \"\n                \"retrieve the documents\"\n            )\n\n        result: list[RetrievedDocument] = []\n        # TODO: should declare scope directly in the run params\n        scope = kwargs.pop(\"scope\", None)\n        emb: list[float]\n\n        if self.retrieval_mode == \"vector\":\n            emb = self.embedding(text)[0].embedding\n            _, scores, ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            docs = self.doc_store.get(ids)\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(docs, scores)\n            ]\n        elif self.retrieval_mode == \"text\":\n            query = text.text if isinstance(text, Document) else text\n            docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n            result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n        elif self.retrieval_mode == \"hybrid\":\n            # similarity search section\n            emb = self.embedding(text)[0].embedding\n            vs_docs: list[RetrievedDocument] = []\n            vs_ids: list[str] = []\n            vs_scores: list[float] = []\n\n            def query_vectorstore():\n                nonlocal vs_docs\n                nonlocal vs_scores\n                nonlocal vs_ids\n\n                assert self.doc_store is not None\n                _, vs_scores, vs_ids = self.vector_store.query(\n                    embedding=emb, top_k=top_k_first_round, **kwargs\n                )\n                if vs_ids:\n                    vs_docs = self.doc_store.get(vs_ids)\n\n            # full-text search section\n            ds_docs: list[RetrievedDocument] = []\n\n            def query_docstore():\n                nonlocal ds_docs\n\n                assert self.doc_store is not None\n                query = text.text if isinstance(text, Document) else text\n                ds_docs = self.doc_store.query(\n                    query, 
top_k=top_k_first_round, doc_ids=scope\n                )\n\n            vs_query_thread = threading.Thread(target=query_vectorstore)\n            ds_query_thread = threading.Thread(target=query_docstore)\n\n            vs_query_thread.start()\n            ds_query_thread.start()\n\n            vs_query_thread.join()\n            ds_query_thread.join()\n\n            result = [\n                RetrievedDocument(**doc.to_dict(), score=-1.0)\n                for doc in ds_docs\n                if doc not in vs_ids\n            ]\n            result += [\n                RetrievedDocument(**doc.to_dict(), score=score)\n                for doc, score in zip(vs_docs, vs_scores)\n            ]\n            print(f\"Got {len(vs_docs)} from vectorstore\")\n            print(f\"Got {len(ds_docs)} from docstore\")\n\n        # use additional reranker to re-order the document list\n        if self.rerankers and text:\n            for reranker in self.rerankers:\n                # if reranker is LLMReranking, limit the document with top_k items only\n                if isinstance(reranker, LLMReranking):\n                    result = self._filter_docs(result, top_k=top_k)\n                result = reranker(documents=result, query=text)\n\n        result = self._filter_docs(result, top_k=top_k)\n        print(f\"Got raw {len(result)} retrieved documents\")\n\n        # add page thumbnails to the result if exists\n        thumbnail_doc_ids: set[str] = set()\n        # we should copy the text from retrieved text chunk\n        # to the thumbnail to get relevant LLM score correctly\n        text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n        non_thumbnail_docs = []\n        raw_thumbnail_docs = []\n        for doc in result:\n            if doc.metadata.get(\"type\") == \"thumbnail\":\n                # change type to image to display on UI\n                doc.metadata[\"type\"] = \"image\"\n                raw_thumbnail_docs.append(doc)\n                continue\n            if (\n                \"thumbnail_doc_id\" in doc.metadata\n                and len(thumbnail_doc_ids) < thumbnail_count\n            ):\n                thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n                thumbnail_doc_ids.add(thumbnail_id)\n                text_thumbnail_docs[thumbnail_id] = doc\n            else:\n                non_thumbnail_docs.append(doc)\n\n        linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n        print(\n            \"thumbnail docs\",\n            len(linked_thumbnail_docs),\n            \"non-thumbnail docs\",\n            len(non_thumbnail_docs),\n            \"raw-thumbnail docs\",\n            len(raw_thumbnail_docs),\n        )\n        additional_docs = []\n\n        for thumbnail_doc in linked_thumbnail_docs:\n            text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n            doc_dict = thumbnail_doc.to_dict()\n            doc_dict[\"_id\"] = text_doc.doc_id\n            doc_dict[\"content\"] = text_doc.content\n            doc_dict[\"metadata\"][\"type\"] = \"image\"\n            for key in text_doc.metadata:\n                if key not in doc_dict[\"metadata\"]:\n                    doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n            additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n        result = additional_docs + non_thumbnail_docs\n\n        if not result:\n            # return output from raw retrieved thumbnails\n            result = self._filter_docs(raw_thumbnail_docs, 
top_k=thumbnail_count)\n\n        return result\n
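A retrieval sketch that reuses the indexing pipeline from the sketch above; retrieval_mode and top_k are forwarded to the VectorRetrieval constructor by to_retrieval_pipeline:

    retrieval = indexing.to_retrieval_pipeline(retrieval_mode="vector", top_k=3)
    for doc in retrieval("what is the refund policy?"):
        print(doc.score, doc.text[:80])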
    "},{"location":"reference/indices/vectorindex/#indices.vectorindex.VectorRetrieval.run","title":"run","text":"
    run(text, top_k=None, **kwargs)\n

    Retrieve a list of documents from vector store

Parameters:

- text (str | Document, required): the text to retrieve similar documents
- top_k (Optional[int], default None): number of top similar documents to return
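The source also accepts a few keyword arguments not listed above: do_extend (default False; multiplies the first retrieval round's top_k by first_round_top_k_mult), thumbnail_count (default 3; caps the number of linked page thumbnails attached to the result), and scope (a list of doc_ids that restricts the full-text search).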

Returns:

- list[RetrievedDocument]: list of retrieved documents

    Source code in libs/kotaemon/kotaemon/indices/vectorindex.py
    def run(\n    self, text: str | Document, top_k: Optional[int] = None, **kwargs\n) -> list[RetrievedDocument]:\n    \"\"\"Retrieve a list of documents from vector store\n\n    Args:\n        text: the text to retrieve similar documents\n        top_k: number of top similar documents to return\n\n    Returns:\n        list[RetrievedDocument]: list of retrieved documents\n    \"\"\"\n    if top_k is None:\n        top_k = self.top_k\n\n    do_extend = kwargs.pop(\"do_extend\", False)\n    thumbnail_count = kwargs.pop(\"thumbnail_count\", 3)\n\n    if do_extend:\n        top_k_first_round = top_k * self.first_round_top_k_mult\n    else:\n        top_k_first_round = top_k\n\n    if self.doc_store is None:\n        raise ValueError(\n            \"doc_store is not provided. Please provide a doc_store to \"\n            \"retrieve the documents\"\n        )\n\n    result: list[RetrievedDocument] = []\n    # TODO: should declare scope directly in the run params\n    scope = kwargs.pop(\"scope\", None)\n    emb: list[float]\n\n    if self.retrieval_mode == \"vector\":\n        emb = self.embedding(text)[0].embedding\n        _, scores, ids = self.vector_store.query(\n            embedding=emb, top_k=top_k_first_round, **kwargs\n        )\n        docs = self.doc_store.get(ids)\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(docs, scores)\n        ]\n    elif self.retrieval_mode == \"text\":\n        query = text.text if isinstance(text, Document) else text\n        docs = self.doc_store.query(query, top_k=top_k_first_round, doc_ids=scope)\n        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]\n    elif self.retrieval_mode == \"hybrid\":\n        # similarity search section\n        emb = self.embedding(text)[0].embedding\n        vs_docs: list[RetrievedDocument] = []\n        vs_ids: list[str] = []\n        vs_scores: list[float] = []\n\n        def query_vectorstore():\n            nonlocal vs_docs\n            nonlocal vs_scores\n            nonlocal vs_ids\n\n            assert self.doc_store is not None\n            _, vs_scores, vs_ids = self.vector_store.query(\n                embedding=emb, top_k=top_k_first_round, **kwargs\n            )\n            if vs_ids:\n                vs_docs = self.doc_store.get(vs_ids)\n\n        # full-text search section\n        ds_docs: list[RetrievedDocument] = []\n\n        def query_docstore():\n            nonlocal ds_docs\n\n            assert self.doc_store is not None\n            query = text.text if isinstance(text, Document) else text\n            ds_docs = self.doc_store.query(\n                query, top_k=top_k_first_round, doc_ids=scope\n            )\n\n        vs_query_thread = threading.Thread(target=query_vectorstore)\n        ds_query_thread = threading.Thread(target=query_docstore)\n\n        vs_query_thread.start()\n        ds_query_thread.start()\n\n        vs_query_thread.join()\n        ds_query_thread.join()\n\n        result = [\n            RetrievedDocument(**doc.to_dict(), score=-1.0)\n            for doc in ds_docs\n            if doc not in vs_ids\n        ]\n        result += [\n            RetrievedDocument(**doc.to_dict(), score=score)\n            for doc, score in zip(vs_docs, vs_scores)\n        ]\n        print(f\"Got {len(vs_docs)} from vectorstore\")\n        print(f\"Got {len(ds_docs)} from docstore\")\n\n    # use additional reranker to re-order the document list\n    if self.rerankers and text:\n        for reranker 
in self.rerankers:\n            # if reranker is LLMReranking, limit the document with top_k items only\n            if isinstance(reranker, LLMReranking):\n                result = self._filter_docs(result, top_k=top_k)\n            result = reranker(documents=result, query=text)\n\n    result = self._filter_docs(result, top_k=top_k)\n    print(f\"Got raw {len(result)} retrieved documents\")\n\n    # add page thumbnails to the result if exists\n    thumbnail_doc_ids: set[str] = set()\n    # we should copy the text from retrieved text chunk\n    # to the thumbnail to get relevant LLM score correctly\n    text_thumbnail_docs: dict[str, RetrievedDocument] = {}\n\n    non_thumbnail_docs = []\n    raw_thumbnail_docs = []\n    for doc in result:\n        if doc.metadata.get(\"type\") == \"thumbnail\":\n            # change type to image to display on UI\n            doc.metadata[\"type\"] = \"image\"\n            raw_thumbnail_docs.append(doc)\n            continue\n        if (\n            \"thumbnail_doc_id\" in doc.metadata\n            and len(thumbnail_doc_ids) < thumbnail_count\n        ):\n            thumbnail_id = doc.metadata[\"thumbnail_doc_id\"]\n            thumbnail_doc_ids.add(thumbnail_id)\n            text_thumbnail_docs[thumbnail_id] = doc\n        else:\n            non_thumbnail_docs.append(doc)\n\n    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))\n    print(\n        \"thumbnail docs\",\n        len(linked_thumbnail_docs),\n        \"non-thumbnail docs\",\n        len(non_thumbnail_docs),\n        \"raw-thumbnail docs\",\n        len(raw_thumbnail_docs),\n    )\n    additional_docs = []\n\n    for thumbnail_doc in linked_thumbnail_docs:\n        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]\n        doc_dict = thumbnail_doc.to_dict()\n        doc_dict[\"_id\"] = text_doc.doc_id\n        doc_dict[\"content\"] = text_doc.content\n        doc_dict[\"metadata\"][\"type\"] = \"image\"\n        for key in text_doc.metadata:\n            if key not in doc_dict[\"metadata\"]:\n                doc_dict[\"metadata\"][key] = text_doc.metadata[key]\n\n        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))\n\n    result = additional_docs + non_thumbnail_docs\n\n    if not result:\n        # return output from raw retrieved thumbnails\n        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)\n\n    return result\n
    "},{"location":"reference/indices/extractors/","title":"Extractors","text":""},{"location":"reference/indices/extractors/doc_parsers/","title":"Doc Parsers","text":""},{"location":"reference/indices/ingests/","title":"Ingests","text":""},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor","title":"DocumentIngestor","text":"

    Bases: BaseComponent

    Ingest common office document types into Document for indexing

Document types:

- pdf
- xlsx, xls
- docx, doc

Parameters:

- pdf_mode (required): mode for pdf extraction, one of "normal", "mathpix", "ocr"
    - normal: parse pdf text
    - mathpix: parse pdf text using mathpix
    - ocr: parse pdf image using flax
- doc_parsers (required): list of document parsers to parse the document
- text_splitter (required): splitter to split the document into text nodes
- override_file_extractors (required): override file extractors for specific file extensions. The default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS

Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    class DocumentIngestor(BaseComponent):\n    \"\"\"Ingest common office document types into Document for indexing\n\n    Document types:\n        - pdf\n        - xlsx, xls\n        - docx, doc\n\n    Args:\n        pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n            - normal: parse pdf text\n            - mathpix: parse pdf text using mathpix\n            - ocr: parse pdf image using flax\n        doc_parsers: list of document parsers to parse the document\n        text_splitter: splitter to split the document into text nodes\n        override_file_extractors: override file extractors for specific file extensions\n            The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n    \"\"\"\n\n    pdf_mode: str = \"normal\"  # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n    text_splitter: BaseSplitter = TokenSplitter.withx(\n        chunk_size=1024,\n        chunk_overlap=256,\n        separator=\"\\n\\n\",\n        backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n    )\n    override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n    def _get_reader(self, input_files: list[str | Path]):\n        \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n        file_extractors: dict[str, BaseReader] = {\n            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n        }\n        for ext, cls in self.override_file_extractors.items():\n            file_extractors[ext] = cls()\n\n        if self.pdf_mode == \"normal\":\n            file_extractors[\".pdf\"] = PDFReader()\n        elif self.pdf_mode == \"ocr\":\n            file_extractors[\".pdf\"] = OCRReader()\n        elif self.pdf_mode == \"multimodal\":\n            file_extractors[\".pdf\"] = AdobeReader()\n        else:\n            file_extractors[\".pdf\"] = MathpixPDFReader()\n\n        main_reader = DirectoryReader(\n            input_files=input_files,\n            file_extractor=file_extractors,\n        )\n\n        return main_reader\n\n    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n        \"\"\"Ingest the file paths into Document\n\n        Args:\n            file_paths: list of file paths or a single file path\n\n        Returns:\n            list of parsed Documents\n        \"\"\"\n        if not isinstance(file_paths, list):\n            file_paths = [file_paths]\n\n        documents = self._get_reader(input_files=file_paths)()\n        print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n        nodes = self.text_splitter(documents)\n        print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n        self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n        # document parsers call\n        if self.doc_parsers:\n            for parser in self.doc_parsers:\n                nodes = parser(nodes)\n\n        return nodes\n
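A usage sketch (the file path is illustrative; pdf_mode and the import path follow the listing above):

    from kotaemon.indices.ingests import DocumentIngestor

    ingestor = DocumentIngestor(pdf_mode="normal")
    nodes = ingestor(["./data/annual_report.pdf"])  # illustrative path
    print(f"{len(nodes)} text nodes ready for indexing")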
    "},{"location":"reference/indices/ingests/#indices.ingests.DocumentIngestor.run","title":"run","text":"
    run(file_paths)\n

    Ingest the file paths into Document

Parameters:

- file_paths (list[str | Path] | str | Path, required): list of file paths or a single file path

Returns:

- list[Document]: list of parsed Documents

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n    \"\"\"Ingest the file paths into Document\n\n    Args:\n        file_paths: list of file paths or a single file path\n\n    Returns:\n        list of parsed Documents\n    \"\"\"\n    if not isinstance(file_paths, list):\n        file_paths = [file_paths]\n\n    documents = self._get_reader(input_files=file_paths)()\n    print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n    nodes = self.text_splitter(documents)\n    print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n    self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n    # document parsers call\n    if self.doc_parsers:\n        for parser in self.doc_parsers:\n            nodes = parser(nodes)\n\n    return nodes\n
    "},{"location":"reference/indices/ingests/files/","title":"Files","text":""},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor","title":"DocumentIngestor","text":"

    Bases: BaseComponent

    Ingest common office document types into Document for indexing

Document types:

- pdf
- xlsx, xls
- docx, doc

Parameters:

- pdf_mode (required): mode for pdf extraction, one of "normal", "mathpix", "ocr"
    - normal: parse pdf text
    - mathpix: parse pdf text using mathpix
    - ocr: parse pdf image using flax
- doc_parsers (required): list of document parsers to parse the document
- text_splitter (required): splitter to split the document into text nodes
- override_file_extractors (required): override file extractors for specific file extensions. The default file extractors are stored in KH_DEFAULT_FILE_EXTRACTORS

Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    class DocumentIngestor(BaseComponent):\n    \"\"\"Ingest common office document types into Document for indexing\n\n    Document types:\n        - pdf\n        - xlsx, xls\n        - docx, doc\n\n    Args:\n        pdf_mode: mode for pdf extraction, one of \"normal\", \"mathpix\", \"ocr\"\n            - normal: parse pdf text\n            - mathpix: parse pdf text using mathpix\n            - ocr: parse pdf image using flax\n        doc_parsers: list of document parsers to parse the document\n        text_splitter: splitter to split the document into text nodes\n        override_file_extractors: override file extractors for specific file extensions\n            The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`\n    \"\"\"\n\n    pdf_mode: str = \"normal\"  # \"normal\", \"mathpix\", \"ocr\", \"multimodal\"\n    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])\n    text_splitter: BaseSplitter = TokenSplitter.withx(\n        chunk_size=1024,\n        chunk_overlap=256,\n        separator=\"\\n\\n\",\n        backup_separators=[\"\\n\", \".\", \" \", \"\\u200B\"],\n    )\n    override_file_extractors: dict[str, Type[BaseReader]] = {}\n\n    def _get_reader(self, input_files: list[str | Path]):\n        \"\"\"Get appropriate readers for the input files based on file extension\"\"\"\n        file_extractors: dict[str, BaseReader] = {\n            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()\n        }\n        for ext, cls in self.override_file_extractors.items():\n            file_extractors[ext] = cls()\n\n        if self.pdf_mode == \"normal\":\n            file_extractors[\".pdf\"] = PDFReader()\n        elif self.pdf_mode == \"ocr\":\n            file_extractors[\".pdf\"] = OCRReader()\n        elif self.pdf_mode == \"multimodal\":\n            file_extractors[\".pdf\"] = AdobeReader()\n        else:\n            file_extractors[\".pdf\"] = MathpixPDFReader()\n\n        main_reader = DirectoryReader(\n            input_files=input_files,\n            file_extractor=file_extractors,\n        )\n\n        return main_reader\n\n    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n        \"\"\"Ingest the file paths into Document\n\n        Args:\n            file_paths: list of file paths or a single file path\n\n        Returns:\n            list of parsed Documents\n        \"\"\"\n        if not isinstance(file_paths, list):\n            file_paths = [file_paths]\n\n        documents = self._get_reader(input_files=file_paths)()\n        print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n        nodes = self.text_splitter(documents)\n        print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n        self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n        # document parsers call\n        if self.doc_parsers:\n            for parser in self.doc_parsers:\n                nodes = parser(nodes)\n\n        return nodes\n
    "},{"location":"reference/indices/ingests/files/#indices.ingests.files.DocumentIngestor.run","title":"run","text":"
    run(file_paths)\n

    Ingest the file paths into Document

Parameters:

- file_paths (list[str | Path] | str | Path, required): list of file paths or a single file path

Returns:

- list[Document]: list of parsed Documents

    Source code in libs/kotaemon/kotaemon/indices/ingests/files.py
    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:\n    \"\"\"Ingest the file paths into Document\n\n    Args:\n        file_paths: list of file paths or a single file path\n\n    Returns:\n        list of parsed Documents\n    \"\"\"\n    if not isinstance(file_paths, list):\n        file_paths = [file_paths]\n\n    documents = self._get_reader(input_files=file_paths)()\n    print(f\"Read {len(file_paths)} files into {len(documents)} documents.\")\n    nodes = self.text_splitter(documents)\n    print(f\"Transform {len(documents)} documents into {len(nodes)} nodes.\")\n    self.log_progress(\".num_docs\", num_docs=len(nodes))\n\n    # document parsers call\n    if self.doc_parsers:\n        for parser in self.doc_parsers:\n            nodes = parser(nodes)\n\n    return nodes\n
    "},{"location":"reference/indices/qa/","title":"Qa","text":""},{"location":"reference/indices/qa/#indices.qa.CitationPipeline","title":"CitationPipeline","text":"

    Bases: BaseComponent

    Citation pipeline to extract cited evidences from source (based on input question)

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CitationPipeline(BaseComponent):\n    \"\"\"Citation pipeline to extract cited evidences from source\n    (based on input question)\"\"\"\n\n    llm: BaseLLM\n\n    def run(self, context: str, question: str):\n        return self.invoke(context, question)\n\n    def prepare_llm(self, context: str, question: str):\n        schema = CiteEvidence.schema()\n        function = {\n            \"name\": schema[\"title\"],\n            \"description\": schema[\"description\"],\n            \"parameters\": schema,\n        }\n        llm_kwargs = {\n            \"tools\": [{\"type\": \"function\", \"function\": function}],\n            \"tool_choice\": \"required\",\n            \"tools_pydantic\": [CiteEvidence],\n        }\n        messages = [\n            SystemMessage(\n                content=(\n                    \"You are a world class algorithm to answer \"\n                    \"questions with correct and exact citations.\"\n                )\n            ),\n            HumanMessage(\n                content=(\n                    \"Answer question using the following context. \"\n                    \"Use the provided function CiteEvidence() to cite your sources.\"\n                )\n            ),\n            HumanMessage(content=context),\n            HumanMessage(content=f\"Question: {question}\"),\n            HumanMessage(\n                content=(\n                    \"Tips: Make sure to cite your sources, \"\n                    \"and use the exact words from the context.\"\n                )\n            ),\n        ]\n        return messages, llm_kwargs\n\n    def invoke(self, context: str, question: str):\n        messages, llm_kwargs = self.prepare_llm(context, question)\n        try:\n            print(\"CitationPipeline: invoking LLM\")\n            llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n            print(\"CitationPipeline: finish invoking LLM\")\n            if not llm_output.additional_kwargs.get(\"tool_calls\"):\n                return None\n\n            first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n            if \"function\" in first_func:\n                # openai and cohere format\n                function_output = first_func[\"function\"][\"arguments\"]\n            else:\n                # anthropic format\n                function_output = first_func[\"args\"]\n\n            print(\"CitationPipeline:\", function_output)\n\n            if isinstance(function_output, str):\n                output = CiteEvidence.parse_raw(function_output)\n            else:\n                output = CiteEvidence.parse_obj(function_output)\n        except Exception as e:\n            print(e)\n            return None\n\n        return output\n\n    async def ainvoke(self, context: str, question: str):\n        raise NotImplementedError()\n
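A minimal invocation sketch, assuming kotaemon's ChatOpenAI wrapper and a tool-calling-capable model (the wrapper and model names are assumptions; any BaseLLM that supports tool calls should do):

    from kotaemon.indices.qa.citation import CitationPipeline
    from kotaemon.llms import ChatOpenAI  # assumed wrapper

    pipeline = CitationPipeline(llm=ChatOpenAI(model="gpt-4o-mini"))
    evidence = pipeline(
        context="The warranty covers parts and labour for 24 months.",
        question="How long is the warranty?",
    )
    if evidence is not None:  # invoke() returns None when no tool call is produced
        print(evidence.evidences)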
    "},{"location":"reference/indices/qa/#indices.qa.CitationQAPipeline","title":"CitationQAPipeline","text":"

    Bases: BaseComponent

    Answering question from a text corpus with citation

    Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
    class CitationQAPipeline(BaseComponent):\n    \"\"\"Answering question from a text corpus with citation\"\"\"\n\n    qa_prompt_template: PromptTemplate = PromptTemplate(\n        'Answer the following question: \"{question}\". '\n        \"The context is: \\n{context}\\nAnswer: \"\n    )\n    llm: BaseLLM = LCAzureChatOpenAI.withx(\n        azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n        openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n        openai_api_version=\"2023-07-01-preview\",\n        deployment_name=\"dummy-q2-16k\",\n        temperature=0,\n        request_timeout=60,\n    )\n    citation_pipeline: CitationPipeline = Node(\n        default_callback=lambda self: CitationPipeline(llm=self.llm)\n    )\n\n    def _format_doc_text(self, text: str) -> str:\n        \"\"\"Format the text of each document\"\"\"\n        return text.replace(\"\\n\", \" \")\n\n    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n        \"\"\"Format the texts between all documents\"\"\"\n        matched_texts: list[str] = [\n            self._format_doc_text(doc.text) for doc in documents\n        ]\n        return \"\\n\\n\".join(matched_texts)\n\n    def run(\n        self,\n        question: str,\n        documents: list[RetrievedDocument],\n        use_citation: bool = False,\n        **kwargs\n    ) -> Document:\n        # retrieve relevant documents as context\n        context = self._format_retrieved_context(documents)\n        self.log_progress(\".context\", context=context)\n\n        # generate the answer\n        prompt = self.qa_prompt_template.populate(\n            context=context,\n            question=question,\n        )\n        self.log_progress(\".prompt\", prompt=prompt)\n        answer_text = self.llm(prompt).text\n        if use_citation:\n            citation = self.citation_pipeline(context=context, question=question)\n        else:\n            citation = None\n\n        answer = Document(text=answer_text, metadata={\"citation\": citation})\n        return answer\n
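Note that the default llm above points at a placeholder Azure endpoint, so pass your own model. A usage sketch, reusing the ChatOpenAI wrapper and the retrieval pipeline assumed in the earlier sketches:

    qa = CitationQAPipeline(llm=ChatOpenAI(model="gpt-4o-mini"))  # override the placeholder default
    answer = qa(
        question="What is the refund policy?",
        documents=retrieval("What is the refund policy?"),  # list[RetrievedDocument]
        use_citation=True,
    )
    print(answer.text)
    print(answer.metadata["citation"])  # CiteEvidence or None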
    "},{"location":"reference/indices/qa/citation/","title":"Citation","text":""},{"location":"reference/indices/qa/citation/#indices.qa.citation.CiteEvidence","title":"CiteEvidence","text":"

    Bases: BaseModel

    List of evidences (maximum 5) to support the answer.

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CiteEvidence(BaseModel):\n    \"\"\"List of evidences (maximum 5) to support the answer.\"\"\"\n\n    evidences: List[str] = Field(\n        ...,\n        description=(\n            \"Each source should be a direct quote from the context, \"\n            \"as a substring of the original content (max 15 words).\"\n        ),\n    )\n
    "},{"location":"reference/indices/qa/citation/#indices.qa.citation.CitationPipeline","title":"CitationPipeline","text":"

    Bases: BaseComponent

    Citation pipeline to extract cited evidences from source (based on input question)

    Source code in libs/kotaemon/kotaemon/indices/qa/citation.py
    class CitationPipeline(BaseComponent):\n    \"\"\"Citation pipeline to extract cited evidences from source\n    (based on input question)\"\"\"\n\n    llm: BaseLLM\n\n    def run(self, context: str, question: str):\n        return self.invoke(context, question)\n\n    def prepare_llm(self, context: str, question: str):\n        schema = CiteEvidence.schema()\n        function = {\n            \"name\": schema[\"title\"],\n            \"description\": schema[\"description\"],\n            \"parameters\": schema,\n        }\n        llm_kwargs = {\n            \"tools\": [{\"type\": \"function\", \"function\": function}],\n            \"tool_choice\": \"required\",\n            \"tools_pydantic\": [CiteEvidence],\n        }\n        messages = [\n            SystemMessage(\n                content=(\n                    \"You are a world class algorithm to answer \"\n                    \"questions with correct and exact citations.\"\n                )\n            ),\n            HumanMessage(\n                content=(\n                    \"Answer question using the following context. \"\n                    \"Use the provided function CiteEvidence() to cite your sources.\"\n                )\n            ),\n            HumanMessage(content=context),\n            HumanMessage(content=f\"Question: {question}\"),\n            HumanMessage(\n                content=(\n                    \"Tips: Make sure to cite your sources, \"\n                    \"and use the exact words from the context.\"\n                )\n            ),\n        ]\n        return messages, llm_kwargs\n\n    def invoke(self, context: str, question: str):\n        messages, llm_kwargs = self.prepare_llm(context, question)\n        try:\n            print(\"CitationPipeline: invoking LLM\")\n            llm_output = self.get_from_path(\"llm\").invoke(messages, **llm_kwargs)\n            print(\"CitationPipeline: finish invoking LLM\")\n            if not llm_output.additional_kwargs.get(\"tool_calls\"):\n                return None\n\n            first_func = llm_output.additional_kwargs[\"tool_calls\"][0]\n\n            if \"function\" in first_func:\n                # openai and cohere format\n                function_output = first_func[\"function\"][\"arguments\"]\n            else:\n                # anthropic format\n                function_output = first_func[\"args\"]\n\n            print(\"CitationPipeline:\", function_output)\n\n            if isinstance(function_output, str):\n                output = CiteEvidence.parse_raw(function_output)\n            else:\n                output = CiteEvidence.parse_obj(function_output)\n        except Exception as e:\n            print(e)\n            return None\n\n        return output\n\n    async def ainvoke(self, context: str, question: str):\n        raise NotImplementedError()\n
    "},{"location":"reference/indices/qa/text_based/","title":"Text Based","text":""},{"location":"reference/indices/qa/text_based/#indices.qa.text_based.CitationQAPipeline","title":"CitationQAPipeline","text":"

    Bases: BaseComponent

    Answering question from a text corpus with citation

    Source code in libs/kotaemon/kotaemon/indices/qa/text_based.py
    class CitationQAPipeline(BaseComponent):\n    \"\"\"Answering question from a text corpus with citation\"\"\"\n\n    qa_prompt_template: PromptTemplate = PromptTemplate(\n        'Answer the following question: \"{question}\". '\n        \"The context is: \\n{context}\\nAnswer: \"\n    )\n    llm: BaseLLM = LCAzureChatOpenAI.withx(\n        azure_endpoint=\"https://bleh-dummy.openai.azure.com/\",\n        openai_api_key=os.environ.get(\"OPENAI_API_KEY\", \"\"),\n        openai_api_version=\"2023-07-01-preview\",\n        deployment_name=\"dummy-q2-16k\",\n        temperature=0,\n        request_timeout=60,\n    )\n    citation_pipeline: CitationPipeline = Node(\n        default_callback=lambda self: CitationPipeline(llm=self.llm)\n    )\n\n    def _format_doc_text(self, text: str) -> str:\n        \"\"\"Format the text of each document\"\"\"\n        return text.replace(\"\\n\", \" \")\n\n    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:\n        \"\"\"Format the texts between all documents\"\"\"\n        matched_texts: list[str] = [\n            self._format_doc_text(doc.text) for doc in documents\n        ]\n        return \"\\n\\n\".join(matched_texts)\n\n    def run(\n        self,\n        question: str,\n        documents: list[RetrievedDocument],\n        use_citation: bool = False,\n        **kwargs\n    ) -> Document:\n        # retrieve relevant documents as context\n        context = self._format_retrieved_context(documents)\n        self.log_progress(\".context\", context=context)\n\n        # generate the answer\n        prompt = self.qa_prompt_template.populate(\n            context=context,\n            question=question,\n        )\n        self.log_progress(\".prompt\", prompt=prompt)\n        answer_text = self.llm(prompt).text\n        if use_citation:\n            citation = self.citation_pipeline(context=context, question=question)\n        else:\n            citation = None\n\n        answer = Document(text=answer_text, metadata={\"citation\": citation})\n        return answer\n
    "},{"location":"reference/indices/rankings/","title":"Rankings","text":""},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking","title":"BaseReranking","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    class BaseReranking(BaseComponent):\n    @abstractmethod\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Main method to transform list of documents\n        (re-ranking, filtering, etc)\"\"\"\n        ...\n
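A sketch of a custom reranker against this interface; the keyword-overlap scoring is purely illustrative:

    from kotaemon.base import Document
    from kotaemon.indices.rankings import BaseReranking

    class KeywordOverlapReranking(BaseReranking):
        """Order documents by naive term overlap with the query."""

        def run(self, documents: list[Document], query: str) -> list[Document]:
            terms = set(query.lower().split())

            def overlap(doc: Document) -> int:
                # count query terms that appear in the document content
                return len(terms & set(doc.get_content().lower().split()))

            return sorted(documents, key=overlap, reverse=True)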
    "},{"location":"reference/indices/rankings/#indices.rankings.BaseReranking.run","title":"run abstractmethod","text":"
    run(documents, query)\n

    Main method to transform list of documents (re-ranking, filtering, etc)

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    @abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Main method to transform list of documents\n    (re-ranking, filtering, etc)\"\"\"\n    ...\n
    "},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking","title":"CohereReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    class CohereReranking(BaseReranking):\n    model_name: str = \"rerank-multilingual-v2.0\"\n    cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n    use_key_from_ktem: bool = False\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Use Cohere Reranker model to re-order documents\n        with their relevance score\"\"\"\n        try:\n            import cohere\n        except ImportError:\n            raise ImportError(\n                \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n            )\n\n        # try to get COHERE_API_KEY from embeddings\n        if not self.cohere_api_key and self.use_key_from_ktem:\n            try:\n                from ktem.embeddings.manager import (\n                    embedding_models_manager as embeddings,\n                )\n\n                cohere_model = embeddings.get(\"cohere\")\n                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                    \"cohere_api_key\"\n                )\n                if ktem_cohere_api_key != \"your-key\":\n                    self.cohere_api_key = ktem_cohere_api_key\n            except Exception as e:\n                print(\"Cannot get Cohere API key from `ktem`\", e)\n\n        if not self.cohere_api_key:\n            print(\"Cohere API key not found. Skipping reranking.\")\n            return documents\n\n        cohere_client = cohere.Client(self.cohere_api_key)\n        compressed_docs: list[Document] = []\n\n        if not documents:  # to avoid empty api call\n            return compressed_docs\n\n        _docs = [d.content for d in documents]\n        response = cohere_client.rerank(\n            model=self.model_name, query=query, documents=_docs\n        )\n        # print(\"Cohere score\", [r.relevance_score for r in response.results])\n        for r in response.results:\n            doc = documents[r.index]\n            doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n            compressed_docs.append(doc)\n\n        return compressed_docs\n
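A usage sketch; it assumes pip install cohere and that COHERE_API_KEY is set in the environment (picked up by the config default above). The sample documents are illustrative:

    from kotaemon.base import Document

    docs = [
        Document(text="Refunds are issued within 14 days."),
        Document(text="Shipping takes 3-5 business days."),
    ]
    reranker = CohereReranking()  # model_name defaults to rerank-multilingual-v2.0
    ranked = reranker(documents=docs, query="refund policy")
    print(ranked[0].metadata.get("cohere_reranking_score"))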
    "},{"location":"reference/indices/rankings/#indices.rankings.CohereReranking.run","title":"run","text":"
    run(documents, query)\n

    Use Cohere Reranker model to re-order documents with their relevance score

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    def run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Use Cohere Reranker model to re-order documents\n    with their relevance score\"\"\"\n    try:\n        import cohere\n    except ImportError:\n        raise ImportError(\n            \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n        )\n\n    # try to get COHERE_API_KEY from embeddings\n    if not self.cohere_api_key and self.use_key_from_ktem:\n        try:\n            from ktem.embeddings.manager import (\n                embedding_models_manager as embeddings,\n            )\n\n            cohere_model = embeddings.get(\"cohere\")\n            ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                \"cohere_api_key\"\n            )\n            if ktem_cohere_api_key != \"your-key\":\n                self.cohere_api_key = ktem_cohere_api_key\n        except Exception as e:\n            print(\"Cannot get Cohere API key from `ktem`\", e)\n\n    if not self.cohere_api_key:\n        print(\"Cohere API key not found. Skipping reranking.\")\n        return documents\n\n    cohere_client = cohere.Client(self.cohere_api_key)\n    compressed_docs: list[Document] = []\n\n    if not documents:  # to avoid empty api call\n        return compressed_docs\n\n    _docs = [d.content for d in documents]\n    response = cohere_client.rerank(\n        model=self.model_name, query=query, documents=_docs\n    )\n    # print(\"Cohere score\", [r.relevance_score for r in response.results])\n    for r in response.results:\n        doc = documents[r.index]\n        doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n        compressed_docs.append(doc)\n\n    return compressed_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking","title":"LLMReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    class LLMReranking(BaseReranking):\n    llm: BaseLLM\n    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n    top_k: int = 3\n    concurrent: bool = True\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [output_parser.parse(result) for result in results]\n        for include_doc, doc in zip(results, documents):\n            if include_doc:\n                filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
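A usage sketch, reusing the ChatOpenAI wrapper and docs list assumed in the earlier sketches; with concurrent=True each document is judged in a thread pool, and the first top_k documents are returned if the LLM filters everything out:

    reranker = LLMReranking(llm=ChatOpenAI(model="gpt-4o-mini"), top_k=3, concurrent=True)
    kept = reranker(documents=docs, query="refund policy")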
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMReranking.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [output_parser.parse(result) for result in results]\n    for include_doc, doc in zip(results, documents):\n        if include_doc:\n            filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
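    To make the control flow concrete, here is a hedged wiring sketch (assuming `ChatOpenAI` from `kotaemon.llms`, documented later on this page, with placeholder model and key values):

    # Hypothetical sketch: binary relevance filtering with an LLM judge.
    from kotaemon.base import Document
    from kotaemon.indices.rankings import LLMReranking
    from kotaemon.llms import ChatOpenAI

    reranker = LLMReranking(
        llm=ChatOpenAI(model="gpt-4o-mini", api_key="your-openai-api-key"),
        top_k=3,          # fallback size if every document is judged irrelevant
        concurrent=True,  # judge documents in parallel threads
    )
    docs = [
        Document(content="Cats sleep for most of the day."),
        Document(content="GPUs excel at matrix multiplication."),
    ]
    kept = reranker.run(docs, query="feline behaviour")
    print(len(kept), "documents kept")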
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring","title":"LLMScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    class LLMScoring(LLMReranking):\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs: list[Document] = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt))\n\n        for result, doc in zip(results, documents):\n            score = np.exp(np.average(result.logprobs))\n            include_doc = output_parser.parse(result.text)\n            if include_doc:\n                doc.metadata[\"llm_reranking_score\"] = score\n            else:\n                doc.metadata[\"llm_reranking_score\"] = 1 - score\n            filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs: list[Document] = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt))\n\n    for result, doc in zip(results, documents):\n        score = np.exp(np.average(result.logprobs))\n        include_doc = output_parser.parse(result.text)\n        if include_doc:\n            doc.metadata[\"llm_reranking_score\"] = score\n        else:\n            doc.metadata[\"llm_reranking_score\"] = 1 - score\n        filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
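    The score is worth unpacking: `np.exp(np.average(result.logprobs))` is the geometric mean of the per-token probabilities, so it acts as the model's confidence in its yes/no verdict. A self-contained sketch of the arithmetic:

    import numpy as np

    # exp of the mean log-probability == geometric mean of token probabilities
    logprobs = [-0.1, -0.3, -0.2]                # example per-token log-probs
    score = float(np.exp(np.average(logprobs)))  # exp(-0.2) ~= 0.819
    # a "yes" document stores `score`, a "no" document stores `1 - score`,
    # so a higher metadata["llm_reranking_score"] always means more relevant
    print(round(score, 3), round(1 - score, 3))  # 0.819 0.181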
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring","title":"LLMTrulensScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    class LLMTrulensScoring(LLMReranking):\n    llm: BaseLLM\n    system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n    user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n    concurrent: bool = True\n    normalize: float = 10\n    trim_func: TokenSplitter = TokenSplitter.withx(\n        chunk_size=MAX_CONTEXT_LEN,\n        chunk_overlap=0,\n        separator=\" \",\n        tokenizer=partial(\n            tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n            allowed_special=set(),\n            disallowed_special=\"all\",\n        ),\n    )\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n\n        documents = sorted(documents, key=lambda doc: doc.get_content())\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    chunked_doc_content = self.trim_func(\n                        [\n                            Document(content=doc.get_content())\n                            # skip metadata which cause troubles\n                        ]\n                    )[0].text\n\n                    messages = []\n                    messages.append(\n                        SystemMessage(self.system_prompt_template.populate())\n                    )\n                    messages.append(\n                        HumanMessage(\n                            self.user_prompt_template.populate(\n                                question=query, context=chunked_doc_content\n                            )\n                        )\n                    )\n\n                    def llm_call():\n                        return self.llm(messages).text\n\n                    futures.append(executor.submit(llm_call))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                messages = []\n                messages.append(SystemMessage(self.system_prompt_template.populate()))\n                messages.append(\n                    SystemMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=doc.get_content()\n                        )\n                    )\n                )\n                results.append(self.llm(messages).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [\n            (r_idx, float(re_0_10_rating(result)) / self.normalize)\n            for r_idx, result in enumerate(results)\n        ]\n        results.sort(key=lambda x: x[1], reverse=True)\n\n        for r_idx, score in results:\n            doc = documents[r_idx]\n            doc.metadata[\"llm_trulens_score\"] = score\n            filtered_docs.append(doc)\n\n        print(\n            \"LLM rerank scores\",\n            [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n        )\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/#indices.rankings.LLMTrulensScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n\n    documents = sorted(documents, key=lambda doc: doc.get_content())\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                chunked_doc_content = self.trim_func(\n                    [\n                        Document(content=doc.get_content())\n                        # skip metadata which cause troubles\n                    ]\n                )[0].text\n\n                messages = []\n                messages.append(\n                    SystemMessage(self.system_prompt_template.populate())\n                )\n                messages.append(\n                    HumanMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=chunked_doc_content\n                        )\n                    )\n                )\n\n                def llm_call():\n                    return self.llm(messages).text\n\n                futures.append(executor.submit(llm_call))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            messages = []\n            messages.append(SystemMessage(self.system_prompt_template.populate()))\n            messages.append(\n                SystemMessage(\n                    self.user_prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                )\n            )\n            results.append(self.llm(messages).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [\n        (r_idx, float(re_0_10_rating(result)) / self.normalize)\n        for r_idx, result in enumerate(results)\n    ]\n    results.sort(key=lambda x: x[1], reverse=True)\n\n    for r_idx, score in results:\n        doc = documents[r_idx]\n        doc.metadata[\"llm_trulens_score\"] = score\n        filtered_docs.append(doc)\n\n    print(\n        \"LLM rerank scores\",\n        [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n    )\n\n    return filtered_docs\n
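    A hedged usage sketch (again assuming `ChatOpenAI` and placeholder credentials); note from the listing that documents are sorted by content before scoring and come back ordered by score, highest first:

    # Hypothetical sketch: TruLens-style 0-10 relevance scoring.
    from kotaemon.base import Document
    from kotaemon.indices.rankings import LLMTrulensScoring
    from kotaemon.llms import ChatOpenAI

    scorer = LLMTrulensScoring(
        llm=ChatOpenAI(model="gpt-4o-mini", api_key="your-openai-api-key"),
        normalize=10,  # raw 0-10 rating is divided by this to land in [0, 1]
    )
    docs = [
        Document(content="The mitochondria is the powerhouse of the cell."),
        Document(content="Stock markets closed higher on Friday."),
    ]
    scored = scorer.run(docs, query="What do mitochondria do?")
    for doc in scored:  # sorted by llm_trulens_score, descending
        print(doc.metadata["llm_trulens_score"], doc.content)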
    "},{"location":"reference/indices/rankings/base/","title":"Base","text":""},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking","title":"BaseReranking","text":"

    Bases: BaseComponent

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    class BaseReranking(BaseComponent):\n    @abstractmethod\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Main method to transform list of documents\n        (re-ranking, filtering, etc)\"\"\"\n        ...\n
    "},{"location":"reference/indices/rankings/base/#indices.rankings.base.BaseReranking.run","title":"run abstractmethod","text":"
    run(documents, query)\n

    Main method to transform a list of documents (re-ranking, filtering, etc.)

    Source code in libs/kotaemon/kotaemon/indices/rankings/base.py
    @abstractmethod\ndef run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Main method to transform list of documents\n    (re-ranking, filtering, etc)\"\"\"\n    ...\n
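    Since `BaseReranking` only requires `run`, a custom reranker is a small subclass. A minimal sketch (assuming the module path `kotaemon.indices.rankings.base` shown above):

    # Naive custom reranker: keep only documents that mention the query string.
    from kotaemon.base import Document
    from kotaemon.indices.rankings.base import BaseReranking

    class KeywordReranking(BaseReranking):
        def run(self, documents: list[Document], query: str) -> list[Document]:
            """Keep documents whose content contains the query (case-insensitive)."""
            return [d for d in documents if query.lower() in d.get_content().lower()]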
    "},{"location":"reference/indices/rankings/cohere/","title":"Cohere","text":""},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking","title":"CohereReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    class CohereReranking(BaseReranking):\n    model_name: str = \"rerank-multilingual-v2.0\"\n    cohere_api_key: str = config(\"COHERE_API_KEY\", \"\")\n    use_key_from_ktem: bool = False\n\n    def run(self, documents: list[Document], query: str) -> list[Document]:\n        \"\"\"Use Cohere Reranker model to re-order documents\n        with their relevance score\"\"\"\n        try:\n            import cohere\n        except ImportError:\n            raise ImportError(\n                \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n            )\n\n        # try to get COHERE_API_KEY from embeddings\n        if not self.cohere_api_key and self.use_key_from_ktem:\n            try:\n                from ktem.embeddings.manager import (\n                    embedding_models_manager as embeddings,\n                )\n\n                cohere_model = embeddings.get(\"cohere\")\n                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                    \"cohere_api_key\"\n                )\n                if ktem_cohere_api_key != \"your-key\":\n                    self.cohere_api_key = ktem_cohere_api_key\n            except Exception as e:\n                print(\"Cannot get Cohere API key from `ktem`\", e)\n\n        if not self.cohere_api_key:\n            print(\"Cohere API key not found. Skipping reranking.\")\n            return documents\n\n        cohere_client = cohere.Client(self.cohere_api_key)\n        compressed_docs: list[Document] = []\n\n        if not documents:  # to avoid empty api call\n            return compressed_docs\n\n        _docs = [d.content for d in documents]\n        response = cohere_client.rerank(\n            model=self.model_name, query=query, documents=_docs\n        )\n        # print(\"Cohere score\", [r.relevance_score for r in response.results])\n        for r in response.results:\n            doc = documents[r.index]\n            doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n            compressed_docs.append(doc)\n\n        return compressed_docs\n
    "},{"location":"reference/indices/rankings/cohere/#indices.rankings.cohere.CohereReranking.run","title":"run","text":"
    run(documents, query)\n

    Use the Cohere Reranker model to re-order documents by their relevance score

    Source code in libs/kotaemon/kotaemon/indices/rankings/cohere.py
    def run(self, documents: list[Document], query: str) -> list[Document]:\n    \"\"\"Use Cohere Reranker model to re-order documents\n    with their relevance score\"\"\"\n    try:\n        import cohere\n    except ImportError:\n        raise ImportError(\n            \"Please install Cohere `pip install cohere` to use Cohere Reranking\"\n        )\n\n    # try to get COHERE_API_KEY from embeddings\n    if not self.cohere_api_key and self.use_key_from_ktem:\n        try:\n            from ktem.embeddings.manager import (\n                embedding_models_manager as embeddings,\n            )\n\n            cohere_model = embeddings.get(\"cohere\")\n            ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore\n                \"cohere_api_key\"\n            )\n            if ktem_cohere_api_key != \"your-key\":\n                self.cohere_api_key = ktem_cohere_api_key\n        except Exception as e:\n            print(\"Cannot get Cohere API key from `ktem`\", e)\n\n    if not self.cohere_api_key:\n        print(\"Cohere API key not found. Skipping reranking.\")\n        return documents\n\n    cohere_client = cohere.Client(self.cohere_api_key)\n    compressed_docs: list[Document] = []\n\n    if not documents:  # to avoid empty api call\n        return compressed_docs\n\n    _docs = [d.content for d in documents]\n    response = cohere_client.rerank(\n        model=self.model_name, query=query, documents=_docs\n    )\n    # print(\"Cohere score\", [r.relevance_score for r in response.results])\n    for r in response.results:\n        doc = documents[r.index]\n        doc.metadata[\"cohere_reranking_score\"] = r.relevance_score\n        compressed_docs.append(doc)\n\n    return compressed_docs\n
    "},{"location":"reference/indices/rankings/llm/","title":"Llm","text":""},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking","title":"LLMReranking","text":"

    Bases: BaseReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    class LLMReranking(BaseReranking):\n    llm: BaseLLM\n    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)\n    top_k: int = 3\n    concurrent: bool = True\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [output_parser.parse(result) for result in results]\n        for include_doc, doc in zip(results, documents):\n            if include_doc:\n                filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm/#indices.rankings.llm.LLMReranking.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt).text))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [output_parser.parse(result) for result in results]\n    for include_doc, doc in zip(results, documents):\n        if include_doc:\n            filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_scoring/","title":"Llm Scoring","text":""},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring","title":"LLMScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    class LLMScoring(LLMReranking):\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs: list[Document] = []\n        output_parser = BooleanOutputParser()\n\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    _prompt = self.prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                    futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                results.append(self.llm(_prompt))\n\n        for result, doc in zip(results, documents):\n            score = np.exp(np.average(result.logprobs))\n            include_doc = output_parser.parse(result.text)\n            if include_doc:\n                doc.metadata[\"llm_reranking_score\"] = score\n            else:\n                doc.metadata[\"llm_reranking_score\"] = 1 - score\n            filtered_docs.append(doc)\n\n        # prevent returning empty result\n        if len(filtered_docs) == 0:\n            filtered_docs = documents[: self.top_k]\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_scoring/#indices.rankings.llm_scoring.LLMScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs: list[Document] = []\n    output_parser = BooleanOutputParser()\n\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                _prompt = self.prompt_template.populate(\n                    question=query, context=doc.get_content()\n                )\n                futures.append(executor.submit(lambda: self.llm(_prompt)))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            _prompt = self.prompt_template.populate(\n                question=query, context=doc.get_content()\n            )\n            results.append(self.llm(_prompt))\n\n    for result, doc in zip(results, documents):\n        score = np.exp(np.average(result.logprobs))\n        include_doc = output_parser.parse(result.text)\n        if include_doc:\n            doc.metadata[\"llm_reranking_score\"] = score\n        else:\n            doc.metadata[\"llm_reranking_score\"] = 1 - score\n        filtered_docs.append(doc)\n\n    # prevent returning empty result\n    if len(filtered_docs) == 0:\n        filtered_docs = documents[: self.top_k]\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_trulens/","title":"Llm Trulens","text":""},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.PATTERN_INTEGER","title":"PATTERN_INTEGER module-attribute","text":"
    PATTERN_INTEGER = compile('([+-]?[1-9][0-9]*|0)')\n

    Regex that matches integers.

    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring","title":"LLMTrulensScoring","text":"

    Bases: LLMReranking

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    class LLMTrulensScoring(LLMReranking):\n    llm: BaseLLM\n    system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE\n    user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE\n    concurrent: bool = True\n    normalize: float = 10\n    trim_func: TokenSplitter = TokenSplitter.withx(\n        chunk_size=MAX_CONTEXT_LEN,\n        chunk_overlap=0,\n        separator=\" \",\n        tokenizer=partial(\n            tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode,\n            allowed_special=set(),\n            disallowed_special=\"all\",\n        ),\n    )\n\n    def run(\n        self,\n        documents: list[Document],\n        query: str,\n    ) -> list[Document]:\n        \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n        filtered_docs = []\n\n        documents = sorted(documents, key=lambda doc: doc.get_content())\n        if self.concurrent:\n            with ThreadPoolExecutor() as executor:\n                futures = []\n                for doc in documents:\n                    chunked_doc_content = self.trim_func(\n                        [\n                            Document(content=doc.get_content())\n                            # skip metadata which cause troubles\n                        ]\n                    )[0].text\n\n                    messages = []\n                    messages.append(\n                        SystemMessage(self.system_prompt_template.populate())\n                    )\n                    messages.append(\n                        HumanMessage(\n                            self.user_prompt_template.populate(\n                                question=query, context=chunked_doc_content\n                            )\n                        )\n                    )\n\n                    def llm_call():\n                        return self.llm(messages).text\n\n                    futures.append(executor.submit(llm_call))\n\n                results = [future.result() for future in futures]\n        else:\n            results = []\n            for doc in documents:\n                messages = []\n                messages.append(SystemMessage(self.system_prompt_template.populate()))\n                messages.append(\n                    SystemMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=doc.get_content()\n                        )\n                    )\n                )\n                results.append(self.llm(messages).text)\n\n        # use Boolean parser to extract relevancy output from LLM\n        results = [\n            (r_idx, float(re_0_10_rating(result)) / self.normalize)\n            for r_idx, result in enumerate(results)\n        ]\n        results.sort(key=lambda x: x[1], reverse=True)\n\n        for r_idx, score in results:\n            doc = documents[r_idx]\n            doc.metadata[\"llm_trulens_score\"] = score\n            filtered_docs.append(doc)\n\n        print(\n            \"LLM rerank scores\",\n            [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n        )\n\n        return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.LLMTrulensScoring.run","title":"run","text":"
    run(documents, query)\n

    Filter down documents based on their relevance to the query.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def run(\n    self,\n    documents: list[Document],\n    query: str,\n) -> list[Document]:\n    \"\"\"Filter down documents based on their relevance to the query.\"\"\"\n    filtered_docs = []\n\n    documents = sorted(documents, key=lambda doc: doc.get_content())\n    if self.concurrent:\n        with ThreadPoolExecutor() as executor:\n            futures = []\n            for doc in documents:\n                chunked_doc_content = self.trim_func(\n                    [\n                        Document(content=doc.get_content())\n                        # skip metadata which cause troubles\n                    ]\n                )[0].text\n\n                messages = []\n                messages.append(\n                    SystemMessage(self.system_prompt_template.populate())\n                )\n                messages.append(\n                    HumanMessage(\n                        self.user_prompt_template.populate(\n                            question=query, context=chunked_doc_content\n                        )\n                    )\n                )\n\n                def llm_call():\n                    return self.llm(messages).text\n\n                futures.append(executor.submit(llm_call))\n\n            results = [future.result() for future in futures]\n    else:\n        results = []\n        for doc in documents:\n            messages = []\n            messages.append(SystemMessage(self.system_prompt_template.populate()))\n            messages.append(\n                SystemMessage(\n                    self.user_prompt_template.populate(\n                        question=query, context=doc.get_content()\n                    )\n                )\n            )\n            results.append(self.llm(messages).text)\n\n    # use Boolean parser to extract relevancy output from LLM\n    results = [\n        (r_idx, float(re_0_10_rating(result)) / self.normalize)\n        for r_idx, result in enumerate(results)\n    ]\n    results.sort(key=lambda x: x[1], reverse=True)\n\n    for r_idx, score in results:\n        doc = documents[r_idx]\n        doc.metadata[\"llm_trulens_score\"] = score\n        filtered_docs.append(doc)\n\n    print(\n        \"LLM rerank scores\",\n        [doc.metadata[\"llm_trulens_score\"] for doc in filtered_docs],\n    )\n\n    return filtered_docs\n
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.validate_rating","title":"validate_rating","text":"
    validate_rating(rating)\n

    Validate that a rating is between 0 and 10.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def validate_rating(rating) -> int:\n    \"\"\"Validate a rating is between 0 and 10.\"\"\"\n\n    if not 0 <= rating <= 10:\n        raise ValueError(\"Rating must be between 0 and 10\")\n\n    return rating\n
    "},{"location":"reference/indices/rankings/llm_trulens/#indices.rankings.llm_trulens.re_0_10_rating","title":"re_0_10_rating","text":"
    re_0_10_rating(s)\n

    Extract a 0-10 rating from a string.

    If the string contains no integer, or only integers outside the 0-10 range, an error is raised. If multiple numbers are found within the expected 0-10 range, the smallest is returned.

    Parameters:

    Name Type Description Default s str

    String to extract rating from.

    required

    Returns:

    Name Type Description int int

    Extracted rating.

    Raises:

    Type Description ParseError

    If no integers between 0 and 10 are found in the string.

    Source code in libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py
    def re_0_10_rating(s: str) -> int:\n    \"\"\"Extract a 0-10 rating from a string.\n\n    If the string does not match an integer or matches an integer outside the\n    0-10 range, raises an error instead. If multiple numbers are found within\n    the expected 0-10 range, the smallest is returned.\n\n    Args:\n        s: String to extract rating from.\n\n    Returns:\n        int: Extracted rating.\n\n    Raises:\n        ParseError: If no integers between 0 and 10 are found in the string.\n    \"\"\"\n\n    matches = PATTERN_INTEGER.findall(s)\n    if not matches:\n        raise AssertionError\n\n    vals = set()\n    for match in matches:\n        try:\n            vals.add(validate_rating(int(match)))\n        except ValueError:\n            pass\n\n    if not vals:\n        raise AssertionError\n\n    # Min to handle cases like \"The rating is 8 out of 10.\"\n    return min(vals)\n
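    The documented behaviour is easy to exercise directly; a short sketch, assuming the module path `kotaemon.indices.rankings.llm_trulens`:

    from kotaemon.indices.rankings.llm_trulens import re_0_10_rating

    print(re_0_10_rating("I would rate this 7."))        # 7
    print(re_0_10_rating("The rating is 8 out of 10."))  # 8: smallest in-range match
    # Caveat: although the docstring promises ParseError, the listing above raises
    # a bare AssertionError when no 0-10 integer is found, e.g. re_0_10_rating("n/a").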
    "},{"location":"reference/indices/splitters/","title":"Splitters","text":""},{"location":"reference/indices/splitters/#indices.splitters.BaseSplitter","title":"BaseSplitter","text":"

    Bases: DocTransformer

    Represent base splitter class

    Source code in libs/kotaemon/kotaemon/indices/splitters/__init__.py
    class BaseSplitter(DocTransformer):\n    \"\"\"Represent base splitter class\"\"\"\n\n    ...\n
    "},{"location":"reference/llms/","title":"LLMs","text":""},{"location":"reference/llms/#llms.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":"

    Bases: SimpleBranchingPipeline

    A simple gated branching pipeline for executing multiple branches based on a condition.

    This class extends the SimpleBranchingPipeline class and adds the ability to execute the branches until a branch returns a non-empty output based on a condition.

    Attributes:

    Name Type Description branches List[BaseComponent]

    The list of branches to be executed.

    Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class GatedBranchingPipeline(SimpleBranchingPipeline):\n    \"\"\"\n    A simple gated branching pipeline for executing multiple branches based on a\n        condition.\n\n    This class extends the SimpleBranchingPipeline class and adds the ability to execute\n        the branches until a branch returns a non-empty output based on a condition.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = GatedBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        ```\n    \"\"\"\n\n    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the output of the first\n            branch that returns a non-empty output based on the provided condition.\n\n        Args:\n            condition_text (str): The condition text to evaluate for each branch.\n                Default to None.\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            Union[OutputType, None]: The output of the first branch that satisfies the\n            condition, or None if no branch satisfies the condition.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided.\")\n\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output = branch(condition_text=condition_text, **prompt_kwargs)\n            if output:\n                return output\n\n        return Document(None)\n
    "},{"location":"reference/llms/#llms.GatedBranchingPipeline.run","title":"run","text":"
    run(*, condition_text=None, **prompt_kwargs)\n

    Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.

    Parameters:

    Name Type Description Default condition_text str

    The condition text to evaluate for each branch. Defaults to None.

    None **prompt_kwargs

    Keyword arguments for the branches.

    {}

    Returns:

    Type Description

    Union[OutputType, None]: The output of the first branch that satisfies the

    condition, or None if no branch satisfies the condition.

    Raises:

    Type Description ValueError

    If condition_text is None

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the output of the first\n        branch that returns a non-empty output based on the provided condition.\n\n    Args:\n        condition_text (str): The condition text to evaluate for each branch.\n            Default to None.\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        Union[OutputType, None]: The output of the first branch that satisfies the\n        condition, or None if no branch satisfies the condition.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided.\")\n\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output = branch(condition_text=condition_text, **prompt_kwargs)\n        if output:\n            return output\n\n    return Document(None)\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":"

    Bases: BaseComponent

    A simple branching pipeline for executing multiple branches.

    Attributes:

    Name Type Description branches List[BaseComponent]

    The list of branches to be executed.

    Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class SimpleBranchingPipeline(BaseComponent):\n    \"\"\"\n    A simple branching pipeline for executing multiple branches.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = SimpleBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        print(pipeline(condition_text=\"12\"))\n        ```\n    \"\"\"\n\n    branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n    def add_branch(self, component: BaseComponent):\n        \"\"\"\n        Add a new branch to the pipeline.\n\n        Args:\n            component (BaseComponent): The branch component to be added.\n        \"\"\"\n        self.branches.append(component)\n\n    def run(self, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the outputs as a list.\n\n        Args:\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            List: The outputs of each branch as a list.\n        \"\"\"\n        output = []\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output.append(branch(**prompt_kwargs))\n\n        return output\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"
    add_branch(component)\n

    Add a new branch to the pipeline.

    Parameters:

    Name Type Description Default component BaseComponent

    The branch component to be added.

    required Source code in libs/kotaemon/kotaemon/llms/branching.py
    def add_branch(self, component: BaseComponent):\n    \"\"\"\n    Add a new branch to the pipeline.\n\n    Args:\n        component (BaseComponent): The branch component to be added.\n    \"\"\"\n    self.branches.append(component)\n
    "},{"location":"reference/llms/#llms.SimpleBranchingPipeline.run","title":"run","text":"
    run(**prompt_kwargs)\n

    Execute the pipeline by running each branch and return the outputs as a list.

    Parameters:

    Name Type Description Default **prompt_kwargs

    Keyword arguments for the branches.

    {}

    Returns:

    Name Type Description List

    The outputs of each branch as a list.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the outputs as a list.\n\n    Args:\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        List: The outputs of each branch as a list.\n    \"\"\"\n    output = []\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output.append(branch(**prompt_kwargs))\n\n    return output\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/llms/#llms.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
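    A hedged configuration sketch (all values below are placeholders; `api_key` comes from the shared `BaseChatOpenAI` parameters):

    from kotaemon.llms import AzureChatOpenAI

    llm = AzureChatOpenAI(
        azure_endpoint="https://your-resource.openai.azure.com/",
        azure_deployment="your-deployment-name",
        api_version="2024-02-01",        # placeholder API version
        api_key="your-azure-api-key",
    )
    # Components are callable; the result exposes `.text` with the reply.
    print(llm("Hello!").text)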
    "},{"location":"reference/llms/#llms.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/#llms.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    Name Type Description Default async_version bool

    Whether to get the async version of the client

    False Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/#llms.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
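    The non-Azure variant needs only a model name and key; `base_url` also lets it target any OpenAI-compatible server (a hedged sketch with placeholder values):

    from kotaemon.llms import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4o-mini", api_key="your-openai-api-key")
    # Or point at a self-hosted OpenAI-compatible service (placeholder URL):
    # llm = ChatOpenAI(model="local-model", base_url="http://localhost:8000/v1", api_key="dummy")
    print(llm("Say hi in one word.").text)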
    "},{"location":"reference/llms/#llms.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.

    Attributes:

    Name Type Description endpoint_url str

    The URL of an OpenAI API compatible endpoint.

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
    "},{"location":"reference/llms/#llms.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

    Generate a response from messages. Args: messages (str | BaseMessage | list[BaseMessage]): history of messages to generate the response from. **kwargs: additional arguments to pass to the OpenAI API. Returns: LLMInterface: the generated response.

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/#llms.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/#llms.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

    Wrapper around llama-cpp-python's Llama model

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
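    A minimal usage sketch, assuming a local GGUF checkpoint (the file path and chat format are placeholders; chat_format must match the model family). Alternatively, repo_id and filename can be supplied instead of model_path to fetch the checkpoint from the HuggingFace Hub:

    from kotaemon.llms import LlamaCppChat\n\n# Hypothetical checkpoint path; download a GGUF model file first\nllm = LlamaCppChat(\n    model_path=\"models/llama-2-7b-chat.Q4_K_M.gguf\",\n    chat_format=\"llama-2\",\n    n_ctx=2048,  # context window size\n    n_gpu_layers=0,  # set to -1 to offload all layers to GPU\n)\nprint(llm(\"What is the capital of France?\").content)\n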
    "},{"location":"reference/llms/#llms.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/#llms.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
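    A minimal construction sketch (the endpoint, deployment name, API version, and key are placeholders):

    from kotaemon.llms import AzureOpenAI\n\nllm = AzureOpenAI(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",\n    deployment_name=\"<your-deployment>\",\n    openai_api_version=\"2023-05-15\",  # example version string\n    openai_api_key=\"<your-api-key>\",\n    temperature=0,\n)\nprint(llm(\"Tell me a joke\").text)\n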
    "},{"location":"reference/llms/#llms.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
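    A minimal construction sketch (the model path is a placeholder; Langchain's LlamaCpp in turn requires llama-cpp-python to be installed):

    from kotaemon.llms import LlamaCpp\n\nllm = LlamaCpp(\n    model_path=\"models/llama-2-7b.Q4_K_M.gguf\",  # hypothetical GGUF file\n    n_ctx=2048,\n    n_gpu_layers=-1,  # offload all layers to GPU if available\n)\n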
    "},{"location":"reference/llms/#llms.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
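    A minimal construction sketch (the API key is a placeholder; note that this wrapper targets the completions API rather than the chat API):

    from kotaemon.llms import OpenAI\n\nllm = OpenAI(\n    openai_api_key=\"sk-...\",  # placeholder\n    model_name=\"gpt-3.5-turbo-instruct\",  # a completions-capable model\n    temperature=0.7,\n    max_tokens=256,\n)\nprint(llm(\"Write a haiku about the sea.\").text)\n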
    "},{"location":"reference/llms/#llms.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":"

    Bases: BaseComponent

    Perform sequential chain-of-thought with manual pre-defined prompts

    This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought. Please refer to that section for details on Thought. This section is about chaining thoughts together.

    Usage:

    Create and run a chain of thought without the \"+\" operator:

    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>>    prompt=\"Word {word} in {language} is \",\n>>>    post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>>     prompt=\"Translate {translated} to Japanese\",\n>>>     post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Create and run a chain of thought with the \"+\" operator: Please refer to the kotaemon.pipelines.cot.Thought section for examples.

    This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
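    For illustration, a termination callback could look like the following sketch (reusing thought1, thought2, and llm from the example above; the stopping rule itself is hypothetical):

    # Stop the chain as soon as some thought has produced an \"output\" key\ndef stop_on_output(outputs: dict) -> bool:\n    return \"output\" in outputs\n\nchain = ManualSequentialChainOfThought(\n    thoughts=[thought1, thought2], llm=llm, terminate=stop_on_output\n)\nresult = chain(word=\"hello\", language=\"French\")\n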

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class ManualSequentialChainOfThought(BaseComponent):\n    \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n    This method supports variable number of steps. Each step corresponds to a\n    `kotaemon.pipelines.cot.Thought`. Please refer that section for\n    Thought's detail. This section is about chaining thought together.\n\n    _**Usage:**_\n\n    **Create and run a chain of thought without \"+\" operator:**\n\n    ```pycon\n    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n    >>> llm = LCAzureChatOpenAI(...)\n    >>> thought1 = Thought(\n    >>>    prompt=\"Word {word} in {language} is \",\n    >>>    post_process=lambda string: {\"translated\": string},\n    >>> )\n    >>> thought2 = Thought(\n    >>>     prompt=\"Translate {translated} to Japanese\",\n    >>>     post_process=lambda string: {\"output\": string},\n    >>> )\n    >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n    >>> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    **Create and run a chain of thought without \"+\" operator:** Please refer the\n    `kotaemon.pipelines.cot.Thought` section for examples.\n\n    This chain-of-thought optionally takes a termination check callback function.\n    This function will be called after each thought is executed. It takes in a\n    dictionary of all thought outputs so far, and it returns True or False. If\n    True, the chain-of-thought will terminate. If unset, the default callback always\n    returns False.\n    \"\"\"\n\n    thoughts: List[Thought] = Param(\n        default_callback=lambda *_: [], help=\"List of Thought\"\n    )\n    llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n    terminate: Callable = Param(\n        default=lambda _: False,\n        help=\"Callback on terminate condition. Default to always return False\",\n    )\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the manual chain of thought\"\"\"\n\n        inputs = deepcopy(kwargs)\n        for idx, thought in enumerate(self.thoughts):\n            if self.llm:\n                thought.llm = self.llm\n            self._prepare_child(thought, f\"thought{idx}\")\n\n            output = thought(**inputs)\n            inputs.update(output.content)\n            if self.terminate(inputs):\n                break\n\n        return Document(inputs)\n\n    def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=self.thoughts + [next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/#llms.ManualSequentialChainOfThought.run","title":"run","text":"
    run(**kwargs)\n

    Run the manual chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the manual chain of thought\"\"\"\n\n    inputs = deepcopy(kwargs)\n    for idx, thought in enumerate(self.thoughts):\n        if self.llm:\n            thought.llm = self.llm\n        self._prepare_child(thought, f\"thought{idx}\")\n\n        output = thought(**inputs)\n        inputs.update(output.content)\n        if self.terminate(inputs):\n            break\n\n    return Document(inputs)\n
    "},{"location":"reference/llms/#llms.Thought","title":"Thought","text":"

    Bases: BaseComponent

    A thought in the chain of thought

    Usage:

    Create and run a thought:

    >> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n     prompt=\"How to {action} {object}?\",\n     llm=LCAzureChatOpenAI(...),\n     post_process=lambda string: {\"tutorial\": string},\n   )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n

    Basically, when a thought is run, it will:

    1. Populate the prompt template with the input **kwargs.
    2. Run the LLM model with the populated prompt.
    3. Post-process the LLM output with the post-processor.

    This Thought allows chaining sequentially with the + operator. For example:

    >> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n       prompt=\"Word {word} in {language} is \",\n       llm=llm,\n       post_process=lambda string: {\"translated\": string},\n   )\n>> thought2 = Thought(\n        prompt=\"Translate {translated} to Japanese\",\n        llm=llm,\n        post_process=lambda string: {\"output\": string},\n   )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Under the hood, when the + operator is used, a ManualSequentialChainOfThought is created.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class Thought(BaseComponent):\n    \"\"\"A thought in the chain of thought\n\n    - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n    value is the value.\n    - Output: an output dictionary\n\n    _**Usage:**_\n\n    Create and run a thought:\n\n    ```python\n    >> from kotaemon.pipelines.cot import Thought\n    >> thought = Thought(\n         prompt=\"How to {action} {object}?\",\n         llm=LCAzureChatOpenAI(...),\n         post_process=lambda string: {\"tutorial\": string},\n       )\n    >> output = thought(action=\"install\", object=\"python\")\n    >> print(output)\n    {'tutorial': 'As an AI language model,...'}\n    ```\n\n    Basically, when a thought is run, it will:\n\n    1. Populate the prompt template with the input `**kwargs`.\n    2. Run the LLM model with the populated prompt.\n    3. Post-process the LLM output with the post-processor.\n\n    This `Thought` allows chaining sequentially with the + operator. For example:\n\n    ```python\n    >> llm = LCAzureChatOpenAI(...)\n    >> thought1 = Thought(\n           prompt=\"Word {word} in {language} is \",\n           llm=llm,\n           post_process=lambda string: {\"translated\": string},\n       )\n    >> thought2 = Thought(\n            prompt=\"Translate {translated} to Japanese\",\n            llm=llm,\n            post_process=lambda string: {\"output\": string},\n       )\n\n    >> thought = thought1 + thought2\n    >> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n    is created.\n    \"\"\"\n\n    prompt: str = Param(\n        help=(\n            \"The prompt template string. This prompt template has Python-like variable\"\n            \" placeholders, that then will be substituted with real values when this\"\n            \" component is executed\"\n        )\n    )\n    llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n    post_process: Function = Node(\n        help=(\n            \"The function post-processor that post-processes LLM output prediction .\"\n            \"It should take a string as input (this is the LLM output text) and return \"\n            \"a dictionary, where the key should\"\n        )\n    )\n\n    @Node.auto(depends_on=\"prompt\")\n    def prompt_template(self):\n        \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n        return BasePromptComponent(template=self.prompt)\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the chain of thought\"\"\"\n        prompt = self.prompt_template(**kwargs).text\n        response = self.llm(prompt).text\n        response = self.post_process(response)\n\n        return Document(response)\n\n    def get_variables(self) -> List[str]:\n        return []\n\n    def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=[self, next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/#llms.Thought.prompt_template","title":"prompt_template","text":"
    prompt_template()\n

    Automatically wraps the prompt param in a BasePromptComponent. Can usually be ignored.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    @Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n    \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n    return BasePromptComponent(template=self.prompt)\n
    "},{"location":"reference/llms/#llms.Thought.run","title":"run","text":"
    run(**kwargs)\n

    Run the chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the chain of thought\"\"\"\n    prompt = self.prompt_template(**kwargs).text\n    response = self.llm(prompt).text\n    response = self.post_process(response)\n\n    return Document(response)\n
    "},{"location":"reference/llms/#llms.GatedLinearPipeline","title":"GatedLinearPipeline","text":"

    Bases: SimpleLinearPipeline

    A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.

    Attributes:

    - condition (Callable[[IO_Type], Any]): A callable function that represents the condition.

    Example usage:
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class GatedLinearPipeline(SimpleLinearPipeline):\n    \"\"\"\n    A pipeline that extends the SimpleLinearPipeline class and adds a condition\n        attribute.\n\n    Attributes:\n        condition (Callable[[IO_Type], Any]): A callable function that represents the\n            condition.\n\n    Usage:\n        ```{.py3 title=\"Example Usage\"}\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = GatedLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            condition=RegexExtractor(pattern=\"some pattern\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n        print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n        ```\n    \"\"\"\n\n    condition: Callable[[IO_Type], Any]\n\n    def run(\n        self,\n        *,\n        condition_text: Optional[str] = None,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ) -> Document:\n        \"\"\"\n        Run the pipeline with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            condition_text (str): The condition text to evaluate. Default to None.\n            llm_kwargs (dict): Additional keyword arguments for the language model call.\n            post_processor_kwargs (dict): Additional keyword arguments for the\n                post-processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the pipeline as a Document object.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided\")\n\n        if self.condition(condition_text)[0]:\n            return super().run(\n                llm_kwargs=llm_kwargs,\n                post_processor_kwargs=post_processor_kwargs,\n                **prompt_kwargs,\n            )\n\n        return Document(None)\n
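    As the source above shows, the gate evaluates `self.condition(condition_text)[0]`, so any callable whose result is indexable and whose first element is truthy will open the gate (RegexExtractor satisfies this). A hand-rolled condition could therefore be as simple as this sketch (contains_pattern is a hypothetical name; llm and identity are from the usage example above):

    # Hypothetical condition: open the gate only when a keyword is present\ndef contains_pattern(text: str) -> list[bool]:\n    return [\"some pattern\" in text]\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=contains_pattern,\n    llm=llm,\n    post_processor=identity,\n)\n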
    "},{"location":"reference/llms/#llms.GatedLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    condition_text=None,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the pipeline with the given arguments and return the final output as a Document object.

    Parameters:

    - condition_text (str, default None): The condition text to evaluate.
    - llm_kwargs (dict, default {}): Additional keyword arguments for the language model call.
    - post_processor_kwargs (dict, default {}): Additional keyword arguments for the post-processor.
    - **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:

    - Document: The final output of the pipeline as a Document object.

    Raises:

    - ValueError: If condition_text is None.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    condition_text: Optional[str] = None,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n) -> Document:\n    \"\"\"\n    Run the pipeline with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        condition_text (str): The condition text to evaluate. Default to None.\n        llm_kwargs (dict): Additional keyword arguments for the language model call.\n        post_processor_kwargs (dict): Additional keyword arguments for the\n            post-processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the pipeline as a Document object.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided\")\n\n    if self.condition(condition_text)[0]:\n        return super().run(\n            llm_kwargs=llm_kwargs,\n            post_processor_kwargs=post_processor_kwargs,\n            **prompt_kwargs,\n        )\n\n    return Document(None)\n
    "},{"location":"reference/llms/#llms.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":"

    Bases: BaseComponent

    A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.

    Attributes:

    - prompt (BasePromptComponent): The prompt component used to generate the initial input.
    - llm (Union[ChatLLM, LLM]): The language model component used to generate the output.
    - post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional post-processor component or function.

    Example usage:
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class SimpleLinearPipeline(BaseComponent):\n    \"\"\"\n    A simple pipeline for running a function with a prompt, a language model, and an\n        optional post-processor.\n\n    Attributes:\n        prompt (BasePromptComponent): The prompt component used to generate the initial\n            input.\n        llm (Union[ChatLLM, LLM]): The language model component used to generate the\n            output.\n        post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n            post-processor component or function.\n\n    Example Usage:\n        ```python\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = SimpleLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(word=\"lone\"))\n        ```\n    \"\"\"\n\n    prompt: BasePromptComponent\n    llm: Union[ChatLLM, LLM]\n    post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n    def run(\n        self,\n        *,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ):\n        \"\"\"\n        Run the function with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            llm_kwargs (dict): Keyword arguments for the llm call.\n            post_processor_kwargs (dict): Keyword arguments for the post_processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the function as a Document object.\n        \"\"\"\n        prompt = self.prompt(**prompt_kwargs)\n        llm_output = self.llm(prompt.text, **llm_kwargs)\n        if self.post_processor is not None:\n            final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n        else:\n            final_output = llm_output\n\n        return Document(final_output)\n
    "},{"location":"reference/llms/#llms.SimpleLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the function with the given arguments and return the final output as a Document object.

    Parameters:

    - llm_kwargs (dict, default {}): Keyword arguments for the LLM call.
    - post_processor_kwargs (dict, default {}): Keyword arguments for the post-processor.
    - **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:

    - Document: The final output of the function as a Document object.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n):\n    \"\"\"\n    Run the function with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        llm_kwargs (dict): Keyword arguments for the llm call.\n        post_processor_kwargs (dict): Keyword arguments for the post_processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the function as a Document object.\n    \"\"\"\n    prompt = self.prompt(**prompt_kwargs)\n    llm_output = self.llm(prompt.text, **llm_kwargs)\n    if self.post_processor is not None:\n        final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n    else:\n        final_output = llm_output\n\n    return Document(final_output)\n
    "},{"location":"reference/llms/#llms.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

    Parameters:

    - template (PromptTemplate, required): The prompt template.
    - **kwargs: Any additional keyword arguments that will be used to populate the given template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, (int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
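    A minimal usage sketch:

    from kotaemon.llms import BasePromptComponent\n\nprompt = BasePromptComponent(template=\"what is {word} in {language}?\")\ndoc = prompt(word=\"hello\", language=\"Japanese\")\nprint(doc.text)  # what is hello in Japanese?\n

    Values can also be provided up front via set_value and filled in later at call time.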
    "},{"location":"reference/llms/#llms.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

    Parameters:

    - kwargs (dict): A dictionary with the attribute names as keys and the new values as values.

    Returns:

    - None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
    "},{"location":"reference/llms/#llms.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

    Run the function with the given keyword arguments.

    Parameters:

    - **kwargs: The keyword arguments to pass to the function.

    Returns:

    - The result of calling the populate method of the template object with the given keyword arguments.

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
    "},{"location":"reference/llms/#llms.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
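    A minimal usage sketch:

    from kotaemon.llms import PromptTemplate\n\ntemplate = PromptTemplate(\"Translate {word} into {language}.\")\nprint(template.placeholders)  # {'word', 'language'} (a set; order may vary)\nprint(template.populate(word=\"cat\", language=\"French\"))\n# Translate cat into French.\n\n# partial_populate fills what it can and keeps other placeholders in the text\npartial = template.partial_populate(word=\"cat\")\n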
    "},{"location":"reference/llms/#llms.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

    This function checks if all the expected placeholders in the template are set as attributes of the object. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

    Returns:

    - None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
    "},{"location":"reference/llms/#llms.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

    Warn about redundant keyword arguments.

    This function compares the provided keyword arguments against the template's placeholders and emits a UserWarning listing any keys that do not correspond to a placeholder.

    Returns:

    - None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
    "},{"location":"reference/llms/#llms.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

    Parameters:

    - **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.

    Returns:

    - str: The populated template.

    Raises:

    - ValueError: If any template placeholder is left without a value.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
    "},{"location":"reference/llms/#llms.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

    Parameters:

    - **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template.

    Returns:

    - str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
    "},{"location":"reference/llms/base/","title":"Base","text":""},{"location":"reference/llms/branching/","title":"Branching","text":""},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline","title":"SimpleBranchingPipeline","text":"

    Bases: BaseComponent

    A simple branching pipeline for executing multiple branches.

    Attributes:

    - branches (List[BaseComponent]): The list of branches to be executed.

    Example:
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = SimpleBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\nprint(pipeline(condition_text=\"12\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class SimpleBranchingPipeline(BaseComponent):\n    \"\"\"\n    A simple branching pipeline for executing multiple branches.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = SimpleBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        print(pipeline(condition_text=\"12\"))\n        ```\n    \"\"\"\n\n    branches: List[BaseComponent] = Param(default_callback=lambda *_: [])\n\n    def add_branch(self, component: BaseComponent):\n        \"\"\"\n        Add a new branch to the pipeline.\n\n        Args:\n            component (BaseComponent): The branch component to be added.\n        \"\"\"\n        self.branches.append(component)\n\n    def run(self, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the outputs as a list.\n\n        Args:\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            List: The outputs of each branch as a list.\n        \"\"\"\n        output = []\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output.append(branch(**prompt_kwargs))\n\n        return output\n
    "},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.add_branch","title":"add_branch","text":"
    add_branch(component)\n

    Add a new branch to the pipeline.

    Parameters:

    - component (BaseComponent, required): The branch component to be added.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def add_branch(self, component: BaseComponent):\n    \"\"\"\n    Add a new branch to the pipeline.\n\n    Args:\n        component (BaseComponent): The branch component to be added.\n    \"\"\"\n    self.branches.append(component)\n
    "},{"location":"reference/llms/branching/#llms.branching.SimpleBranchingPipeline.run","title":"run","text":"
    run(**prompt_kwargs)\n

    Execute the pipeline by running each branch and return the outputs as a list.

    Parameters:

    - **prompt_kwargs: Keyword arguments for the branches.

    Returns:

    - List: The outputs of each branch as a list.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the outputs as a list.\n\n    Args:\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        List: The outputs of each branch as a list.\n    \"\"\"\n    output = []\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output.append(branch(**prompt_kwargs))\n\n    return output\n
    "},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline","title":"GatedBranchingPipeline","text":"

    Bases: SimpleBranchingPipeline

    A simple gated branching pipeline for executing multiple branches based on a condition.

    This class extends SimpleBranchingPipeline and adds the ability to run the branches in order until one returns a non-empty output for the given condition.

    Attributes:

    branches (List[BaseComponent]): The list of branches to be executed.

    Example
    from kotaemon.llms import (\n    LCAzureChatOpenAI,\n    BasePromptComponent,\n    GatedLinearPipeline,\n)\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\npipeline = GatedBranchingPipeline()\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\nfor i in range(3):\n    pipeline.add_branch(\n        GatedLinearPipeline(\n            prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n            condition=RegexExtractor(pattern=f\"{i}\"),\n            llm=llm,\n            post_processor=identity,\n        )\n    )\nprint(pipeline(condition_text=\"1\"))\nprint(pipeline(condition_text=\"2\"))\n
    Source code in libs/kotaemon/kotaemon/llms/branching.py
    class GatedBranchingPipeline(SimpleBranchingPipeline):\n    \"\"\"\n    A simple gated branching pipeline for executing multiple branches based on a\n        condition.\n\n    This class extends the SimpleBranchingPipeline class and adds the ability to execute\n        the branches until a branch returns a non-empty output based on a condition.\n\n    Attributes:\n        branches (List[BaseComponent]): The list of branches to be executed.\n\n    Example:\n        ```python\n        from kotaemon.llms import (\n            LCAzureChatOpenAI,\n            BasePromptComponent,\n            GatedLinearPipeline,\n        )\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        pipeline = GatedBranchingPipeline()\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        for i in range(3):\n            pipeline.add_branch(\n                GatedLinearPipeline(\n                    prompt=BasePromptComponent(template=f\"what is {i} in Japanese ?\"),\n                    condition=RegexExtractor(pattern=f\"{i}\"),\n                    llm=llm,\n                    post_processor=identity,\n                )\n            )\n        print(pipeline(condition_text=\"1\"))\n        print(pipeline(condition_text=\"2\"))\n        ```\n    \"\"\"\n\n    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n        \"\"\"\n        Execute the pipeline by running each branch and return the output of the first\n            branch that returns a non-empty output based on the provided condition.\n\n        Args:\n            condition_text (str): The condition text to evaluate for each branch.\n                Default to None.\n            **prompt_kwargs: Keyword arguments for the branches.\n\n        Returns:\n            Union[OutputType, None]: The output of the first branch that satisfies the\n            condition, or None if no branch satisfies the condition.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided.\")\n\n        for i, branch in enumerate(self.branches):\n            self._prepare_child(branch, name=f\"branch-{i}\")\n            output = branch(condition_text=condition_text, **prompt_kwargs)\n            if output:\n                return output\n\n        return Document(None)\n
    "},{"location":"reference/llms/branching/#llms.branching.GatedBranchingPipeline.run","title":"run","text":"
    run(*, condition_text=None, **prompt_kwargs)\n

    Execute the pipeline by running each branch and return the output of the first branch that returns a non-empty output based on the provided condition.

    Parameters:

    condition_text (str, default None): The condition text to evaluate for each branch.
    **prompt_kwargs: Keyword arguments for the branches.

    Returns:

    Union[OutputType, None]: The output of the first branch that satisfies the condition, or None if no branch satisfies the condition.

    Raises:

    ValueError: If condition_text is None.

    Source code in libs/kotaemon/kotaemon/llms/branching.py
    def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):\n    \"\"\"\n    Execute the pipeline by running each branch and return the output of the first\n        branch that returns a non-empty output based on the provided condition.\n\n    Args:\n        condition_text (str): The condition text to evaluate for each branch.\n            Default to None.\n        **prompt_kwargs: Keyword arguments for the branches.\n\n    Returns:\n        Union[OutputType, None]: The output of the first branch that satisfies the\n        condition, or None if no branch satisfies the condition.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided.\")\n\n    for i, branch in enumerate(self.branches):\n        self._prepare_child(branch, name=f\"branch-{i}\")\n        output = branch(condition_text=condition_text, **prompt_kwargs)\n        if output:\n            return output\n\n    return Document(None)\n
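
    A short sketch of the first-match behaviour, assuming the three branches from the class-level example above:
    output = pipeline(condition_text=\"12\")\n# The branch gated on pattern \"1\" matches first and its output is returned;\n# the branch gated on \"2\" is never executed.\nprint(output)\n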
    "},{"location":"reference/llms/cot/","title":"Cot","text":""},{"location":"reference/llms/cot/#llms.cot.Thought","title":"Thought","text":"

    Bases: BaseComponent

    A thought in the chain of thought

    Usage:

    Create and run a thought:

    >> from kotaemon.pipelines.cot import Thought\n>> thought = Thought(\n     prompt=\"How to {action} {object}?\",\n     llm=LCAzureChatOpenAI(...),\n     post_process=lambda string: {\"tutorial\": string},\n   )\n>> output = thought(action=\"install\", object=\"python\")\n>> print(output)\n{'tutorial': 'As an AI language model,...'}\n

    Basically, when a thought is run, it will:

    1. Populate the prompt template with the input **kwargs.
    2. Run the LLM model with the populated prompt.
    3. Post-process the LLM output with the post-processor.

    This Thought allows chaining sequentially with the + operator. For example:

    >> llm = LCAzureChatOpenAI(...)\n>> thought1 = Thought(\n       prompt=\"Word {word} in {language} is \",\n       llm=llm,\n       post_process=lambda string: {\"translated\": string},\n   )\n>> thought2 = Thought(\n        prompt=\"Translate {translated} to Japanese\",\n        llm=llm,\n        post_process=lambda string: {\"output\": string},\n   )\n\n>> thought = thought1 + thought2\n>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Under the hood, when the + operator is used, a ManualSequentialChainOfThought is created.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class Thought(BaseComponent):\n    \"\"\"A thought in the chain of thought\n\n    - Input: `**kwargs` pairs, where key is the placeholder in the prompt, and\n    value is the value.\n    - Output: an output dictionary\n\n    _**Usage:**_\n\n    Create and run a thought:\n\n    ```python\n    >> from kotaemon.pipelines.cot import Thought\n    >> thought = Thought(\n         prompt=\"How to {action} {object}?\",\n         llm=LCAzureChatOpenAI(...),\n         post_process=lambda string: {\"tutorial\": string},\n       )\n    >> output = thought(action=\"install\", object=\"python\")\n    >> print(output)\n    {'tutorial': 'As an AI language model,...'}\n    ```\n\n    Basically, when a thought is run, it will:\n\n    1. Populate the prompt template with the input `**kwargs`.\n    2. Run the LLM model with the populated prompt.\n    3. Post-process the LLM output with the post-processor.\n\n    This `Thought` allows chaining sequentially with the + operator. For example:\n\n    ```python\n    >> llm = LCAzureChatOpenAI(...)\n    >> thought1 = Thought(\n           prompt=\"Word {word} in {language} is \",\n           llm=llm,\n           post_process=lambda string: {\"translated\": string},\n       )\n    >> thought2 = Thought(\n            prompt=\"Translate {translated} to Japanese\",\n            llm=llm,\n            post_process=lambda string: {\"output\": string},\n       )\n\n    >> thought = thought1 + thought2\n    >> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    Under the hood, when the `+` operator is used, a `ManualSequentialChainOfThought`\n    is created.\n    \"\"\"\n\n    prompt: str = Param(\n        help=(\n            \"The prompt template string. This prompt template has Python-like variable\"\n            \" placeholders, that then will be substituted with real values when this\"\n            \" component is executed\"\n        )\n    )\n    llm: LLM = Node(LCAzureChatOpenAI, help=\"The LLM model to execute the input prompt\")\n    post_process: Function = Node(\n        help=(\n            \"The function post-processor that post-processes LLM output prediction .\"\n            \"It should take a string as input (this is the LLM output text) and return \"\n            \"a dictionary, where the key should\"\n        )\n    )\n\n    @Node.auto(depends_on=\"prompt\")\n    def prompt_template(self):\n        \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n        return BasePromptComponent(template=self.prompt)\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the chain of thought\"\"\"\n        prompt = self.prompt_template(**kwargs).text\n        response = self.llm(prompt).text\n        response = self.post_process(response)\n\n        return Document(response)\n\n    def get_variables(self) -> List[str]:\n        return []\n\n    def __add__(self, next_thought: \"Thought\") -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=[self, next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/cot/#llms.cot.Thought.prompt_template","title":"prompt_template","text":"
    prompt_template()\n

    Automatically wraps the prompt param in a prompt component. Can be ignored.

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    @Node.auto(depends_on=\"prompt\")\ndef prompt_template(self):\n    \"\"\"Automatically wrap around param prompt. Can ignore\"\"\"\n    return BasePromptComponent(template=self.prompt)\n
    "},{"location":"reference/llms/cot/#llms.cot.Thought.run","title":"run","text":"
    run(**kwargs)\n

    Run the chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the chain of thought\"\"\"\n    prompt = self.prompt_template(**kwargs).text\n    response = self.llm(prompt).text\n    response = self.post_process(response)\n\n    return Document(response)\n
    "},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought","title":"ManualSequentialChainOfThought","text":"

    Bases: BaseComponent

    Perform sequential chain-of-thought with manual pre-defined prompts

    This method supports a variable number of steps. Each step corresponds to a kotaemon.pipelines.cot.Thought. Please refer to that section for details on Thought. This section is about chaining thoughts together.

    Usage:

    Create and run a chain of thought without \"+\" operator:

    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n>>> llm = LCAzureChatOpenAI(...)\n>>> thought1 = Thought(\n>>>    prompt=\"Word {word} in {language} is \",\n>>>    post_process=lambda string: {\"translated\": string},\n>>> )\n>>> thought2 = Thought(\n>>>     prompt=\"Translate {translated} to Japanese\",\n>>>     post_process=lambda string: {\"output\": string},\n>>> )\n>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n>>> thought(word=\"hello\", language=\"French\")\n{'word': 'hello',\n 'language': 'French',\n 'translated': '\"Bonjour\"',\n 'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n

    Create and run a chain of thought with the \"+\" operator: Please refer to the kotaemon.pipelines.cot.Thought section for examples.

    This chain-of-thought optionally takes a termination check callback function. This function will be called after each thought is executed. It takes in a dictionary of all thought outputs so far, and it returns True or False. If True, the chain-of-thought will terminate. If unset, the default callback always returns False.
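
    A minimal sketch of the termination callback, assuming the llm, thought1, and thought2 from the example above; the chain stops as soon as the callback returns True:
    chain = ManualSequentialChainOfThought(\n    thoughts=[thought1, thought2],\n    llm=llm,\n    # stop once thought1 has contributed the \"translated\" key\n    terminate=lambda inputs: \"translated\" in inputs,\n)\noutput = chain(word=\"hello\", language=\"French\")  # thought2 is skipped\n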

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    class ManualSequentialChainOfThought(BaseComponent):\n    \"\"\"Perform sequential chain-of-thought with manual pre-defined prompts\n\n    This method supports variable number of steps. Each step corresponds to a\n    `kotaemon.pipelines.cot.Thought`. Please refer that section for\n    Thought's detail. This section is about chaining thought together.\n\n    _**Usage:**_\n\n    **Create and run a chain of thought without \"+\" operator:**\n\n    ```pycon\n    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought\n    >>> llm = LCAzureChatOpenAI(...)\n    >>> thought1 = Thought(\n    >>>    prompt=\"Word {word} in {language} is \",\n    >>>    post_process=lambda string: {\"translated\": string},\n    >>> )\n    >>> thought2 = Thought(\n    >>>     prompt=\"Translate {translated} to Japanese\",\n    >>>     post_process=lambda string: {\"output\": string},\n    >>> )\n    >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)\n    >>> thought(word=\"hello\", language=\"French\")\n    {'word': 'hello',\n     'language': 'French',\n     'translated': '\"Bonjour\"',\n     'output': '\u3053\u3093\u306b\u3061\u306f (Konnichiwa)'}\n    ```\n\n    **Create and run a chain of thought without \"+\" operator:** Please refer the\n    `kotaemon.pipelines.cot.Thought` section for examples.\n\n    This chain-of-thought optionally takes a termination check callback function.\n    This function will be called after each thought is executed. It takes in a\n    dictionary of all thought outputs so far, and it returns True or False. If\n    True, the chain-of-thought will terminate. If unset, the default callback always\n    returns False.\n    \"\"\"\n\n    thoughts: List[Thought] = Param(\n        default_callback=lambda *_: [], help=\"List of Thought\"\n    )\n    llm: LLM = Param(help=\"The LLM model to use (base of kotaemon.llms.BaseLLM)\")\n    terminate: Callable = Param(\n        default=lambda _: False,\n        help=\"Callback on terminate condition. Default to always return False\",\n    )\n\n    def run(self, **kwargs) -> Document:\n        \"\"\"Run the manual chain of thought\"\"\"\n\n        inputs = deepcopy(kwargs)\n        for idx, thought in enumerate(self.thoughts):\n            if self.llm:\n                thought.llm = self.llm\n            self._prepare_child(thought, f\"thought{idx}\")\n\n            output = thought(**inputs)\n            inputs.update(output.content)\n            if self.terminate(inputs):\n                break\n\n        return Document(inputs)\n\n    def __add__(self, next_thought: Thought) -> \"ManualSequentialChainOfThought\":\n        return ManualSequentialChainOfThought(\n            thoughts=self.thoughts + [next_thought], llm=self.llm\n        )\n
    "},{"location":"reference/llms/cot/#llms.cot.ManualSequentialChainOfThought.run","title":"run","text":"
    run(**kwargs)\n

    Run the manual chain of thought

    Source code in libs/kotaemon/kotaemon/llms/cot.py
    def run(self, **kwargs) -> Document:\n    \"\"\"Run the manual chain of thought\"\"\"\n\n    inputs = deepcopy(kwargs)\n    for idx, thought in enumerate(self.thoughts):\n        if self.llm:\n            thought.llm = self.llm\n        self._prepare_child(thought, f\"thought{idx}\")\n\n        output = thought(**inputs)\n        inputs.update(output.content)\n        if self.terminate(inputs):\n            break\n\n    return Document(inputs)\n
    "},{"location":"reference/llms/linear/","title":"Linear","text":""},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline","title":"SimpleLinearPipeline","text":"

    Bases: BaseComponent

    A simple pipeline for running a function with a prompt, a language model, and an optional post-processor.

    Attributes:

    prompt (BasePromptComponent): The prompt component used to generate the initial input.
    llm (Union[ChatLLM, LLM]): The language model component used to generate the output.
    post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional post-processor component or function.

    Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = SimpleLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class SimpleLinearPipeline(BaseComponent):\n    \"\"\"\n    A simple pipeline for running a function with a prompt, a language model, and an\n        optional post-processor.\n\n    Attributes:\n        prompt (BasePromptComponent): The prompt component used to generate the initial\n            input.\n        llm (Union[ChatLLM, LLM]): The language model component used to generate the\n            output.\n        post_processor (Union[BaseComponent, Callable[[IO_Type], IO_Type]]): An optional\n            post-processor component or function.\n\n    Example Usage:\n        ```python\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = SimpleLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(word=\"lone\"))\n        ```\n    \"\"\"\n\n    prompt: BasePromptComponent\n    llm: Union[ChatLLM, LLM]\n    post_processor: Union[BaseComponent, Callable[[IO_Type], IO_Type]]\n\n    def run(\n        self,\n        *,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ):\n        \"\"\"\n        Run the function with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            llm_kwargs (dict): Keyword arguments for the llm call.\n            post_processor_kwargs (dict): Keyword arguments for the post_processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the function as a Document object.\n        \"\"\"\n        prompt = self.prompt(**prompt_kwargs)\n        llm_output = self.llm(prompt.text, **llm_kwargs)\n        if self.post_processor is not None:\n            final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n        else:\n            final_output = llm_output\n\n        return Document(final_output)\n
    "},{"location":"reference/llms/linear/#llms.linear.SimpleLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the function with the given arguments and return the final output as a Document object.

    Parameters:

    llm_kwargs (dict, default {}): Keyword arguments for the llm call.
    post_processor_kwargs (dict, default {}): Keyword arguments for the post_processor.
    **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:

    Document: The final output of the function as a Document object.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n):\n    \"\"\"\n    Run the function with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        llm_kwargs (dict): Keyword arguments for the llm call.\n        post_processor_kwargs (dict): Keyword arguments for the post_processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the function as a Document object.\n    \"\"\"\n    prompt = self.prompt(**prompt_kwargs)\n    llm_output = self.llm(prompt.text, **llm_kwargs)\n    if self.post_processor is not None:\n        final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]\n    else:\n        final_output = llm_output\n\n    return Document(final_output)\n
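
    A usage sketch of the per-call keyword arguments, assuming the pipeline from the Example Usage above (the llm_kwargs value is an assumption about what the underlying llm accepts):
    doc = pipeline(\n    word=\"lone\",  # **prompt_kwargs populate the prompt template\n    llm_kwargs={\"temperature\": 0},  # forwarded to the llm call (assumed supported)\n    post_processor_kwargs={},  # forwarded to the post-processor\n)\nprint(doc.text)\n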
    "},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline","title":"GatedLinearPipeline","text":"

    Bases: SimpleLinearPipeline

    A pipeline that extends the SimpleLinearPipeline class and adds a condition attribute.

    Attributes:

    condition (Callable[[IO_Type], Any]): A callable function that represents the condition.

    Example Usage
    from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\nfrom kotaemon.parsers import RegexExtractor\n\ndef identity(x):\n    return x\n\nllm = LCAzureChatOpenAI(\n    openai_api_base=\"your openai api base\",\n    openai_api_key=\"your openai api key\",\n    openai_api_version=\"your openai api version\",\n    deployment_name=\"dummy-q2-gpt35\",\n    temperature=0,\n    request_timeout=600,\n)\n\npipeline = GatedLinearPipeline(\n    prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n    condition=RegexExtractor(pattern=\"some pattern\"),\n    llm=llm,\n    post_processor=identity,\n)\nprint(pipeline(condition_text=\"some pattern\", word=\"lone\"))\nprint(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n
    Source code in libs/kotaemon/kotaemon/llms/linear.py
    class GatedLinearPipeline(SimpleLinearPipeline):\n    \"\"\"\n    A pipeline that extends the SimpleLinearPipeline class and adds a condition\n        attribute.\n\n    Attributes:\n        condition (Callable[[IO_Type], Any]): A callable function that represents the\n            condition.\n\n    Usage:\n        ```{.py3 title=\"Example Usage\"}\n        from kotaemon.llms import LCAzureChatOpenAI, BasePromptComponent\n        from kotaemon.parsers import RegexExtractor\n\n        def identity(x):\n            return x\n\n        llm = LCAzureChatOpenAI(\n            openai_api_base=\"your openai api base\",\n            openai_api_key=\"your openai api key\",\n            openai_api_version=\"your openai api version\",\n            deployment_name=\"dummy-q2-gpt35\",\n            temperature=0,\n            request_timeout=600,\n        )\n\n        pipeline = GatedLinearPipeline(\n            prompt=BasePromptComponent(template=\"what is {word} in Japanese ?\"),\n            condition=RegexExtractor(pattern=\"some pattern\"),\n            llm=llm,\n            post_processor=identity,\n        )\n        print(pipeline(condition_text=\"some pattern\", word=\"lone\"))\n        print(pipeline(condition_text=\"other pattern\", word=\"lone\"))\n        ```\n    \"\"\"\n\n    condition: Callable[[IO_Type], Any]\n\n    def run(\n        self,\n        *,\n        condition_text: Optional[str] = None,\n        llm_kwargs: Optional[dict] = {},\n        post_processor_kwargs: Optional[dict] = {},\n        **prompt_kwargs,\n    ) -> Document:\n        \"\"\"\n        Run the pipeline with the given arguments and return the final output as a\n            Document object.\n\n        Args:\n            condition_text (str): The condition text to evaluate. Default to None.\n            llm_kwargs (dict): Additional keyword arguments for the language model call.\n            post_processor_kwargs (dict): Additional keyword arguments for the\n                post-processor.\n            **prompt_kwargs: Keyword arguments for populating the prompt.\n\n        Returns:\n            Document: The final output of the pipeline as a Document object.\n\n        Raises:\n            ValueError: If condition_text is None\n        \"\"\"\n        if condition_text is None:\n            raise ValueError(\"`condition_text` must be provided\")\n\n        if self.condition(condition_text)[0]:\n            return super().run(\n                llm_kwargs=llm_kwargs,\n                post_processor_kwargs=post_processor_kwargs,\n                **prompt_kwargs,\n            )\n\n        return Document(None)\n
    "},{"location":"reference/llms/linear/#llms.linear.GatedLinearPipeline.run","title":"run","text":"
    run(\n    *,\n    condition_text=None,\n    llm_kwargs={},\n    post_processor_kwargs={},\n    **prompt_kwargs\n)\n

    Run the pipeline with the given arguments and return the final output as a Document object.

    Parameters:

    condition_text (str, default None): The condition text to evaluate.
    llm_kwargs (dict, default {}): Additional keyword arguments for the language model call.
    post_processor_kwargs (dict, default {}): Additional keyword arguments for the post-processor.
    **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:

    Document: The final output of the pipeline as a Document object.

    Raises:

    ValueError: If condition_text is None.

    Source code in libs/kotaemon/kotaemon/llms/linear.py
    def run(\n    self,\n    *,\n    condition_text: Optional[str] = None,\n    llm_kwargs: Optional[dict] = {},\n    post_processor_kwargs: Optional[dict] = {},\n    **prompt_kwargs,\n) -> Document:\n    \"\"\"\n    Run the pipeline with the given arguments and return the final output as a\n        Document object.\n\n    Args:\n        condition_text (str): The condition text to evaluate. Default to None.\n        llm_kwargs (dict): Additional keyword arguments for the language model call.\n        post_processor_kwargs (dict): Additional keyword arguments for the\n            post-processor.\n        **prompt_kwargs: Keyword arguments for populating the prompt.\n\n    Returns:\n        Document: The final output of the pipeline as a Document object.\n\n    Raises:\n        ValueError: If condition_text is None\n    \"\"\"\n    if condition_text is None:\n        raise ValueError(\"`condition_text` must be provided\")\n\n    if self.condition(condition_text)[0]:\n        return super().run(\n            llm_kwargs=llm_kwargs,\n            post_processor_kwargs=post_processor_kwargs,\n            **prompt_kwargs,\n        )\n\n    return Document(None)\n
    "},{"location":"reference/llms/chats/","title":"Chats","text":""},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API compatible endpoint.

    Attributes:

    endpoint_url (str): The URL of an OpenAI API compatible endpoint.
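
    A minimal construction sketch; the URL is a placeholder for your own OpenAI API compatible server:
    llm = EndpointChatLLM(endpoint_url=\"http://localhost:8000/v1/chat/completions\")  # placeholder URL\nresponse = llm(\"Hello!\")  # a plain string is wrapped in a HumanMessage\nprint(response.content)\n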

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

    Generate a response from messages.

    Parameters:

    messages (str | BaseMessage | list[BaseMessage]): history of messages to generate a response from.
    **kwargs: additional arguments to pass to the OpenAI API.

    Returns:

    LLMInterface: generated response.

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/chats/#llms.chats.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/#llms.chats.LCChatMixin","title":"LCChatMixin","text":"

    Mixin for langchain based chat models

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    class LCChatMixin:\n    \"\"\"Mixin for langchain based chat models\"\"\"\n\n    def _get_lc_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant Langchain class in _get_lc_class\"\n        )\n\n    def _get_tool_call_kwargs(self):\n        return {}\n\n    def __init__(self, stream: bool = False, **params):\n        self._lc_class = self._get_lc_class()\n        self._obj = self._lc_class(**params)\n        self._kwargs: dict = params\n        self._stream = stream\n\n        super().__init__()\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        if self._stream:\n            return self.stream(messages, **kwargs)  # type: ignore\n        return self.invoke(messages, **kwargs)\n\n    def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        return input_\n\n    def prepare_response(self, pred):\n        all_text = [each.text for each in pred.generations[0]]\n        all_messages = [each.message for each in pred.generations[0]]\n\n        completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n        try:\n            if pred.llm_output is not None:\n                completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n                total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n                prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n        except Exception:\n            pass\n\n        return LLMInterface(\n            text=all_text[0] if len(all_text) > 0 else \"\",\n            candidates=all_text,\n            completion_tokens=completion_tokens,\n            total_tokens=total_tokens,\n            prompt_tokens=prompt_tokens,\n            messages=all_messages,\n            logits=[],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Generate response from messages\n\n        Args:\n            messages: history of messages to generate response from\n            **kwargs: additional arguments to pass to the langchain chat model\n\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        input_ = self.prepare_message(messages)\n\n        if \"tools_pydantic\" in kwargs:\n            tools = kwargs.pop(\n                \"tools_pydantic\",\n            )\n            lc_tool_call = self._obj.bind_tools(tools)\n            pred = lc_tool_call.invoke(\n                input_,\n                **self._get_tool_call_kwargs(),\n            )\n            if pred.tool_calls:\n                tool_calls = pred.tool_calls\n            else:\n                tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n            output = LLMInterface(\n                content=\"\",\n                additional_kwargs={\"tool_calls\": tool_calls},\n            )\n        else:\n            pred = self._obj.generate(messages=[input_], **kwargs)\n            output = self.prepare_response(pred)\n\n        return output\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        input_ = self.prepare_message(messages)\n        pred = await self._obj.agenerate(messages=[input_], **kwargs)\n        return self.prepare_response(pred)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        for response in self._obj.stream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        async for response in self._obj.astream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    def to_langchain_format(self):\n        return self._obj\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name, value):\n        if name == \"_lc_class\":\n            return super().__setattr__(name, value)\n\n        if name in self._lc_class.__fields__:\n            self._kwargs[name] = value\n            self._obj = self._lc_class(**self._kwargs)\n        else:\n            super().__setattr__(name, value)\n\n    def __getattr__(self, name):\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def specs(self, path: str):\n        path = path.strip(\".\")\n        if \".\" in path:\n            raise ValueError(\"path should not contain '.'\")\n\n        if path in self._lc_class.__fields__:\n            return {\n                \"__type__\": \"theflow.base.ParamAttr\",\n                \"refresh_on_set\": True,\n                \"strict_type\": True,\n            }\n\n        raise ValueError(f\"Invalid param {path}\")\n
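
    A hypothetical sketch of how a concrete class uses this mixin: the subclass only returns the relevant langchain chat-model class from _get_lc_class, and the mixin instantiates it with the remaining constructor params (the class name and import path here are assumptions):
    class MyLCChatOpenAI(LCChatMixin, ChatLLM):  # hypothetical subclass\n    def _get_lc_class(self):\n        from langchain_openai import ChatOpenAI  # assumed import path\n\n        return ChatOpenAI\n\nllm = MyLCChatOpenAI(model=\"gpt-4o-mini\", api_key=\"...\")  # params go to the langchain class\n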
    "},{"location":"reference/llms/chats/#llms.chats.LCChatMixin.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Generate response from messages

    Parameters:

    messages (str | BaseMessage | list[BaseMessage], required): history of messages to generate a response from.
    **kwargs: additional arguments to pass to the langchain chat model.

    Returns:

    LLMInterface: generated response.

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Generate response from messages\n\n    Args:\n        messages: history of messages to generate response from\n        **kwargs: additional arguments to pass to the langchain chat model\n\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    input_ = self.prepare_message(messages)\n\n    if \"tools_pydantic\" in kwargs:\n        tools = kwargs.pop(\n            \"tools_pydantic\",\n        )\n        lc_tool_call = self._obj.bind_tools(tools)\n        pred = lc_tool_call.invoke(\n            input_,\n            **self._get_tool_call_kwargs(),\n        )\n        if pred.tool_calls:\n            tool_calls = pred.tool_calls\n        else:\n            tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n        output = LLMInterface(\n            content=\"\",\n            additional_kwargs={\"tool_calls\": tool_calls},\n        )\n    else:\n        pred = self._obj.generate(messages=[input_], **kwargs)\n        output = self.prepare_response(pred)\n\n    return output\n
    "},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

    Wrapper around the llama-cpp-python's Llama model

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around the llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
    "},{"location":"reference/llms/chats/#llms.chats.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
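
    A construction sketch; the endpoint, deployment name, API version, and key are placeholders for your own Azure resource:
    llm = AzureChatOpenAI(\n    azure_endpoint=\"https://<your-resource>.openai.azure.com/\",  # placeholder\n    azure_deployment=\"gpt-35-turbo\",  # placeholder deployment name\n    api_version=\"2024-02-01\",  # placeholder API version\n    api_key=\"...\",  # placeholder\n)\nprint(llm(\"Hello!\").content)\n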
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

    async_version (bool, default False): Whether to get the async version of the client.

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
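
    For example, a sketch of selecting the sync or async client:
    client = llm.prepare_client()  # openai.AzureOpenAI\naclient = llm.prepare_client(async_version=True)  # openai.AsyncAzureOpenAI\n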
    "},{"location":"reference/llms/chats/#llms.chats.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
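
    A construction sketch; the model name and key are placeholders, and base_url may point at any OpenAI API compatible server:
    llm = ChatOpenAI(\n    model=\"gpt-4o-mini\",  # placeholder model name\n    api_key=\"...\",  # placeholder\n    temperature=0,\n)\nprint(llm(\"Hello!\").content)\n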
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| async_version | bool | Whether to get the async version of the client | False |

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/chats/#llms.chats.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
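For comparison, a minimal sketch with the non-Azure client (the model name and key are placeholders; the import again assumes re-export from kotaemon.llms):

```python
from kotaemon.llms import ChatOpenAI

llm = ChatOpenAI(api_key="<your-api-key>", model="gpt-4o-mini")

# Parameters left as None are filtered out of the request payload,
# so only `model` and the messages are sent here.
print(llm.invoke("Summarize RAG in one sentence.").content)

# stream() yields one LLMInterface per delta chunk.
for chunk in llm.stream("Count from 1 to 5."):
    print(chunk.content, end="")
```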
    "},{"location":"reference/llms/chats/base/","title":"Base","text":""},{"location":"reference/llms/chats/endpoint_based/","title":"Endpoint Based","text":""},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM","title":"EndpointChatLLM","text":"

    Bases: ChatLLM

A ChatLLM that uses an endpoint to generate responses. It expects an OpenAI API-compatible endpoint.

    Attributes:

| Name | Type | Description |
| --- | --- | --- |
| endpoint_url | str | The URL of an OpenAI API-compatible endpoint |

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    class EndpointChatLLM(ChatLLM):\n    \"\"\"\n    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API\n    compatible endpoint.\n\n    Attributes:\n        endpoint_url (str): The url of a OpenAI API compatible endpoint.\n    \"\"\"\n\n    endpoint_url: str = Param(\n        help=\"URL of the OpenAI API compatible endpoint\", required=True\n    )\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"\n        Generate response from messages\n        Args:\n            messages (str | BaseMessage | list[BaseMessage]): history of messages to\n                generate response from\n            **kwargs: additional arguments to pass to the OpenAI API\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        def decide_role(message: BaseMessage):\n            if isinstance(message, SystemMessage):\n                return \"system\"\n            elif isinstance(message, AIMessage):\n                return \"assistant\"\n            else:\n                return \"user\"\n\n        request_json = {\n            \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n        }\n\n        response = requests.post(self.endpoint_url, json=request_json).json()\n\n        content = \"\"\n        candidates = []\n        if response[\"choices\"]:\n            candidates = [\n                each[\"message\"][\"content\"]\n                for each in response[\"choices\"]\n                if each[\"message\"][\"content\"]\n            ]\n            content = candidates[0]\n\n        return LLMInterface(\n            content=content,\n            candidates=candidates,\n            completion_tokens=response[\"usage\"][\"completion_tokens\"],\n            total_tokens=response[\"usage\"][\"total_tokens\"],\n            prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Same as run\"\"\"\n        return self.run(messages, **kwargs)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        return self.invoke(messages, **kwargs)\n
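A minimal sketch, assuming a local OpenAI-compatible server (the URL is a placeholder) and that EndpointChatLLM and the message classes are importable as shown:

```python
from kotaemon.base import HumanMessage, SystemMessage
from kotaemon.llms import EndpointChatLLM

llm = EndpointChatLLM(
    endpoint_url="http://localhost:8000/v1/chat/completions"  # placeholder URL
)

messages = [
    SystemMessage(content="You are a concise assistant."),
    HumanMessage(content="What does this endpoint wrapper do?"),
]

# run() POSTs the role-tagged messages and reads choices/usage from the reply.
result = llm.run(messages)
print(result.content, result.total_tokens)
```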
    "},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.run","title":"run","text":"
    run(messages, **kwargs)\n

Generate response from messages.

Parameters:

| Name | Type | Description |
| --- | --- | --- |
| messages | str \| BaseMessage \| list[BaseMessage] | History of messages to generate response from |
| **kwargs | | Additional arguments to pass to the OpenAI API |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| LLMInterface | LLMInterface | Generated response |

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def run(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"\n    Generate response from messages\n    Args:\n        messages (str | BaseMessage | list[BaseMessage]): history of messages to\n            generate response from\n        **kwargs: additional arguments to pass to the OpenAI API\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    def decide_role(message: BaseMessage):\n        if isinstance(message, SystemMessage):\n            return \"system\"\n        elif isinstance(message, AIMessage):\n            return \"assistant\"\n        else:\n            return \"user\"\n\n    request_json = {\n        \"messages\": [{\"content\": m.text, \"role\": decide_role(m)} for m in input_]\n    }\n\n    response = requests.post(self.endpoint_url, json=request_json).json()\n\n    content = \"\"\n    candidates = []\n    if response[\"choices\"]:\n        candidates = [\n            each[\"message\"][\"content\"]\n            for each in response[\"choices\"]\n            if each[\"message\"][\"content\"]\n        ]\n        content = candidates[0]\n\n    return LLMInterface(\n        content=content,\n        candidates=candidates,\n        completion_tokens=response[\"usage\"][\"completion_tokens\"],\n        total_tokens=response[\"usage\"][\"total_tokens\"],\n        prompt_tokens=response[\"usage\"][\"prompt_tokens\"],\n    )\n
    "},{"location":"reference/llms/chats/endpoint_based/#llms.chats.endpoint_based.EndpointChatLLM.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Same as run

    Source code in libs/kotaemon/kotaemon/llms/chats/endpoint_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Same as run\"\"\"\n    return self.run(messages, **kwargs)\n
    "},{"location":"reference/llms/chats/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin","title":"LCChatMixin","text":"

    Mixin for langchain based chat models

    Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
class LCChatMixin:\n    \"\"\"Mixin for langchain based chat models\"\"\"\n\n    def _get_lc_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant Langchain class in _get_lc_class\"\n        )\n\n    def _get_tool_call_kwargs(self):\n        return {}\n\n    def __init__(self, stream: bool = False, **params):\n        self._lc_class = self._get_lc_class()\n        self._obj = self._lc_class(**params)\n        self._kwargs: dict = params\n        self._stream = stream\n\n        super().__init__()\n\n    def run(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        if self._stream:\n            return self.stream(messages, **kwargs)  # type: ignore\n        return self.invoke(messages, **kwargs)\n\n    def prepare_message(self, messages: str | BaseMessage | list[BaseMessage]):\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        return input_\n\n    def prepare_response(self, pred):\n        all_text = [each.text for each in pred.generations[0]]\n        all_messages = [each.message for each in pred.generations[0]]\n\n        completion_tokens, total_tokens, prompt_tokens = 0, 0, 0\n        try:\n            if pred.llm_output is not None:\n                completion_tokens = pred.llm_output[\"token_usage\"][\"completion_tokens\"]\n                total_tokens = pred.llm_output[\"token_usage\"][\"total_tokens\"]\n                prompt_tokens = pred.llm_output[\"token_usage\"][\"prompt_tokens\"]\n        except Exception:\n            pass\n\n        return LLMInterface(\n            text=all_text[0] if len(all_text) > 0 else \"\",\n            candidates=all_text,\n            completion_tokens=completion_tokens,\n            total_tokens=total_tokens,\n            prompt_tokens=prompt_tokens,\n            messages=all_messages,\n            logits=[],\n        )\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        \"\"\"Generate response from messages\n\n        Args:\n            messages: history of messages to generate response from\n            **kwargs: additional arguments to pass to the langchain chat model\n\n        Returns:\n            LLMInterface: generated response\n        \"\"\"\n        input_ = self.prepare_message(messages)\n\n        if \"tools_pydantic\" in kwargs:\n            tools = kwargs.pop(\n                \"tools_pydantic\",\n            )\n            lc_tool_call = self._obj.bind_tools(tools)\n            pred = lc_tool_call.invoke(\n                input_,\n                **self._get_tool_call_kwargs(),\n            )\n            if pred.tool_calls:\n                tool_calls = pred.tool_calls\n            else:\n                tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n            output = LLMInterface(\n                content=\"\",\n                additional_kwargs={\"tool_calls\": tool_calls},\n            )\n        else:\n            pred = self._obj.generate(messages=[input_], **kwargs)\n            output = self.prepare_response(pred)\n\n        return output\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n        input_ = self.prepare_message(messages)\n        pred = await self._obj.agenerate(messages=[input_], **kwargs)\n        return self.prepare_response(pred)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        for response in self._obj.stream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        async for response in self._obj.astream(input=messages, **kwargs):\n            yield LLMInterface(content=response.content)\n\n    def to_langchain_format(self):\n        return self._obj\n\n    def __repr__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = repr(value_obj)\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __str__(self):\n        kwargs = []\n        for key, value_obj in self._kwargs.items():\n            value = str(value_obj)\n            if len(value) > 20:\n                value = f\"{value[:15]}...\"\n            kwargs.append(f\"{key}={value}\")\n        kwargs_repr = \", \".join(kwargs)\n        return f\"{self.__class__.__name__}({kwargs_repr})\"\n\n    def __setattr__(self, name, value):\n        if name == \"_lc_class\":\n            return super().__setattr__(name, value)\n\n        if name in self._lc_class.__fields__:\n            self._kwargs[name] = value\n            self._obj = self._lc_class(**self._kwargs)\n        else:\n            super().__setattr__(name, value)\n\n    def __getattr__(self, name):\n        if name in self._kwargs:\n            return self._kwargs[name]\n        return getattr(self._obj, name)\n\n    def dump(self, *args, **kwargs):\n        from theflow.utils.modules import serialize\n\n        params = {key: serialize(value) for key, value in self._kwargs.items()}\n        return {\n            \"__type__\": f\"{self.__module__}.{self.__class__.__qualname__}\",\n            **params,\n        }\n\n    def specs(self, path: str):\n        path = path.strip(\".\")\n        if \".\" in path:\n            raise ValueError(\"path should not contain '.'\")\n\n        if path in self._lc_class.__fields__:\n            return {\n                \"__type__\": \"theflow.base.ParamAttr\",\n                \"refresh_on_set\": True,\n                \"strict_type\": True,\n            }\n\n        raise ValueError(f\"Invalid param {path}\")\n
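To illustrate the subclassing contract, a hypothetical wrapper (the class name and the ChatOllama choice are illustrative; only `_get_lc_class` is required):

```python
from kotaemon.llms import ChatLLM
from kotaemon.llms.chats.langchain_based import LCChatMixin


class LCOllamaChat(LCChatMixin, ChatLLM):
    """Hypothetical wrapper around Langchain's ChatOllama, for illustration."""

    def _get_lc_class(self):
        # Return the Langchain chat class this mixin should instantiate;
        # constructor kwargs are forwarded to it verbatim.
        from langchain_community.chat_models import ChatOllama

        return ChatOllama


llm = LCOllamaChat(model="llama3")  # kwargs go straight to ChatOllama
```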
    "},{"location":"reference/llms/chats/langchain_based/#llms.chats.langchain_based.LCChatMixin.invoke","title":"invoke","text":"
    invoke(messages, **kwargs)\n

    Generate response from messages

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| messages | str \| BaseMessage \| list[BaseMessage] | History of messages to generate response from | required |
| **kwargs | | Additional arguments to pass to the Langchain chat model | {} |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| LLMInterface | LLMInterface | Generated response |

Source code in libs/kotaemon/kotaemon/llms/chats/langchain_based.py
    def invoke(\n    self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n) -> LLMInterface:\n    \"\"\"Generate response from messages\n\n    Args:\n        messages: history of messages to generate response from\n        **kwargs: additional arguments to pass to the langchain chat model\n\n    Returns:\n        LLMInterface: generated response\n    \"\"\"\n    input_ = self.prepare_message(messages)\n\n    if \"tools_pydantic\" in kwargs:\n        tools = kwargs.pop(\n            \"tools_pydantic\",\n        )\n        lc_tool_call = self._obj.bind_tools(tools)\n        pred = lc_tool_call.invoke(\n            input_,\n            **self._get_tool_call_kwargs(),\n        )\n        if pred.tool_calls:\n            tool_calls = pred.tool_calls\n        else:\n            tool_calls = pred.additional_kwargs.get(\"tool_calls\", [])\n\n        output = LLMInterface(\n            content=\"\",\n            additional_kwargs={\"tool_calls\": tool_calls},\n        )\n    else:\n        pred = self._obj.generate(messages=[input_], **kwargs)\n        output = self.prepare_response(pred)\n\n    return output\n
    "},{"location":"reference/llms/chats/llamacpp/","title":"Llamacpp","text":""},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat","title":"LlamaCppChat","text":"

    Bases: ChatLLM

Wrapper around llama-cpp-python's Llama model

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
class LlamaCppChat(ChatLLM):\n    \"\"\"Wrapper around llama-cpp-python's Llama model\"\"\"\n\n    model_path: Optional[str] = Param(\n        help=\"Path to the model file. This is required to load the model.\",\n    )\n    repo_id: Optional[str] = Param(\n        help=\"Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`.\"\n    )\n    filename: Optional[str] = Param(\n        help=\"A filename or glob pattern to match the model file in the repo.\"\n    )\n    chat_format: str = Param(\n        help=(\n            \"Chat format to use. Please refer to llama_cpp.llama_chat_format for a \"\n            \"list of supported formats. If blank, the chat format will be auto-\"\n            \"inferred.\"\n        ),\n        required=True,\n    )\n    lora_base: Optional[str] = Param(None, help=\"Path to the base Lora model\")\n    n_ctx: Optional[int] = Param(512, help=\"Text context, 0 = from model\")\n    n_gpu_layers: Optional[int] = Param(\n        0,\n        help=\"Number of layers to offload to GPU. If -1, all layers are offloaded\",\n    )\n    use_mmap: Optional[bool] = Param(\n        True,\n        help=(),\n    )\n    vocab_only: Optional[bool] = Param(\n        False,\n        help=\"If True, only the vocabulary is loaded. This is useful for debugging.\",\n    )\n\n    _role_mapper: dict[str, str] = {\n        \"human\": \"user\",\n        \"system\": \"system\",\n        \"ai\": \"assistant\",\n    }\n\n    @Param.auto()\n    def client_object(self) -> \"Llama\":\n        \"\"\"Get the llama-cpp-python client object\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError:\n            raise ImportError(\n                \"llama-cpp-python is not installed. \"\n                \"Please install it using `pip install llama-cpp-python`\"\n            )\n\n        errors = []\n        if not self.model_path and (not self.repo_id or not self.filename):\n            errors.append(\n                \"- `model_path` or `repo_id` and `filename` are required to load the\"\n                \" model\"\n            )\n\n        if not self.chat_format:\n            errors.append(\n                \"- `chat_format` is required to know how to format the chat messages. \"\n                \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n                \"formats.\"\n            )\n        if errors:\n            raise ValueError(\"\\n\".join(errors))\n\n        if self.model_path:\n            return Llama(\n                model_path=cast(str, self.model_path),\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n        else:\n            return Llama.from_pretrained(\n                repo_id=self.repo_id,\n                filename=self.filename,\n                chat_format=self.chat_format,\n                lora_base=self.lora_base,\n                n_ctx=self.n_ctx,\n                n_gpu_layers=self.n_gpu_layers,\n                use_mmap=self.use_mmap,\n                vocab_only=self.vocab_only,\n            )\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[dict]:\n        input_: list[BaseMessage] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        output_ = [\n            {\"role\": self._role_mapper[each.type], \"content\": each.content}\n            for each in input_\n        ]\n\n        return output_\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> LLMInterface:\n\n        pred: \"CCCR\" = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=False,\n        )\n\n        return LLMInterface(\n            content=pred[\"choices\"][0][\"message\"][\"content\"] if pred[\"choices\"] else \"\",\n            candidates=[\n                c[\"message\"][\"content\"]\n                for c in pred[\"choices\"]\n                if c[\"message\"][\"content\"]\n            ],\n            completion_tokens=pred[\"usage\"][\"completion_tokens\"],\n            total_tokens=pred[\"usage\"][\"total_tokens\"],\n            prompt_tokens=pred[\"usage\"][\"prompt_tokens\"],\n        )\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], **kwargs\n    ) -> Iterator[LLMInterface]:\n        pred = self.client_object.create_chat_completion(\n            messages=self.prepare_message(messages),\n            stream=True,\n        )\n        for chunk in pred:\n            if not chunk[\"choices\"]:\n                continue\n\n            if \"content\" not in chunk[\"choices\"][0][\"delta\"]:\n                continue\n\n            yield LLMInterface(content=chunk[\"choices\"][0][\"delta\"][\"content\"])\n
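A local-inference sketch (the Hub repo and filename pattern are illustrative placeholders, not a recommendation of a specific checkpoint):

```python
from kotaemon.llms import LlamaCppChat

llm = LlamaCppChat(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # illustrative repo
    filename="*Q4_K_M.gguf",                           # glob for the GGUF file
    chat_format="chatml",
    n_ctx=2048,
    n_gpu_layers=-1,  # offload all layers when a GPU build is installed
)

print(llm.invoke("Hi there!").content)

# Streaming yields one LLMInterface per content delta.
for chunk in llm.stream("Tell me a short joke."):
    print(chunk.content, end="")
```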
    "},{"location":"reference/llms/chats/llamacpp/#llms.chats.llamacpp.LlamaCppChat.client_object","title":"client_object","text":"
    client_object()\n

    Get the llama-cpp-python client object

    Source code in libs/kotaemon/kotaemon/llms/chats/llamacpp.py
    @Param.auto()\ndef client_object(self) -> \"Llama\":\n    \"\"\"Get the llama-cpp-python client object\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError:\n        raise ImportError(\n            \"llama-cpp-python is not installed. \"\n            \"Please install it using `pip install llama-cpp-python`\"\n        )\n\n    errors = []\n    if not self.model_path and (not self.repo_id or not self.filename):\n        errors.append(\n            \"- `model_path` or `repo_id` and `filename` are required to load the\"\n            \" model\"\n        )\n\n    if not self.chat_format:\n        errors.append(\n            \"- `chat_format` is required to know how to format the chat messages. \"\n            \"Please refer to llama_cpp.llama_chat_format for a list of supported \"\n            \"formats.\"\n        )\n    if errors:\n        raise ValueError(\"\\n\".join(errors))\n\n    if self.model_path:\n        return Llama(\n            model_path=cast(str, self.model_path),\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n    else:\n        return Llama.from_pretrained(\n            repo_id=self.repo_id,\n            filename=self.filename,\n            chat_format=self.chat_format,\n            lora_base=self.lora_base,\n            n_ctx=self.n_ctx,\n            n_gpu_layers=self.n_gpu_layers,\n            use_mmap=self.use_mmap,\n            vocab_only=self.vocab_only,\n        )\n
    "},{"location":"reference/llms/chats/openai/","title":"Openai","text":""},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI","title":"BaseChatOpenAI","text":"

    Bases: ChatLLM

    Base interface for OpenAI chat model, using the openai library

    This class exposes the parameters in resources.Chat. To subclass this class:

    - Implement the `prepare_client` method to return the OpenAI client\n- Implement the `openai_response` method to return the OpenAI response\n- Implement the params related to the OpenAI client\n
    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
class BaseChatOpenAI(ChatLLM):\n    \"\"\"Base interface for OpenAI chat model, using the openai library\n\n    This class exposes the parameters in resources.Chat. To subclass this class:\n\n        - Implement the `prepare_client` method to return the OpenAI client\n        - Implement the `openai_response` method to return the OpenAI response\n        - Implement the params related to the OpenAI client\n    \"\"\"\n\n    _dependencies = [\"openai\"]\n    _capabilities = [\"chat\", \"text\"]  # consider as mixin\n\n    api_key: str = Param(help=\"API key\", required=True)\n    timeout: Optional[float] = Param(None, help=\"Timeout for the API request\")\n    max_retries: Optional[int] = Param(\n        None, help=\"Maximum number of retries for the API request\"\n    )\n\n    temperature: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between 0 and 2 that controls the randomness of the generated \"\n            \"tokens. Lower values make the model more deterministic, while higher \"\n            \"values make the model more random.\"\n        ),\n    )\n    max_tokens: Optional[int] = Param(\n        None,\n        help=(\n            \"Maximum number of tokens to generate. The total length of input tokens \"\n            \"and generated tokens is limited by the model's context length.\"\n        ),\n    )\n    n: int = Param(\n        1,\n        help=(\n            \"Number of completions to generate. The API will generate n completions \"\n            \"for each prompt.\"\n        ),\n    )\n    stop: Optional[str | list[str]] = Param(\n        None,\n        help=(\n            \"Stop sequence. If a stop sequence is detected, generation will stop \"\n            \"at that point. If not specified, generation will continue until the \"\n            \"maximum token length is reached.\"\n        ),\n    )\n    frequency_penalty: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n            \"based on their existing frequency in the text so far, decreasing the \"\n            \"model's likelihood of repeating the same text.\"\n        ),\n    )\n    presence_penalty: Optional[float] = Param(\n        None,\n        help=(\n            \"Number between -2.0 and 2.0. Positive values penalize new tokens \"\n            \"based on their existing presence in the text so far, decreasing the \"\n            \"model's likelihood of repeating the same text.\"\n        ),\n    )\n    tool_choice: Optional[str] = Param(\n        None,\n        help=(\n            \"Choice of tool to use for the completion. Available choices are: \"\n            \"auto, default.\"\n        ),\n    )\n    tools: Optional[list[str]] = Param(\n        None,\n        help=\"List of tools to use for the completion.\",\n    )\n    logprobs: Optional[bool] = Param(\n        None,\n        help=(\n            \"Include log probabilities on the logprobs most likely tokens, \"\n            \"as well as the chosen token.\"\n        ),\n    )\n    logit_bias: Optional[dict] = Param(\n        None,\n        help=(\n            \"Dictionary of logit bias values to add to the logits of the tokens \"\n            \"in the vocabulary.\"\n        ),\n    )\n    top_logprobs: Optional[int] = Param(\n        None,\n        help=(\n            \"An integer between 0 and 5 specifying the number of most likely tokens \"\n            \"to return at each token position, each with an associated log \"\n            \"probability. `logprobs` must also be set to `true` if this parameter \"\n            \"is used.\"\n        ),\n    )\n    top_p: Optional[float] = Param(\n        None,\n        help=(\n            \"An alternative to sampling with temperature, called nucleus sampling, \"\n            \"where the model considers the results of the token with top_p \"\n            \"probability mass. So 0.1 means that only the tokens comprising the \"\n            \"top 10% probability mass are considered.\"\n        ),\n    )\n\n    @Param.auto(depends_on=[\"max_retries\"])\n    def max_retries_(self):\n        if self.max_retries is None:\n            from openai._constants import DEFAULT_MAX_RETRIES\n\n            return DEFAULT_MAX_RETRIES\n        return self.max_retries\n\n    def prepare_message(\n        self, messages: str | BaseMessage | list[BaseMessage]\n    ) -> list[\"ChatCompletionMessageParam\"]:\n        \"\"\"Prepare the message into OpenAI format\n\n        Returns:\n            list[dict]: List of messages in OpenAI format\n        \"\"\"\n        input_: list[BaseMessage] = []\n        output_: list[\"ChatCompletionMessageParam\"] = []\n\n        if isinstance(messages, str):\n            input_ = [HumanMessage(content=messages)]\n        elif isinstance(messages, BaseMessage):\n            input_ = [messages]\n        else:\n            input_ = messages\n\n        for message in input_:\n            output_.append(message.to_openai_format())\n\n        return output_\n\n    def prepare_output(self, resp: dict) -> LLMInterface:\n        \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n        additional_kwargs = {}\n        if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n            additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n                \"tool_calls\"\n            ]\n\n        if resp[\"choices\"][0].get(\"logprobs\") is None:\n            logprobs = []\n        else:\n            all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n            logprobs = (\n                [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n            )\n\n        output = LLMInterface(\n            candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n            content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n            total_tokens=resp[\"usage\"][\"total_tokens\"],\n            prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n            completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n            additional_kwargs=additional_kwargs,\n            messages=[\n                AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n                for _ in resp[\"choices\"]\n            ],\n            logprobs=logprobs,\n        )\n\n        return output\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        raise NotImplementedError\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        raise NotImplementedError\n\n    def invoke(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> LLMInterface:\n        client = self.prepare_client(async_version=False)\n        input_messages = self.prepare_message(messages)\n        resp = self.openai_response(\n            client, messages=input_messages, stream=False, **kwargs\n        ).dict()\n        return self.prepare_output(resp)\n\n    async def ainvoke(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> LLMInterface:\n        client = self.prepare_client(async_version=True)\n        input_messages = self.prepare_message(messages)\n        resp = await self.openai_response(\n            client, messages=input_messages, stream=False, **kwargs\n        ).dict()\n\n        return self.prepare_output(resp)\n\n    def stream(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> Iterator[LLMInterface]:\n        client = self.prepare_client(async_version=False)\n        input_messages = self.prepare_message(messages)\n        resp = self.openai_response(\n            client, messages=input_messages, stream=True, **kwargs\n        )\n\n        for c in resp:\n            chunk = c.dict()\n            if not chunk[\"choices\"]:\n                continue\n            if chunk[\"choices\"][0][\"delta\"][\"content\"] is not None:\n                if chunk[\"choices\"][0].get(\"logprobs\") is None:\n                    logprobs = []\n                else:\n                    logprobs = [\n                        logprob[\"logprob\"]\n                        for logprob in chunk[\"choices\"][0][\"logprobs\"].get(\n                            \"content\", []\n                        )\n                    ]\n\n                yield LLMInterface(\n                    content=chunk[\"choices\"][0][\"delta\"][\"content\"], logprobs=logprobs\n                )\n\n    async def astream(\n        self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs\n    ) -> AsyncGenerator[LLMInterface, None]:\n        client = self.prepare_client(async_version=True)\n        input_messages = self.prepare_message(messages)\n        resp = self.openai_response(\n            client, messages=input_messages, stream=True, **kwargs\n        )\n\n        async for chunk in resp:\n            if not chunk.choices:\n                continue\n            if chunk.choices[0].delta.content is not None:\n                yield LLMInterface(content=chunk.choices[0].delta.content)\n
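The two NotImplementedError hooks above are the whole subclassing surface. A hypothetical sketch of a custom provider (the class name and its base_url/model params are invented for illustration):

```python
from kotaemon.base import Param
from kotaemon.llms.chats.openai import BaseChatOpenAI


class ProxyChatOpenAI(BaseChatOpenAI):
    """Hypothetical subclass targeting an OpenAI-compatible proxy."""

    base_url: str = Param(help="Proxy base URL", required=True)
    model: str = Param(help="Model name", required=True)

    def prepare_client(self, async_version: bool = False):
        from openai import AsyncOpenAI, OpenAI

        # Same client either way; only sync vs. async differs.
        client_cls = AsyncOpenAI if async_version else OpenAI
        return client_cls(
            api_key=self.api_key,
            base_url=self.base_url,
            timeout=self.timeout,
            max_retries=self.max_retries_,
        )

    def openai_response(self, client, **kwargs):
        kwargs.pop("tools_pydantic", None)  # not part of the OpenAI payload
        return client.chat.completions.create(model=self.model, **kwargs)
```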
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_message","title":"prepare_message","text":"
    prepare_message(messages)\n

    Prepare the message into OpenAI format

    Returns:

| Type | Description |
| --- | --- |
| list[ChatCompletionMessageParam] | List of messages in OpenAI format |

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_message(\n    self, messages: str | BaseMessage | list[BaseMessage]\n) -> list[\"ChatCompletionMessageParam\"]:\n    \"\"\"Prepare the message into OpenAI format\n\n    Returns:\n        list[dict]: List of messages in OpenAI format\n    \"\"\"\n    input_: list[BaseMessage] = []\n    output_: list[\"ChatCompletionMessageParam\"] = []\n\n    if isinstance(messages, str):\n        input_ = [HumanMessage(content=messages)]\n    elif isinstance(messages, BaseMessage):\n        input_ = [messages]\n    else:\n        input_ = messages\n\n    for message in input_:\n        output_.append(message.to_openai_format())\n\n    return output_\n
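For instance (a sketch; the exact dict shape comes from BaseMessage.to_openai_format, and the key/model values are placeholders):

```python
from kotaemon.llms import ChatOpenAI

llm = ChatOpenAI(api_key="sk-placeholder", model="gpt-4o-mini")

# A bare string is wrapped in a single user-role message.
print(llm.prepare_message("hello"))
# -> roughly [{"role": "user", "content": "hello"}]
```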
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_output","title":"prepare_output","text":"
    prepare_output(resp)\n

    Convert the OpenAI response into LLMInterface

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_output(self, resp: dict) -> LLMInterface:\n    \"\"\"Convert the OpenAI response into LLMInterface\"\"\"\n    additional_kwargs = {}\n    if \"tool_calls\" in resp[\"choices\"][0][\"message\"]:\n        additional_kwargs[\"tool_calls\"] = resp[\"choices\"][0][\"message\"][\n            \"tool_calls\"\n        ]\n\n    if resp[\"choices\"][0].get(\"logprobs\") is None:\n        logprobs = []\n    else:\n        all_logprobs = resp[\"choices\"][0][\"logprobs\"].get(\"content\")\n        logprobs = (\n            [logprob[\"logprob\"] for logprob in all_logprobs] if all_logprobs else []\n        )\n\n    output = LLMInterface(\n        candidates=[(_[\"message\"][\"content\"] or \"\") for _ in resp[\"choices\"]],\n        content=resp[\"choices\"][0][\"message\"][\"content\"] or \"\",\n        total_tokens=resp[\"usage\"][\"total_tokens\"],\n        prompt_tokens=resp[\"usage\"][\"prompt_tokens\"],\n        completion_tokens=resp[\"usage\"][\"completion_tokens\"],\n        additional_kwargs=additional_kwargs,\n        messages=[\n            AIMessage(content=(_[\"message\"][\"content\"]) or \"\")\n            for _ in resp[\"choices\"]\n        ],\n        logprobs=logprobs,\n    )\n\n    return output\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| async_version | bool | Whether to get the async version of the client | False |

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.BaseChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    raise NotImplementedError\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI","title":"ChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class ChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model\"\"\"\n\n    base_url: Optional[str] = Param(None, help=\"OpenAI base URL\")\n    organization: Optional[str] = Param(None, help=\"OpenAI organization\")\n    model: str = Param(help=\"OpenAI model\", required=True)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"api_key\": self.api_key,\n            \"organization\": self.organization,\n            \"base_url\": self.base_url,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncOpenAI\n\n            return AsyncOpenAI(**params)\n\n        from openai import OpenAI\n\n        return OpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.model,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| async_version | bool | Whether to get the async version of the client | False |

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"api_key\": self.api_key,\n        \"organization\": self.organization,\n        \"base_url\": self.base_url,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncOpenAI\n\n        return AsyncOpenAI(**params)\n\n    from openai import OpenAI\n\n    return OpenAI(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.ChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.model,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI","title":"AzureChatOpenAI","text":"

    Bases: BaseChatOpenAI

    OpenAI chat model provided by Microsoft Azure

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    class AzureChatOpenAI(BaseChatOpenAI):\n    \"\"\"OpenAI chat model provided by Microsoft Azure\"\"\"\n\n    azure_endpoint: str = Param(\n        help=(\n            \"HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, \"\n            \"azure_deployment, and api_version parameters are used to construct \"\n            \"the full URL for the Azure OpenAI model.\"\n        ),\n        required=True,\n    )\n    azure_deployment: str = Param(help=\"Azure deployment name\", required=True)\n    api_version: str = Param(help=\"Azure model version\", required=True)\n    azure_ad_token: Optional[str] = Param(None, help=\"Azure AD token\")\n    azure_ad_token_provider: Optional[str] = Param(None, help=\"Azure AD token provider\")\n\n    @Param.auto(depends_on=[\"azure_ad_token_provider\"])\n    def azure_ad_token_provider_(self):\n        if isinstance(self.azure_ad_token_provider, str):\n            return import_dotted_string(self.azure_ad_token_provider, safe=False)\n\n    def prepare_client(self, async_version: bool = False):\n        \"\"\"Get the OpenAI client\n\n        Args:\n            async_version (bool): Whether to get the async version of the client\n        \"\"\"\n        params = {\n            \"azure_endpoint\": self.azure_endpoint,\n            \"api_version\": self.api_version,\n            \"api_key\": self.api_key,\n            \"azure_ad_token\": self.azure_ad_token,\n            \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n            \"timeout\": self.timeout,\n            \"max_retries\": self.max_retries_,\n        }\n        if async_version:\n            from openai import AsyncAzureOpenAI\n\n            return AsyncAzureOpenAI(**params)\n\n        from openai import AzureOpenAI\n\n        return AzureOpenAI(**params)\n\n    def openai_response(self, client, **kwargs):\n        \"\"\"Get the openai response\"\"\"\n        if \"tools_pydantic\" in kwargs:\n            kwargs.pop(\"tools_pydantic\")\n\n        params_ = {\n            \"model\": self.azure_deployment,\n            \"temperature\": self.temperature,\n            \"max_tokens\": self.max_tokens,\n            \"n\": self.n,\n            \"stop\": self.stop,\n            \"frequency_penalty\": self.frequency_penalty,\n            \"presence_penalty\": self.presence_penalty,\n            \"tool_choice\": self.tool_choice,\n            \"tools\": self.tools,\n            \"logprobs\": self.logprobs,\n            \"logit_bias\": self.logit_bias,\n            \"top_logprobs\": self.top_logprobs,\n            \"top_p\": self.top_p,\n        }\n        params = {k: v for k, v in params_.items() if v is not None}\n        params.update(kwargs)\n\n        return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.prepare_client","title":"prepare_client","text":"
    prepare_client(async_version=False)\n

    Get the OpenAI client

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| async_version | bool | Whether to get the async version of the client | False |

Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def prepare_client(self, async_version: bool = False):\n    \"\"\"Get the OpenAI client\n\n    Args:\n        async_version (bool): Whether to get the async version of the client\n    \"\"\"\n    params = {\n        \"azure_endpoint\": self.azure_endpoint,\n        \"api_version\": self.api_version,\n        \"api_key\": self.api_key,\n        \"azure_ad_token\": self.azure_ad_token,\n        \"azure_ad_token_provider\": self.azure_ad_token_provider_,\n        \"timeout\": self.timeout,\n        \"max_retries\": self.max_retries_,\n    }\n    if async_version:\n        from openai import AsyncAzureOpenAI\n\n        return AsyncAzureOpenAI(**params)\n\n    from openai import AzureOpenAI\n\n    return AzureOpenAI(**params)\n
    "},{"location":"reference/llms/chats/openai/#llms.chats.openai.AzureChatOpenAI.openai_response","title":"openai_response","text":"
    openai_response(client, **kwargs)\n

    Get the openai response

    Source code in libs/kotaemon/kotaemon/llms/chats/openai.py
    def openai_response(self, client, **kwargs):\n    \"\"\"Get the openai response\"\"\"\n    if \"tools_pydantic\" in kwargs:\n        kwargs.pop(\"tools_pydantic\")\n\n    params_ = {\n        \"model\": self.azure_deployment,\n        \"temperature\": self.temperature,\n        \"max_tokens\": self.max_tokens,\n        \"n\": self.n,\n        \"stop\": self.stop,\n        \"frequency_penalty\": self.frequency_penalty,\n        \"presence_penalty\": self.presence_penalty,\n        \"tool_choice\": self.tool_choice,\n        \"tools\": self.tools,\n        \"logprobs\": self.logprobs,\n        \"logit_bias\": self.logit_bias,\n        \"top_logprobs\": self.top_logprobs,\n        \"top_p\": self.top_p,\n    }\n    params = {k: v for k, v in params_.items() if v is not None}\n    params.update(kwargs)\n\n    return client.chat.completions.create(**params)\n
    "},{"location":"reference/llms/completions/","title":"Completions","text":""},{"location":"reference/llms/completions/#llms.completions.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
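A construction sketch (endpoint, deployment, version, and key values are placeholders; calling the component assumes the standard BaseComponent call path into run(), as with the chat mixin):

```python
from kotaemon.llms import AzureOpenAI

llm = AzureOpenAI(
    azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
    deployment_name="<your-completion-deployment>",              # placeholder
    openai_api_version="2024-02-01",                             # placeholder
    openai_api_key="<your-api-key>",                             # placeholder
    max_tokens=128,
)

# Completion-style call: raw prompt in, generated text out.
output = llm("Write a haiku about retrieval.")
print(output.text)  # assuming an LLMInterface result, as with the chat mixin
```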
    "},{"location":"reference/llms/completions/#llms.completions.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
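A local sketch (the model path is a placeholder for a GGUF/GGML file on disk; calling the component assumes the same call path as above):

```python
from kotaemon.llms import LlamaCpp

llm = LlamaCpp(
    model_path="/models/llama-2-7b.Q4_K_M.gguf",  # placeholder path
    n_ctx=2048,
    n_gpu_layers=-1,  # offload all layers if built with GPU support
)

output = llm("Q: What is 2 + 2?\nA:")
print(output.text)  # assuming an LLMInterface result
```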
    "},{"location":"reference/llms/completions/#llms.completions.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
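A sketch (the key is a placeholder; note that the default model_name above, "text-davinci-003", has since been retired by OpenAI, so passing a current completion model is advisable):

```python
from kotaemon.llms import OpenAI

llm = OpenAI(
    openai_api_key="<your-api-key>",      # placeholder
    model_name="gpt-3.5-turbo-instruct",  # the old default is retired
    max_tokens=64,
)

output = llm("Complete: The quick brown fox")
print(output.text)  # assuming an LLMInterface result
```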
    "},{"location":"reference/llms/completions/base/","title":"Base","text":""},{"location":"reference/llms/completions/langchain_based/","title":"Langchain Based","text":""},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.OpenAI","title":"OpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's OpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class OpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's OpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        openai_api_key: Optional[str] = None,\n        openai_api_base: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            openai_api_key=openai_api_key,\n            openai_api_base=openai_api_base,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import OpenAI\n        except ImportError:\n            from langchain.llms import OpenAI\n\n        return OpenAI\n
    "},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.AzureOpenAI","title":"AzureOpenAI","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's AzureOpenAI class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class AzureOpenAI(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's AzureOpenAI class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        azure_endpoint: Optional[str] = None,\n        deployment_name: Optional[str] = None,\n        openai_api_version: str = \"\",\n        openai_api_key: Optional[str] = None,\n        model_name: str = \"text-davinci-003\",\n        temperature: float = 0.7,\n        max_tokens: int = 256,\n        top_p: float = 1,\n        frequency_penalty: float = 0,\n        n: int = 1,\n        best_of: int = 1,\n        request_timeout: Optional[float] = None,\n        max_retries: int = 2,\n        streaming: bool = False,\n        **params,\n    ):\n        super().__init__(\n            azure_endpoint=azure_endpoint,\n            deployment_name=deployment_name,\n            openai_api_version=openai_api_version,\n            openai_api_key=openai_api_key,\n            model_name=model_name,\n            temperature=temperature,\n            max_tokens=max_tokens,\n            top_p=top_p,\n            frequency_penalty=frequency_penalty,\n            n=n,\n            best_of=best_of,\n            request_timeout=request_timeout,\n            max_retries=max_retries,\n            streaming=streaming,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_openai import AzureOpenAI\n        except ImportError:\n            from langchain.llms import AzureOpenAI\n\n        return AzureOpenAI\n
    "},{"location":"reference/llms/completions/langchain_based/#llms.completions.langchain_based.LlamaCpp","title":"LlamaCpp","text":"

    Bases: LCCompletionMixin, LLM

    Wrapper around Langchain's LlamaCpp class, focusing on key parameters

    Source code in libs/kotaemon/kotaemon/llms/completions/langchain_based.py
    class LlamaCpp(LCCompletionMixin, LLM):\n    \"\"\"Wrapper around Langchain's LlamaCpp class, focusing on key parameters\"\"\"\n\n    def __init__(\n        self,\n        model_path: str,\n        lora_base: Optional[str] = None,\n        n_ctx: int = 512,\n        n_gpu_layers: Optional[int] = None,\n        use_mmap: bool = True,\n        **params,\n    ):\n        super().__init__(\n            model_path=model_path,\n            lora_base=lora_base,\n            n_ctx=n_ctx,\n            n_gpu_layers=n_gpu_layers,\n            use_mmap=use_mmap,\n            **params,\n        )\n\n    def _get_lc_class(self):\n        try:\n            from langchain_community.llms import LlamaCpp\n        except ImportError:\n            from langchain.llms import LlamaCpp\n\n        return LlamaCpp\n
    "},{"location":"reference/llms/prompts/","title":"Prompts","text":""},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

    Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| template | PromptTemplate | The prompt template | required |
| **kwargs | | Any additional keyword arguments that will be used to populate the given template | {} |

Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, 
(int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
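As a quick orientation, here is a minimal usage sketch (assuming BasePromptComponent is re-exported from kotaemon.llms, as the reference paths suggest). Placeholder values may be passed at construction time or at run time, and run() returns a Document whose text is the populated template:

from kotaemon.llms import BasePromptComponent

# Template with two placeholders; int values are converted to str when rendering
prompt = BasePromptComponent(template="Hello {name}, you have {count} new messages.")

doc = prompt.run(name="Alice", count=3)
print(doc.text)                 # Hello Alice, you have 3 new messages.
print(doc.metadata["origin"])   # PromptComponent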
    "},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

Parameters:

    kwargs (dict): A dictionary with the attribute names as keys and the new values as values. Default: {}

Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
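A small sketch of incremental value setting (values are illustrative); placeholders can be supplied across several calls before rendering:

from kotaemon.llms import BasePromptComponent

prompt = BasePromptComponent(template="{role}: {content}")
prompt.set_value(role="system")            # values can be set in separate calls
prompt.set_value(content="Be concise.")
print(prompt.run().text)                   # system: Be concise.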
    "},{"location":"reference/llms/prompts/#llms.prompts.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

    Run the function with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to pass to the function. Default: {}

Returns:

    Document: the result of calling the populate method of the template object with the given keyword arguments, wrapped in a Document with metadata origin "PromptComponent".

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
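Because the value-preparation step executes callable values at render time (see the source above), run() also accepts callables; a minimal sketch:

from datetime import date

from kotaemon.llms import BasePromptComponent

prompt = BasePromptComponent(template="Today is {today}.")
# A callable value is invoked when the prompt is rendered
doc = prompt.run(today=lambda: date.today().isoformat())
print(doc.text)   # e.g. Today is 2024-09-23.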
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. 
If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
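A minimal sketch of direct template usage (assuming PromptTemplate is re-exported from kotaemon.llms); placeholders are parsed from the format string:

from kotaemon.llms import PromptTemplate

template = PromptTemplate("Translate the following text to {lang}:\n{text}")
print(sorted(template.placeholders))                           # ['lang', 'text']
print(template.populate(lang="French", text="Good morning"))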
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

This function checks that every placeholder expected by the template is present among the given keyword arguments. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

Returns:

    None

Raises:

    ValueError: If any placeholder in the template is missing from the given keyword arguments.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
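For example (a small sketch continuing the PromptTemplate usage above):

from kotaemon.llms import PromptTemplate

template = PromptTemplate("{greeting}, {name}!")
try:
    template.check_missing_kwargs(greeting="Hi")   # 'name' is not supplied
except ValueError as err:
    print(err)                                     # Missing keys in template: name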
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

Check for provided keyword arguments that do not match any placeholder in the template.

This function compares the given keyword arguments against the template's placeholders. If any keys are provided that are not in the template, a UserWarning is emitted with the names of the redundant keys.

Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
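A short sketch of the warning behaviour (values are illustrative):

import warnings

from kotaemon.llms import PromptTemplate

template = PromptTemplate("{city}")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    template.check_redundant_kwargs(city="Hanoi", country="Vietnam")
print(caught[0].message)   # Keys provided but not in template: country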
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

    str: The populated template.

Raises:

    ValueError: If any placeholder in the template is left without a value.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
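Strict population requires a value for every placeholder; a minimal sketch:

from kotaemon.llms import PromptTemplate

template = PromptTemplate("Summarize {doc} in {n} bullet points.")
print(template.populate(doc="the quarterly report", n=3))
# template.populate(doc="the quarterly report")  # would raise ValueError: n is missing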
    "},{"location":"reference/llms/prompts/#llms.prompts.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

    str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
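In contrast to populate, missing placeholders are kept in the output so they can be filled later; a sketch:

from kotaemon.llms import PromptTemplate

template = PromptTemplate("System: {system} User: {question}")
partial = template.partial_populate(system="You are a helpful assistant.")
print(partial)   # the unfilled {question} placeholder is re-emitted in the output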
    "},{"location":"reference/llms/prompts/base/","title":"Base","text":""},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent","title":"BasePromptComponent","text":"

    Bases: BaseComponent

    Base class for prompt components.

Parameters:

    template (PromptTemplate): The prompt template. Required.

    **kwargs: Any additional keyword arguments that will be used to populate the given template. Default: {}

Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    class BasePromptComponent(BaseComponent):\n    \"\"\"\n    Base class for prompt components.\n\n    Args:\n        template (PromptTemplate): The prompt template.\n        **kwargs: Any additional keyword arguments that will be used to populate the\n            given template.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n        allow_extra = True\n\n    template: str | PromptTemplate\n\n    @Param.auto(depends_on=\"template\")\n    def template__(self):\n        return (\n            self.template\n            if isinstance(self.template, PromptTemplate)\n            else PromptTemplate(self.template)\n        )\n\n    def __init__(self, **kwargs):\n        super().__init__(**kwargs)\n        self.__set(**kwargs)\n\n    def __check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check for redundant keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments.\n\n        Raises:\n            ValueError: If any keys provided are not in the template.\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_redundant_kwargs(**kwargs)\n\n    def __check_unset_placeholders(self):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        self.template__.check_missing_kwargs(**self.__dict__)\n\n    def __validate_value_type(self, **kwargs):\n        \"\"\"\n        Validates the value types of the given keyword arguments.\n\n        Parameters:\n            **kwargs (dict): A dictionary of keyword arguments to be validated.\n\n        Raises:\n            ValueError: If any of the values in the kwargs dictionary have an\n                unsupported type.\n\n        Returns:\n            None\n        \"\"\"\n        type_error = []\n        for k, v in kwargs.items():\n            if k.startswith(\"template\"):\n                continue\n            if not isinstance(v, (str, int, Document, Callable)):  # type: ignore\n                type_error.append((k, type(v)))\n\n        if type_error:\n            raise ValueError(\n                \"Type of values must be either int, str, Document, Callable, \"\n                f\"found unsupported type for (key, type): {type_error}\"\n            )\n\n    def __set(self, **kwargs):\n        \"\"\"\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__check_redundant_kwargs(**kwargs)\n        self.__validate_value_type(**kwargs)\n\n        self.__dict__.update(kwargs)\n\n    def __prepare_value(self):\n        \"\"\"\n        Generate a dictionary of keyword arguments based on the template's placeholders\n            and the current instance's attributes.\n\n        Returns:\n            dict: A dictionary of keyword arguments.\n        \"\"\"\n\n        def __prepare(key, value):\n            if isinstance(value, str):\n                return value\n            if isinstance(value, 
(int, Document)):\n                return str(value)\n\n            raise ValueError(\n                f\"Unsupported type {type(value)} for template value of key {key}\"\n            )\n\n        kwargs = {}\n        for k in self.template__.placeholders:\n            v = getattr(self, k)\n\n            # if get a callable, execute to get its output\n            if isinstance(v, Callable):  # type: ignore[arg-type]\n                v = v()\n\n            if isinstance(v, list):\n                v = str([__prepare(k, each) for each in v])\n            elif isinstance(v, (str, int, Document)):\n                v = __prepare(k, v)\n            else:\n                raise ValueError(\n                    f\"Unsupported type {type(v)} for template value of key `{k}`\"\n                )\n            kwargs[k] = v\n\n        return kwargs\n\n    def set_value(self, **kwargs):\n        \"\"\"\n        Similar to `__set` but for external use.\n\n        Set the values of the attributes in the object based on the provided keyword\n            arguments.\n\n        Args:\n            kwargs (dict): A dictionary with the attribute names as keys and the new\n                values as values.\n\n        Returns:\n            None\n        \"\"\"\n        self.__set(**kwargs)\n\n    def run(self, **kwargs):\n        \"\"\"\n        Run the function with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to pass to the function.\n\n        Returns:\n            The result of calling the `populate` method of the `template` object\n            with the given keyword arguments.\n        \"\"\"\n        self.__set(**kwargs)\n        self.__check_unset_placeholders()\n        prepared_kwargs = self.__prepare_value()\n\n        text = self.template__.populate(**prepared_kwargs)\n        return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n\n    def flow(self):\n        return self.__call__()\n
    "},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.set_value","title":"set_value","text":"
    set_value(**kwargs)\n

    Similar to __set but for external use.

    Set the values of the attributes in the object based on the provided keyword arguments.

Parameters:

    kwargs (dict): A dictionary with the attribute names as keys and the new values as values. Default: {}

Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def set_value(self, **kwargs):\n    \"\"\"\n    Similar to `__set` but for external use.\n\n    Set the values of the attributes in the object based on the provided keyword\n        arguments.\n\n    Args:\n        kwargs (dict): A dictionary with the attribute names as keys and the new\n            values as values.\n\n    Returns:\n        None\n    \"\"\"\n    self.__set(**kwargs)\n
    "},{"location":"reference/llms/prompts/base/#llms.prompts.base.BasePromptComponent.run","title":"run","text":"
    run(**kwargs)\n

    Run the function with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to pass to the function. Default: {}

Returns:

    Document: the result of calling the populate method of the template object with the given keyword arguments, wrapped in a Document with metadata origin "PromptComponent".

    Source code in libs/kotaemon/kotaemon/llms/prompts/base.py
    def run(self, **kwargs):\n    \"\"\"\n    Run the function with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to pass to the function.\n\n    Returns:\n        The result of calling the `populate` method of the `template` object\n        with the given keyword arguments.\n    \"\"\"\n    self.__set(**kwargs)\n    self.__check_unset_placeholders()\n    prepared_kwargs = self.__prepare_value()\n\n    text = self.template__.populate(**prepared_kwargs)\n    return Document(text=text, metadata={\"origin\": \"PromptComponent\"})\n
    "},{"location":"reference/llms/prompts/template/","title":"Template","text":""},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate","title":"PromptTemplate","text":"

    Base class for prompt templates.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    class PromptTemplate:\n    \"\"\"\n    Base class for prompt templates.\n    \"\"\"\n\n    def __init__(self, template: str, ignore_invalid=True):\n        template = template\n        formatter = Formatter()\n        parsed_template = list(formatter.parse(template))\n\n        placeholders = set()\n        for _, key, _, _ in parsed_template:\n            if key is None:\n                continue\n            if not key.isidentifier():\n                if ignore_invalid:\n                    warnings.warn(f\"Ignore invalid placeholder: {key}.\", UserWarning)\n                else:\n                    raise ValueError(\n                        \"Placeholder name must be a valid Python identifier, found:\"\n                        f\" {key}.\"\n                    )\n            placeholders.add(key)\n\n        self.template = template\n        self.placeholders = placeholders\n        self.__formatter = formatter\n        self.__parsed_template = parsed_template\n\n    def check_missing_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        missing_keys = self.placeholders.difference(kwargs.keys())\n        if missing_keys:\n            raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n\n    def check_redundant_kwargs(self, **kwargs):\n        \"\"\"\n        Check if all the placeholders in the template are set.\n\n        This function checks if all the expected placeholders in the template are set as\n            attributes of the object. 
If any placeholders are missing, a `ValueError`\n            is raised with the names of the missing keys.\n\n        Parameters:\n            None\n\n        Returns:\n            None\n        \"\"\"\n        provided_keys = set(kwargs.keys())\n        redundant_keys = provided_keys - self.placeholders\n\n        if redundant_keys:\n            warnings.warn(\n                f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n                UserWarning,\n            )\n\n    def populate(self, **kwargs) -> str:\n        \"\"\"\n        Strictly populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            The populated template.\n\n        Raises:\n            ValueError: If an unknown placeholder is provided.\n        \"\"\"\n        self.check_missing_kwargs(**kwargs)\n\n        return self.partial_populate(**kwargs)\n\n    def partial_populate(self, **kwargs):\n        \"\"\"\n        Partially populate the template with the given keyword arguments.\n\n        Args:\n            **kwargs: The keyword arguments to populate the template.\n                      Each keyword corresponds to a placeholder in the template.\n\n        Returns:\n            str: The populated template.\n        \"\"\"\n        self.check_redundant_kwargs(**kwargs)\n\n        prompt = []\n        for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n            prompt.append(literal_text)\n\n            if field_name is None:\n                continue\n\n            if field_name not in kwargs:\n                if conversion:\n                    value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n                else:\n                    value = f\"{{{field_name}:{format_spec}}}\"\n            else:\n                value = kwargs[field_name]\n                if conversion is not None:\n                    value = self.__formatter.convert_field(value, conversion)\n                if format_spec is not None:\n                    value = self.__formatter.format_field(value, format_spec)\n\n            prompt.append(value)\n\n        return \"\".join(prompt)\n\n    def __add__(self, other):\n        \"\"\"\n        Create a new PromptTemplate object by concatenating the template of the current\n            object with the template of another PromptTemplate object.\n\n        Parameters:\n            other (PromptTemplate): Another PromptTemplate object.\n\n        Returns:\n            PromptTemplate: A new PromptTemplate object with the concatenated templates.\n        \"\"\"\n        return PromptTemplate(self.template + \"\\n\" + other.template)\n
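Templates can also be concatenated with +, which joins the two template strings with a newline (see __add__ above); a sketch:

from kotaemon.llms import PromptTemplate

header = PromptTemplate("Context: {context}")
footer = PromptTemplate("Question: {question}")
combined = header + footer
print(sorted(combined.placeholders))   # ['context', 'question']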
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_missing_kwargs","title":"check_missing_kwargs","text":"
    check_missing_kwargs(**kwargs)\n

    Check if all the placeholders in the template are set.

This function checks that every placeholder expected by the template is present among the given keyword arguments. If any placeholders are missing, a ValueError is raised with the names of the missing keys.

Returns:

    None

Raises:

    ValueError: If any placeholder in the template is missing from the given keyword arguments.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_missing_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    missing_keys = self.placeholders.difference(kwargs.keys())\n    if missing_keys:\n        raise ValueError(f\"Missing keys in template: {','.join(missing_keys)}\")\n
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.check_redundant_kwargs","title":"check_redundant_kwargs","text":"
    check_redundant_kwargs(**kwargs)\n

Check for provided keyword arguments that do not match any placeholder in the template.

This function compares the given keyword arguments against the template's placeholders. If any keys are provided that are not in the template, a UserWarning is emitted with the names of the redundant keys.

Returns:

    None

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def check_redundant_kwargs(self, **kwargs):\n    \"\"\"\n    Check if all the placeholders in the template are set.\n\n    This function checks if all the expected placeholders in the template are set as\n        attributes of the object. If any placeholders are missing, a `ValueError`\n        is raised with the names of the missing keys.\n\n    Parameters:\n        None\n\n    Returns:\n        None\n    \"\"\"\n    provided_keys = set(kwargs.keys())\n    redundant_keys = provided_keys - self.placeholders\n\n    if redundant_keys:\n        warnings.warn(\n            f\"Keys provided but not in template: {','.join(redundant_keys)}\",\n            UserWarning,\n        )\n
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.populate","title":"populate","text":"
    populate(**kwargs)\n

    Strictly populate the template with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

    str: The populated template.

Raises:

    ValueError: If any placeholder in the template is left without a value.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def populate(self, **kwargs) -> str:\n    \"\"\"\n    Strictly populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        The populated template.\n\n    Raises:\n        ValueError: If an unknown placeholder is provided.\n    \"\"\"\n    self.check_missing_kwargs(**kwargs)\n\n    return self.partial_populate(**kwargs)\n
    "},{"location":"reference/llms/prompts/template/#llms.prompts.template.PromptTemplate.partial_populate","title":"partial_populate","text":"
    partial_populate(**kwargs)\n

    Partially populate the template with the given keyword arguments.

Parameters:

    **kwargs: The keyword arguments to populate the template. Each keyword corresponds to a placeholder in the template. Default: {}

Returns:

    str: The populated template.

    Source code in libs/kotaemon/kotaemon/llms/prompts/template.py
    def partial_populate(self, **kwargs):\n    \"\"\"\n    Partially populate the template with the given keyword arguments.\n\n    Args:\n        **kwargs: The keyword arguments to populate the template.\n                  Each keyword corresponds to a placeholder in the template.\n\n    Returns:\n        str: The populated template.\n    \"\"\"\n    self.check_redundant_kwargs(**kwargs)\n\n    prompt = []\n    for literal_text, field_name, format_spec, conversion in self.__parsed_template:\n        prompt.append(literal_text)\n\n        if field_name is None:\n            continue\n\n        if field_name not in kwargs:\n            if conversion:\n                value = f\"{{{field_name}}}!{conversion}:{format_spec}\"\n            else:\n                value = f\"{{{field_name}:{format_spec}}}\"\n        else:\n            value = kwargs[field_name]\n            if conversion is not None:\n                value = self.__formatter.convert_field(value, conversion)\n            if format_spec is not None:\n                value = self.__formatter.format_field(value, format_spec)\n\n        prompt.append(value)\n\n    return \"\".join(prompt)\n
    "},{"location":"reference/loaders/","title":"Loaders","text":""},{"location":"reference/loaders/#loaders.AdobeReader","title":"AdobeReader","text":"

    Bases: BaseReader

Read PDFs using Adobe's PDF Services API. It can extract text, tables, and figures with high accuracy.

    Example
>> from kotaemon.loaders import AdobeReader
>> reader = AdobeReader()
>> documents = reader.load_data("path/to/pdf")

Args: vlm_endpoint: URL to the Vision Language Model endpoint. If not provided, the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT will be used.

max_figures_to_caption: an int that decides how many figures will be captioned. The rest will be ignored (indexed without captions).
    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
    class AdobeReader(BaseReader):\n    \"\"\"Read PDF using the Adobe's PDF Services.\n    Be able to extract text, table, and figure with high accuracy\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import AdobeReader\n        >> reader = AdobeReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n    Args:\n        endpoint: URL to the Vision Language Model endpoint. If not provided,\n        will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n        max_figures_to_caption: an int decides how many figured will be captioned.\n        The rest will be ignored (are indexed without captions).\n    \"\"\"\n\n    def __init__(\n        self,\n        vlm_endpoint: Optional[str] = None,\n        max_figures_to_caption: int = 100,\n        *args: Any,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params\"\"\"\n        super().__init__(*args)\n        self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n        self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n        self.max_figures_to_caption = max_figures_to_caption\n\n    def load_data(\n        self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data by calling to the Adobe's API\n\n        Args:\n            file (Path): Path to the PDF file\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file,\n                includes 3 types: text, table, and image\n\n        \"\"\"\n        from .utils.adobe import (\n            generate_figure_captions,\n            load_json,\n            parse_figure_paths,\n            parse_table_paths,\n            request_adobe_service,\n        )\n\n        filename = file.name\n        filepath = str(Path(file).resolve())\n        output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n        results_path = os.path.join(output_path, \"structuredData.json\")\n\n        if not os.path.exists(results_path):\n            logger.exception(\"Fail to parse the document.\")\n            return []\n\n        data = load_json(results_path)\n\n        texts = defaultdict(list)\n        tables = []\n        figures = []\n\n        elements = data[\"elements\"]\n        for item_id, item in enumerate(elements):\n            page_number = item.get(\"Page\", -1) + 1\n            item_path = item[\"Path\"]\n            item_text = item.get(\"Text\", \"\")\n\n            file_paths = [\n                Path(output_path) / path for path in item.get(\"filePaths\", [])\n            ]\n            prev_item = elements[item_id - 1]\n            title = prev_item.get(\"Text\", \"\")\n\n            if re.search(self.table_regex, item_path):\n                table_content = parse_table_paths(file_paths)\n                if not table_content:\n                    continue\n                table_caption = (\n                    table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                    + f\"\\n(Table in Page {page_number}. {title})\"\n                )\n                tables.append((page_number, table_content, table_caption))\n\n            elif re.search(self.figure_regex, item_path):\n                figure_caption = (\n                    item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n                )\n                figure_content = parse_figure_paths(file_paths)\n                if not figure_content:\n                    continue\n                figures.append([page_number, figure_content, figure_caption])\n\n            else:\n                if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                    texts[page_number].append(item_text)\n\n        # get figure caption using GPT-4V\n        figure_captions = generate_figure_captions(\n            self.vlm_endpoint,\n            [item[1] for item in figures],\n            self.max_figures_to_caption,\n        )\n        for item, caption in zip(figures, figure_captions):\n            # update figure caption\n            item[2] += \" \" + caption\n\n        # Wrap elements with Document\n        documents = []\n\n        # join plain text elements\n        for page_number, txts in texts.items():\n            documents.append(\n                Document(\n                    text=\"\\n\".join(txts),\n                    metadata={\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                )\n            )\n\n        # table elements\n        for page_number, table_content, table_caption in tables:\n            documents.append(\n                Document(\n                    text=table_content,\n                    metadata={\n                        \"table_origin\": table_content,\n                        \"type\": \"table\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        # figure elements\n        for page_number, figure_content, figure_caption in figures:\n            documents.append(\n                Document(\n                    text=figure_caption,\n                    metadata={\n                        \"image_origin\": figure_content,\n                        \"type\": \"image\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n        return documents\n
    "},{"location":"reference/loaders/#loaders.AdobeReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, **kwargs)\n

Load data by calling Adobe's PDF Services API

    Parameters:

    Name Type Description Default file Path

    Path to the PDF file

    required

    Returns:

    Type Description List[Document]

    List[Document]: list of documents extracted from the PDF file, includes 3 types: text, table, and image

    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
    def load_data(\n    self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data by calling to the Adobe's API\n\n    Args:\n        file (Path): Path to the PDF file\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file,\n            includes 3 types: text, table, and image\n\n    \"\"\"\n    from .utils.adobe import (\n        generate_figure_captions,\n        load_json,\n        parse_figure_paths,\n        parse_table_paths,\n        request_adobe_service,\n    )\n\n    filename = file.name\n    filepath = str(Path(file).resolve())\n    output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n    results_path = os.path.join(output_path, \"structuredData.json\")\n\n    if not os.path.exists(results_path):\n        logger.exception(\"Fail to parse the document.\")\n        return []\n\n    data = load_json(results_path)\n\n    texts = defaultdict(list)\n    tables = []\n    figures = []\n\n    elements = data[\"elements\"]\n    for item_id, item in enumerate(elements):\n        page_number = item.get(\"Page\", -1) + 1\n        item_path = item[\"Path\"]\n        item_text = item.get(\"Text\", \"\")\n\n        file_paths = [\n            Path(output_path) / path for path in item.get(\"filePaths\", [])\n        ]\n        prev_item = elements[item_id - 1]\n        title = prev_item.get(\"Text\", \"\")\n\n        if re.search(self.table_regex, item_path):\n            table_content = parse_table_paths(file_paths)\n            if not table_content:\n                continue\n            table_caption = (\n                table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                + f\"\\n(Table in Page {page_number}. {title})\"\n            )\n            tables.append((page_number, table_content, table_caption))\n\n        elif re.search(self.figure_regex, item_path):\n            figure_caption = (\n                item_text + f\"\\n(Figure in Page {page_number}. 
{title})\"\n            )\n            figure_content = parse_figure_paths(file_paths)\n            if not figure_content:\n                continue\n            figures.append([page_number, figure_content, figure_caption])\n\n        else:\n            if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                texts[page_number].append(item_text)\n\n    # get figure caption using GPT-4V\n    figure_captions = generate_figure_captions(\n        self.vlm_endpoint,\n        [item[1] for item in figures],\n        self.max_figures_to_caption,\n    )\n    for item, caption in zip(figures, figure_captions):\n        # update figure caption\n        item[2] += \" \" + caption\n\n    # Wrap elements with Document\n    documents = []\n\n    # join plain text elements\n    for page_number, txts in texts.items():\n        documents.append(\n            Document(\n                text=\"\\n\".join(txts),\n                metadata={\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n            )\n        )\n\n    # table elements\n    for page_number, table_content, table_caption in tables:\n        documents.append(\n            Document(\n                text=table_content,\n                metadata={\n                    \"table_origin\": table_content,\n                    \"type\": \"table\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n\n    # figure elements\n    for page_number, figure_content, figure_caption in figures:\n        documents.append(\n            Document(\n                text=figure_caption,\n                metadata={\n                    \"image_origin\": figure_content,\n                    \"type\": \"image\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n    return documents\n
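A minimal sketch of consuming the result (the file path is illustrative, and Adobe PDF Services credentials are assumed to be configured). Table and image documents can be told apart by the "type" key in their metadata:

from pathlib import Path

from kotaemon.loaders import AdobeReader

reader = AdobeReader()
docs = reader.load_data(Path("path/to/report.pdf"))

# Split the returned documents by extracted modality
tables = [d for d in docs if d.metadata.get("type") == "table"]
figures = [d for d in docs if d.metadata.get("type") == "image"]
texts = [d for d in docs if d.metadata.get("type") not in ("table", "image")]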
    "},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":"

    Bases: BaseReader

Utilize Azure AI Document Intelligence to parse documents

As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx, and html.

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    class AzureAIDocumentIntelligenceLoader(BaseReader):\n    \"\"\"Utilize Azure AI Document Intelligence to parse document\n\n    As of April 24, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n    heif, docx, xlsx, pptx and html.\n    \"\"\"\n\n    _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n    endpoint: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n        help=\"Endpoint of Azure AI Document Intelligence\",\n    )\n    credential: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n        help=\"Credential of Azure AI Document Intelligence\",\n    )\n    model: str = Param(\n        \"prebuilt-layout\",\n        help=(\n            \"Model to use for document analysis. Default is prebuilt-layout. \"\n            \"As of April 24, you can view the supported models [here]\"\n            \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n            \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n            \"#model-analysis-features)\"\n        ),\n    )\n    output_content_format: str = Param(\n        \"markdown\",\n        help=\"Output content format. Can be 'markdown' or 'text'.Default is markdown\",\n    )\n    vlm_endpoint: str = Param(\n        help=(\n            \"Default VLM endpoint for figure captioning. If not provided, will not \"\n            \"caption the figures\"\n        )\n    )\n    figure_friendly_filetypes: list[str] = Param(\n        [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n        help=(\n            \"File types that we can reliably open and extract figures. \"\n            \"For files like .docx or .html, the visual layout may be different \"\n            \"when viewed from different tools, hence we cannot use Azure DI \"\n            \"location to extract figures.\"\n        ),\n    )\n    cache_dir: str = Param(\n        None,\n        help=\"Directory to cache the downloaded files. 
Default is None\",\n    )\n\n    @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n    def client_(self):\n        try:\n            from azure.ai.documentintelligence import DocumentIntelligenceClient\n            from azure.core.credentials import AzureKeyCredential\n        except ImportError:\n            raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n        return DocumentIntelligenceClient(\n            self.endpoint, AzureKeyCredential(self.credential)\n        )\n\n    def run(\n        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n        metadata = extra_info or {}\n        file_name = Path(file_path)\n        with open(file_path, \"rb\") as fi:\n            poller = self.client_.begin_analyze_document(\n                self.model,\n                analyze_request=fi,\n                content_type=\"application/octet-stream\",\n                output_content_format=self.output_content_format,\n            )\n            result = poller.result()\n\n        # the total text content of the document in `output_content_format` format\n        text_content = result.content\n        removed_spans: list[dict] = []\n\n        # extract the figures\n        figures = []\n        for figure_desc in result.get(\"figures\", []):\n            if not self.vlm_endpoint:\n                continue\n            if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n                continue\n\n            # read & crop the image\n            page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n            page_width = result.pages[page_number - 1][\"width\"]\n            page_height = result.pages[page_number - 1][\"height\"]\n            polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n            xs = [polygon[i] for i in range(0, len(polygon), 2)]\n            ys = [polygon[i] for i in range(1, len(polygon), 2)]\n            bbox = [\n                min(xs) / page_width,\n                min(ys) / page_height,\n                max(xs) / page_width,\n                max(ys) / page_height,\n            ]\n            img = crop_image(file_path, bbox, page_number - 1)\n\n            # convert the image into base64\n            img_bytes = BytesIO()\n            img.save(img_bytes, format=\"PNG\")\n            img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n            img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n            # caption the image\n            caption = generate_single_figure_caption(\n                figure=img_base64, vlm_endpoint=self.vlm_endpoint\n            )\n\n            # store the image into document\n            figure_metadata = {\n                \"image_origin\": img_base64,\n                \"type\": \"image\",\n                \"page_label\": page_number,\n            }\n            figure_metadata.update(metadata)\n\n            figures.append(\n                Document(\n                    text=caption,\n                    metadata=figure_metadata,\n                )\n            )\n            removed_spans += figure_desc[\"spans\"]\n\n        # extract the tables\n        tables = []\n        for table_desc in result.get(\"tables\", 
[]):\n            if not table_desc[\"spans\"]:\n                continue\n\n            # convert the tables into markdown format\n            boundingRegions = table_desc[\"boundingRegions\"]\n            if boundingRegions:\n                page_number = boundingRegions[0][\"pageNumber\"]\n            else:\n                page_number = 1\n\n            # store the tables into document\n            offset = table_desc[\"spans\"][0][\"offset\"]\n            length = table_desc[\"spans\"][0][\"length\"]\n            table_metadata = {\n                \"type\": \"table\",\n                \"page_label\": page_number,\n                \"table_origin\": text_content[offset : offset + length],\n            }\n            table_metadata.update(metadata)\n\n            tables.append(\n                Document(\n                    text=text_content[offset : offset + length],\n                    metadata=table_metadata,\n                )\n            )\n            removed_spans += table_desc[\"spans\"]\n        # save the text content into markdown format\n        if self.cache_dir is not None:\n            with open(\n                Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n            ) as f:\n                f.write(text_content)\n\n        removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n        for span in removed_spans:\n            text_content = (\n                text_content[: span[\"offset\"]]\n                + text_content[span[\"offset\"] + span[\"length\"] :]\n            )\n\n        return [Document(content=text_content, metadata=metadata)] + figures + tables\n
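A minimal usage sketch, assuming the loader is exported from kotaemon.loaders like the other readers. Endpoint and credential fall back to the AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT / AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL environment variables; the values below are placeholders:

from kotaemon.loaders import AzureAIDocumentIntelligenceLoader

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<your-resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<api-key>",                                           # placeholder
    output_content_format="markdown",
)
# The first returned Document holds the page text; figure and table
# Documents (if any) follow it.
docs = loader.run("path/to/file.pdf")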
    "},{"location":"reference/loaders/#loaders.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Extract the input file, allowing multi-modal extraction

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n    metadata = extra_info or {}\n    file_name = Path(file_path)\n    with open(file_path, \"rb\") as fi:\n        poller = self.client_.begin_analyze_document(\n            self.model,\n            analyze_request=fi,\n            content_type=\"application/octet-stream\",\n            output_content_format=self.output_content_format,\n        )\n        result = poller.result()\n\n    # the total text content of the document in `output_content_format` format\n    text_content = result.content\n    removed_spans: list[dict] = []\n\n    # extract the figures\n    figures = []\n    for figure_desc in result.get(\"figures\", []):\n        if not self.vlm_endpoint:\n            continue\n        if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n            continue\n\n        # read & crop the image\n        page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n        page_width = result.pages[page_number - 1][\"width\"]\n        page_height = result.pages[page_number - 1][\"height\"]\n        polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n        xs = [polygon[i] for i in range(0, len(polygon), 2)]\n        ys = [polygon[i] for i in range(1, len(polygon), 2)]\n        bbox = [\n            min(xs) / page_width,\n            min(ys) / page_height,\n            max(xs) / page_width,\n            max(ys) / page_height,\n        ]\n        img = crop_image(file_path, bbox, page_number - 1)\n\n        # convert the image into base64\n        img_bytes = BytesIO()\n        img.save(img_bytes, format=\"PNG\")\n        img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n        img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n        # caption the image\n        caption = generate_single_figure_caption(\n            figure=img_base64, vlm_endpoint=self.vlm_endpoint\n        )\n\n        # store the image into document\n        figure_metadata = {\n            \"image_origin\": img_base64,\n            \"type\": \"image\",\n            \"page_label\": page_number,\n        }\n        figure_metadata.update(metadata)\n\n        figures.append(\n            Document(\n                text=caption,\n                metadata=figure_metadata,\n            )\n        )\n        removed_spans += figure_desc[\"spans\"]\n\n    # extract the tables\n    tables = []\n    for table_desc in result.get(\"tables\", []):\n        if not table_desc[\"spans\"]:\n            continue\n\n        # convert the tables into markdown format\n        boundingRegions = table_desc[\"boundingRegions\"]\n        if boundingRegions:\n            page_number = boundingRegions[0][\"pageNumber\"]\n        else:\n            page_number = 1\n\n        # store the tables into document\n        offset = table_desc[\"spans\"][0][\"offset\"]\n        length = table_desc[\"spans\"][0][\"length\"]\n        table_metadata = {\n            \"type\": \"table\",\n            \"page_label\": page_number,\n            \"table_origin\": text_content[offset : offset + length],\n        }\n        table_metadata.update(metadata)\n\n        tables.append(\n            Document(\n                text=text_content[offset : offset + length],\n                metadata=table_metadata,\n            )\n        )\n        removed_spans += table_desc[\"spans\"]\n    # save the text content into markdown 
format\n    if self.cache_dir is not None:\n        with open(\n            Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n        ) as f:\n            f.write(text_content)\n\n    removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n    for span in removed_spans:\n        text_content = (\n            text_content[: span[\"offset\"]]\n            + text_content[span[\"offset\"] + span[\"length\"] :]\n        )\n\n    return [Document(content=text_content, metadata=metadata)] + figures + tables\n
    "},{"location":"reference/loaders/#loaders.AutoReader","title":"AutoReader","text":"

    Bases: BaseReader

General auto reader for a variety of files (based on llama-hub).

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class AutoReader(BaseReader):\n    \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n    def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n        \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n        if isinstance(reader_type, str):\n            from llama_index.core import download_loader\n\n            self._reader = download_loader(reader_type)()\n        else:\n            self._reader = reader_type()\n        super().__init__()\n\n    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(file=file, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        return self.load_data(file=file, **kwargs)\n
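A minimal sketch; the loader name passed as a string is illustrative and must exist on llama-hub for download_loader to resolve it:

from kotaemon.loaders import AutoReader

reader = AutoReader("PandasCSVReader")    # illustrative llama-hub loader name
docs = reader.load_data("data/table.csv")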
    "},{"location":"reference/loaders/#loaders.BaseReader","title":"BaseReader","text":"

    Bases: BaseComponent

    The base class for all readers

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class BaseReader(BaseComponent):\n    \"\"\"The base class for all readers\"\"\"\n\n    ...\n
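    Because BaseReader only fixes the interface, a custom loader is written by subclassing it. A minimal sketch (the TxtReader class here is hypothetical, mirroring the load_data/run pattern used by the other readers in this module):

    from pathlib import Path\nfrom typing import Optional\n\nfrom kotaemon.base import Document\nfrom kotaemon.loaders import BaseReader\n\n\nclass TxtReader(BaseReader):\n    \"\"\"Toy reader: one Document per plain-text file.\"\"\"\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        text = Path(file_path).read_text(encoding=\"utf-8\")\n        return [Document(text=text, metadata=extra_info or {})]\n\n    def run(self, file_path: Path, **kwargs) -> list[Document]:\n        return self.load_data(file_path, **kwargs)\n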
    "},{"location":"reference/loaders/#loaders.DirectoryReader","title":"DirectoryReader","text":"

    Bases: LIReaderMixin, BaseReader

    Wrap around llama-index SimpleDirectoryReader

    Parameters:

    - input_dir (str, required): Path to the directory.
    - input_files (List, required): List of file paths to read (Optional; overrides input_dir, exclude).
    - exclude (List, required): Glob of python file paths to exclude (Optional).
    - exclude_hidden (bool, required): Whether to exclude hidden files (dotfiles).
    - encoding (str, required): Encoding of the files. Default is utf-8.
    - errors (str, required): How encoding and decoding errors are to be handled; see https://docs.python.org/3/library/functions.html#open
    - recursive (bool, required): Whether to recursively search in subdirectories. False by default.
    - filename_as_id (bool, required): Whether to use the filename as the document id. False by default.
    - required_exts (Optional[List[str]], required): List of required extensions. Default is None.
    - file_extractor (Optional[Dict[str, BaseReader]], required): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
    - num_files_limit (Optional[int], required): Maximum number of files to read. Default is None.
    - file_metadata (Optional[Callable[str, Dict]], required): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.

    Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
    class DirectoryReader(LIReaderMixin, BaseReader):\n    \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n    Args:\n        input_dir (str): Path to the directory.\n        input_files (List): List of file paths to read\n            (Optional; overrides input_dir, exclude)\n        exclude (List): glob of python file paths to exclude (Optional)\n        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n        encoding (str): Encoding of the files.\n            Default is utf-8.\n        errors (str): how encoding and decoding errors are to be handled,\n              see https://docs.python.org/3/library/functions.html#open\n        recursive (bool): Whether to recursively search in subdirectories.\n            False by default.\n        filename_as_id (bool): Whether to use the filename as the document id.\n            False by default.\n        required_exts (Optional[List[str]]): List of required extensions.\n            Default is None.\n        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n            extension to a BaseReader class that specifies how to convert that file\n            to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n        num_files_limit (Optional[int]): Maximum number of files to read.\n            Default is None.\n        file_metadata (Optional[Callable[str, Dict]]): A function that takes\n            in a filename and returns a Dict of metadata for the Document.\n            Default is None.\n    \"\"\"\n\n    input_dir: Optional[str] = None\n    input_files: Optional[List] = None\n    exclude: Optional[List] = None\n    exclude_hidden: bool = True\n    errors: str = \"ignore\"\n    recursive: bool = False\n    encoding: str = \"utf-8\"\n    filename_as_id: bool = False\n    required_exts: Optional[list[str]] = None\n    file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n    num_files_limit: Optional[int] = None\n    file_metadata: Optional[Callable[[str], dict]] = None\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        from llama_index.core import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n
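    A usage sketch. Hedged: argument handling is delegated to the wrapped SimpleDirectoryReader, so this mirrors its semantics; the paths and extensions are illustrative.

    from kotaemon.loaders import DirectoryReader\n\n# read every .md and .txt file under docs/, recursively\nreader = DirectoryReader(\n    input_dir=\"docs/\", recursive=True, required_exts=[\".md\", \".txt\"]\n)\ndocs = reader.run()\n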
    "},{"location":"reference/loaders/#loaders.DocxReader","title":"DocxReader","text":"

    Bases: BaseReader

    Read Docx files, preserving tables, using the python-docx library

    Reader behavior:

    - All paragraphs are extracted as a Document.
    - Each table is extracted as a Document, rendered as a CSV string.
    - The output is a list of Documents, concatenating the above (tables + paragraphs).

    Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    class DocxReader(BaseReader):\n    \"\"\"Read Docx files that respect table, using python-docx library\n\n    Reader behavior:\n        - All paragraphs are extracted as a Document\n        - Each table is extracted as a Document, rendered as a CSV string\n        - The output is a list of Documents, concatenating the above\n        (tables + paragraphs)\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        try:\n            import docx  # noqa\n        except ImportError:\n            raise ImportError(\n                \"docx is not installed. \"\n                \"Please install it using `pip install python-docx`\"\n            )\n\n    def _load_single_table(self, table) -> List[List[str]]:\n        \"\"\"Extract content from tables. Return a list of columns: list[str]\n        Some merged cells will share duplicated content.\n        \"\"\"\n        n_row = len(table.rows)\n        n_col = len(table.columns)\n\n        arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n        for i, row in enumerate(table.rows):\n            for j, cell in enumerate(row.cells):\n                arrays[j][i] = cell.text\n\n        return arrays\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using Docx reader\n\n        Args:\n            file_path (Path): Path to .docx file\n\n        Returns:\n            List[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import docx\n\n        file_path = Path(file_path).resolve()\n\n        doc = docx.Document(str(file_path))\n        all_text = \"\\n\".join(\n            [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n        )\n        pages = [all_text]  # 1 page only\n\n        tables = []\n        for t in doc.tables:\n            # return list of columns: list of string\n            arrays = self._load_single_table(t)\n\n            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=table.to_csv(\n                    index=False\n                ).strip(),  # strip_special_chars_markdown()\n                metadata={\n                    \"table_origin\": table.to_csv(index=False),\n                    \"type\": \"table\",\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for table in tables  # page_id\n        ]\n\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text.strip(),\n                    metadata={\"page_label\": 1, **extra_info},\n                )\n                for _, non_table_text in enumerate(pages)\n            ]\n        )\n\n        return documents\n
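    A short usage sketch based on the load_data signature above (the file name is illustrative):

    from kotaemon.loaders import DocxReader\n\nreader = DocxReader()\ndocs = reader.load_data(\"contract.docx\")\n# tables come back as CSV-rendered Documents tagged with type=\"table\"\ntables = [d for d in docs if d.metadata.get(\"type\") == \"table\"]\n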
    "},{"location":"reference/loaders/#loaders.DocxReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Docx reader

    Parameters:

    - file_path (Path, required): Path to the .docx file.

    Returns:

    - List[Document]: list of documents extracted from the .docx file.

    Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using Docx reader\n\n    Args:\n        file_path (Path): Path to .docx file\n\n    Returns:\n        List[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import docx\n\n    file_path = Path(file_path).resolve()\n\n    doc = docx.Document(str(file_path))\n    all_text = \"\\n\".join(\n        [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n    )\n    pages = [all_text]  # 1 page only\n\n    tables = []\n    for t in doc.tables:\n        # return list of columns: list of string\n        arrays = self._load_single_table(t)\n\n        tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=table.to_csv(\n                index=False\n            ).strip(),  # strip_special_chars_markdown()\n            metadata={\n                \"table_origin\": table.to_csv(index=False),\n                \"type\": \"table\",\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for table in tables  # page_id\n    ]\n\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text.strip(),\n                metadata={\"page_label\": 1, **extra_info},\n            )\n            for _, non_table_text in enumerate(pages)\n        ]\n    )\n\n    return documents\n
    "},{"location":"reference/loaders/#loaders.ExcelReader","title":"ExcelReader","text":"

    Bases: BaseReader

    Spreadsheet reader that respects multiple worksheets

    Parses Excel workbooks via the pandas read_excel function. If special parameters are required, use the pandas_config dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class ExcelReader(BaseReader):\n    r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = True,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        # clean up input\n        file = Path(file)\n        extra_info = extra_info or {}\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        output = []\n\n        for idx, key in enumerate(sheet_names):\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].astype(\"object\")\n            dfs[key].fillna(\"\", inplace=True)\n\n            rows = dfs[key].values.astype(str).tolist()\n            content = self._row_joiner.join(\n                self._col_joiner.join(row).strip() for row in rows\n            ).strip()\n            if include_sheetname:\n                content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n            metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n            output.append(Document(text=content, metadata=metadata))\n\n        return output\n
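    A usage sketch (the file name is illustrative):

    from kotaemon.loaders import ExcelReader\n\n# one Document per worksheet; the sheet name is prepended to the content\nreader = ExcelReader()\ndocs = reader.load_data(\"budget.xlsx\", include_sheetname=True)\nfor doc in docs:\n    print(doc.metadata[\"sheet_name\"], len(doc.text))\n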
    "},{"location":"reference/loaders/#loaders.ExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=True,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

    Parse the Excel file and extract the values of its cells.

    Parameters:

    - file (Path, required): The path to the Excel file to read.
    - include_sheetname (bool, default True): Whether to include the sheet name in the output.
    - sheet_name (Union[str, int, None], default None): The specific sheet to read from; the default of None reads all sheets.

    Returns:

    - List[Document]: A list of Document objects containing the values from the Excel file.

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = True,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    # clean up input\n    file = Path(file)\n    extra_info = extra_info or {}\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    output = []\n\n    for idx, key in enumerate(sheet_names):\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].astype(\"object\")\n        dfs[key].fillna(\"\", inplace=True)\n\n        rows = dfs[key].values.astype(str).tolist()\n        content = self._row_joiner.join(\n            self._col_joiner.join(row).strip() for row in rows\n        ).strip()\n        if include_sheetname:\n            content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n        metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n        output.append(Document(text=content, metadata=metadata))\n\n    return output\n
    "},{"location":"reference/loaders/#loaders.PandasExcelReader","title":"PandasExcelReader","text":"

    Bases: BaseReader

    Pandas-based Excel parser.

    Parses Excel workbooks via the pandas read_excel function. If special parameters are required, use the pandas_config dict.

    Args:

    pandas_config (dict): Options for the `pandas.read_excel` function call.\n    Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n    for more information. Set to empty dict by default,\n    this means defaults will be used.\n
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class PandasExcelReader(BaseReader):\n    r\"\"\"Pandas-based CSV parser.\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = False,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n        import itertools\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        df_sheets = []\n\n        for key in sheet_names:\n            sheet = []\n            if include_sheetname:\n                sheet.append([key])\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key].fillna(\"\", inplace=True)\n            sheet.extend(dfs[key].values.astype(str).tolist())\n            df_sheets.append(sheet)\n\n        text_list = list(\n            itertools.chain.from_iterable(df_sheets)\n        )  # flatten list of lists\n\n        output = [\n            Document(\n                text=self._row_joiner.join(\n                    self._col_joiner.join(sublist) for sublist in text_list\n                ),\n                metadata=extra_info or {},\n            )\n        ]\n\n        return output\n
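    In contrast to ExcelReader, this reader flattens every sheet into a single Document; a sketch (file name and pandas_config are illustrative):

    from kotaemon.loaders import PandasExcelReader\n\nreader = PandasExcelReader(pandas_config={\"header\": 0})\ndocs = reader.load_data(\"budget.xlsx\")\nassert len(docs) == 1  # all sheets are joined into one Document\n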
    "},{"location":"reference/loaders/#loaders.PandasExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=False,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

    Parse the Excel file and extract the values of its cells.

    Parameters:

    - file (Path, required): The path to the Excel file to read.
    - include_sheetname (bool, default False): Whether to include the sheet name in the output.
    - sheet_name (Union[str, int, None], default None): The specific sheet to read from; the default of None reads all sheets.

    Returns:

    - List[Document]: A list of Document objects containing the values from the Excel file.

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = False,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n    import itertools\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    df_sheets = []\n\n    for key in sheet_names:\n        sheet = []\n        if include_sheetname:\n            sheet.append([key])\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key].fillna(\"\", inplace=True)\n        sheet.extend(dfs[key].values.astype(str).tolist())\n        df_sheets.append(sheet)\n\n    text_list = list(\n        itertools.chain.from_iterable(df_sheets)\n    )  # flatten list of lists\n\n    output = [\n        Document(\n            text=self._row_joiner.join(\n                self._col_joiner.join(sublist) for sublist in text_list\n            ),\n            metadata=extra_info or {},\n        )\n    ]\n\n    return output\n
    "},{"location":"reference/loaders/#loaders.HtmlReader","title":"HtmlReader","text":"

    Bases: BaseReader

    Read HTML using html2text

    Reader behavior:

    - HTML is read with html2text.
    - All of the text is split by page_break_pattern.
    - Each page is extracted as a Document.
    - The output is a list of Documents.

    Parameters:

    - page_break_pattern (str, default None): Pattern to split the HTML into pages.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class HtmlReader(BaseReader):\n    \"\"\"Reader HTML usimg html2text\n\n    Reader behavior:\n        - HTML is read with html2text.\n        - All of the texts will be split by `page_break_pattern`\n        - Each page is extracted as a Document\n        - The output is a list of Documents\n\n    Args:\n        page_break_pattern (str): Pattern to split the HTML into pages\n    \"\"\"\n\n    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n        try:\n            import html2text  # noqa\n        except ImportError:\n            raise ImportError(\n                \"html2text is not installed. \"\n                \"Please install it using `pip install html2text`\"\n            )\n\n        self._page_break_pattern: Optional[str] = page_break_pattern\n        super().__init__()\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load data using Html reader\n\n        Args:\n            file_path: path to HTML file\n            extra_info: extra information passed to this reader during extracting data\n\n        Returns:\n            list[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import html2text\n\n        file_path = Path(file_path).resolve()\n\n        with file_path.open(\"r\") as f:\n            html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n        # read HTML\n        all_text = html2text.html2text(html_text)\n        pages = (\n            all_text.split(self._page_break_pattern)\n            if self._page_break_pattern\n            else [all_text]\n        )\n\n        extra_info = extra_info or {}\n\n        # create Document from non-table text\n        documents = [\n            Document(\n                text=page.strip(),\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, page in enumerate(pages)\n        ]\n\n        return documents\n
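    A usage sketch (the page-break pattern is illustrative; use whatever marker your HTML contains):

    from kotaemon.loaders import HtmlReader\n\n# split the html2text output on form feeds, yielding one Document per page\nreader = HtmlReader(page_break_pattern=\"\\f\")\ndocs = reader.load_data(\"page.html\")\n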
    "},{"location":"reference/loaders/#loaders.HtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Html reader

    Parameters:

    - file_path (Path | str, required): Path to the HTML file.
    - extra_info (Optional[dict], default None): Extra information passed to this reader during data extraction.

    Returns:

    - list[Document]: list of documents extracted from the HTML file.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load data using Html reader\n\n    Args:\n        file_path: path to HTML file\n        extra_info: extra information passed to this reader during extracting data\n\n    Returns:\n        list[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import html2text\n\n    file_path = Path(file_path).resolve()\n\n    with file_path.open(\"r\") as f:\n        html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n    # read HTML\n    all_text = html2text.html2text(html_text)\n    pages = (\n        all_text.split(self._page_break_pattern)\n        if self._page_break_pattern\n        else [all_text]\n    )\n\n    extra_info = extra_info or {}\n\n    # create Document from non-table text\n    documents = [\n        Document(\n            text=page.strip(),\n            metadata={\"page_label\": page_id + 1, **extra_info},\n        )\n        for page_id, page in enumerate(pages)\n    ]\n\n    return documents\n
    "},{"location":"reference/loaders/#loaders.MhtmlReader","title":"MhtmlReader","text":"

    Bases: BaseReader

    Parse MHTML files with BeautifulSoup.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    class MhtmlReader(BaseReader):\n    \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n    def __init__(\n        self,\n        cache_dir: Optional[str] = getattr(\n            flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n        ),\n        open_encoding: Optional[str] = None,\n        bs_kwargs: Optional[dict] = None,\n        get_text_separator: str = \"\",\n    ) -> None:\n        \"\"\"initialize with path, and optionally, file encoding to use, and any kwargs\n        to pass to the BeautifulSoup object.\n\n        Args:\n            cache_dir: Path for markdwon format.\n            file_path: Path to file to load.\n            open_encoding: The encoding to use when opening the file.\n            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n            get_text_separator: The separator to use when getting the text\n                from the soup.\n        \"\"\"\n        try:\n            import bs4  # noqa:F401\n        except ImportError:\n            raise ImportError(\n                \"beautifulsoup4 package not found, please install it with \"\n                \"`pip install beautifulsoup4`\"\n            )\n\n        self.cache_dir = cache_dir\n        self.open_encoding = open_encoding\n        if bs_kwargs is None:\n            bs_kwargs = {\"features\": \"lxml\"}\n        self.bs_kwargs = bs_kwargs\n        self.get_text_separator = get_text_separator\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load MHTML document into document objects.\"\"\"\n\n        from bs4 import BeautifulSoup\n\n        extra_info = extra_info or {}\n        metadata: dict = extra_info\n        page = []\n        file_name = Path(file_path)\n        with open(file_path, \"r\", encoding=self.open_encoding) as f:\n            message = email.message_from_string(f.read())\n            parts = message.get_payload()\n\n            if not isinstance(parts, list):\n                parts = [message]\n\n            for part in parts:\n                if part.get_content_type() == \"text/html\":\n                    html = part.get_payload(decode=True).decode()\n\n                    soup = BeautifulSoup(html, **self.bs_kwargs)\n                    text = soup.get_text(self.get_text_separator)\n\n                    if soup.title:\n                        title = str(soup.title.string)\n                    else:\n                        title = \"\"\n\n                    metadata = {\n                        \"source\": str(file_path),\n                        \"title\": title,\n                        **extra_info,\n                    }\n                    lines = [line for line in text.split(\"\\n\") if line.strip()]\n                    text = \"\\n\\n\".join(lines)\n                    if text:\n                        page.append(text)\n        # save the page into markdown format\n        print(self.cache_dir)\n        if self.cache_dir is not None:\n            print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n            with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n                f.write(page[0])\n\n        return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
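    A usage sketch (the paths are illustrative):

    from kotaemon.loaders import MhtmlReader\n\n# cache_dir is optional; when set, the extracted text is also saved as markdown\nreader = MhtmlReader(cache_dir=\"./markdown_cache\")\ndocs = reader.load_data(\"saved_page.mhtml\")\nprint(docs[0].metadata[\"title\"])\n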
    "},{"location":"reference/loaders/#loaders.MhtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load MHTML document into document objects.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load MHTML document into document objects.\"\"\"\n\n    from bs4 import BeautifulSoup\n\n    extra_info = extra_info or {}\n    metadata: dict = extra_info\n    page = []\n    file_name = Path(file_path)\n    with open(file_path, \"r\", encoding=self.open_encoding) as f:\n        message = email.message_from_string(f.read())\n        parts = message.get_payload()\n\n        if not isinstance(parts, list):\n            parts = [message]\n\n        for part in parts:\n            if part.get_content_type() == \"text/html\":\n                html = part.get_payload(decode=True).decode()\n\n                soup = BeautifulSoup(html, **self.bs_kwargs)\n                text = soup.get_text(self.get_text_separator)\n\n                if soup.title:\n                    title = str(soup.title.string)\n                else:\n                    title = \"\"\n\n                metadata = {\n                    \"source\": str(file_path),\n                    \"title\": title,\n                    **extra_info,\n                }\n                lines = [line for line in text.split(\"\\n\") if line.strip()]\n                text = \"\\n\\n\".join(lines)\n                if text:\n                    page.append(text)\n    # save the page into markdown format\n    print(self.cache_dir)\n    if self.cache_dir is not None:\n        print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n        with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n            f.write(page[0])\n\n    return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader","title":"MathpixPDFReader","text":"

    Bases: BaseReader

    Load PDF files using Mathpix service.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    class MathpixPDFReader(BaseReader):\n    \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n    def __init__(\n        self,\n        processed_file_format: str = \"md\",\n        max_wait_time_seconds: int = 500,\n        should_clean_pdf: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize with a file path.\n\n        Args:\n            processed_file_format: a format of the processed file. Default is   \"mmd\".\n            max_wait_time_seconds: a maximum time to wait for the response from\n                the server. Default is 500.\n            should_clean_pdf: a flag to clean the PDF file. Default is False.\n            **kwargs: additional keyword arguments.\n        \"\"\"\n        self.mathpix_api_key = get_from_dict_or_env(\n            kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n        )\n        self.mathpix_api_id = get_from_dict_or_env(\n            kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n        )\n        self.processed_file_format = processed_file_format\n        self.max_wait_time_seconds = max_wait_time_seconds\n        self.should_clean_pdf = should_clean_pdf\n        super().__init__()\n\n    @property\n    def _mathpix_headers(self) -> Dict[str, str]:\n        return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n    @property\n    def url(self) -> str:\n        return \"https://api.mathpix.com/v3/pdf\"\n\n    @property\n    def data(self) -> dict:\n        options = {\n            \"conversion_formats\": {self.processed_file_format: True},\n            \"enable_tables_fallback\": True,\n        }\n        return {\"options_json\": json.dumps(options)}\n\n    def send_pdf(self, file_path) -> str:\n        with open(file_path, \"rb\") as f:\n            files = {\"file\": f}\n            response = requests.post(\n                self.url, headers=self._mathpix_headers, files=files, data=self.data\n            )\n        response_data = response.json()\n        if \"pdf_id\" in response_data:\n            pdf_id = response_data[\"pdf_id\"]\n            return pdf_id\n        else:\n            raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n    def wait_for_processing(self, pdf_id: str) -> None:\n        \"\"\"Wait for processing to complete.\n\n        Args:\n            pdf_id: a PDF id.\n\n        Returns: None\n        \"\"\"\n        url = self.url + \"/\" + pdf_id\n        for _ in range(0, self.max_wait_time_seconds, 5):\n            response = requests.get(url, headers=self._mathpix_headers)\n            response_data = response.json()\n            status = response_data.get(\"status\", None)\n\n            if status == \"completed\":\n                return\n            elif status == \"error\":\n                raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n            else:\n                print(response_data)\n                print(url)\n                time.sleep(5)\n        raise TimeoutError\n\n    def get_processed_pdf(self, pdf_id: str) -> str:\n        self.wait_for_processing(pdf_id)\n        url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n        response = requests.get(url, headers=self._mathpix_headers)\n        return response.content.decode(\"utf-8\")\n\n    def clean_pdf(self, contents: str) -> str:\n        \"\"\"Clean the PDF file.\n\n        Args:\n            contents: a PDF file contents.\n\n        Returns:\n\n        \"\"\"\n        contents = \"\\n\".join(\n            [line for line in 
contents.split(\"\\n\") if not line.startswith(\"![]\")]\n        )\n        # replace \\section{Title} with # Title\n        contents = contents.replace(\"\\\\section{\", \"# \")\n        # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n        # http:// or https:// followed by anything but a closing paren\n        url_regex = \"http[s]?://[^)]+\"\n        markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n        contents = (\n            contents.replace(r\"\\$\", \"$\")\n            .replace(r\"\\%\", \"%\")\n            .replace(r\"\\(\", \"(\")\n            .replace(r\"\\)\", \")\")\n            .replace(\"$\\\\begin{array}\", \"\")\n            .replace(\"\\\\end{array}$\", \"\")\n            .replace(\"\\\\\\\\\", \"\")\n            .replace(\"\\\\text\", \"\")\n            .replace(\"}\", \"\")\n            .replace(\"{\", \"\")\n            .replace(\"\\\\mathrm\", \"\")\n        )\n        contents = re.sub(markup_regex, \"\", contents)\n        return contents\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            content = kwargs[\"response_content\"]\n        else:\n            # call original API\n            pdf_id = self.send_pdf(file_path)\n            content = self.get_processed_pdf(pdf_id)\n\n        if self.should_clean_pdf:\n            content = self.clean_pdf(content)\n        tables, texts = parse_markdown_text_to_tables(content)\n        documents = []\n        for table in tables:\n            text = strip_special_chars_markdown(table)\n            metadata = {\n                \"table_origin\": table,\n                \"type\": \"table\",\n            }\n            if extra_info:\n                metadata.update(extra_info)\n            documents.append(\n                Document(\n                    text=text,\n                    metadata=metadata,\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        for text in texts:\n            metadata = {\"source\": file_path.name, \"type\": \"text\"}\n            documents.append(Document(text=text, metadata=metadata))\n\n        return documents\n
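    A usage sketch (the credential placeholders must be replaced with real Mathpix keys; they can also be passed as mathpix_api_id / mathpix_api_key keyword arguments):

    import os\nfrom pathlib import Path\n\nfrom kotaemon.loaders import MathpixPDFReader\n\nos.environ[\"MATHPIX_API_ID\"] = \"...\"  # your Mathpix app id\nos.environ[\"MATHPIX_API_KEY\"] = \"...\"  # your Mathpix app key\n\nreader = MathpixPDFReader(should_clean_pdf=True)\ndocs = reader.load_data(Path(\"paper.pdf\"))\n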
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"
    wait_for_processing(pdf_id)\n

    Wait for processing to complete.

    Parameters:

    - pdf_id (str, required): A PDF id.

    Returns: None

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def wait_for_processing(self, pdf_id: str) -> None:\n    \"\"\"Wait for processing to complete.\n\n    Args:\n        pdf_id: a PDF id.\n\n    Returns: None\n    \"\"\"\n    url = self.url + \"/\" + pdf_id\n    for _ in range(0, self.max_wait_time_seconds, 5):\n        response = requests.get(url, headers=self._mathpix_headers)\n        response_data = response.json()\n        status = response_data.get(\"status\", None)\n\n        if status == \"completed\":\n            return\n        elif status == \"error\":\n            raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n        else:\n            print(response_data)\n            print(url)\n            time.sleep(5)\n    raise TimeoutError\n
    "},{"location":"reference/loaders/#loaders.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"
    clean_pdf(contents)\n

    Clean the PDF file.

    Parameters:

    - contents (str, required): The PDF file contents.

    Returns:

    - The cleaned contents as a string.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def clean_pdf(self, contents: str) -> str:\n    \"\"\"Clean the PDF file.\n\n    Args:\n        contents: a PDF file contents.\n\n    Returns:\n\n    \"\"\"\n    contents = \"\\n\".join(\n        [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n    )\n    # replace \\section{Title} with # Title\n    contents = contents.replace(\"\\\\section{\", \"# \")\n    # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n    # http:// or https:// followed by anything but a closing paren\n    url_regex = \"http[s]?://[^)]+\"\n    markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n    contents = (\n        contents.replace(r\"\\$\", \"$\")\n        .replace(r\"\\%\", \"%\")\n        .replace(r\"\\(\", \"(\")\n        .replace(r\"\\)\", \")\")\n        .replace(\"$\\\\begin{array}\", \"\")\n        .replace(\"\\\\end{array}$\", \"\")\n        .replace(\"\\\\\\\\\", \"\")\n        .replace(\"\\\\text\", \"\")\n        .replace(\"}\", \"\")\n        .replace(\"{\", \"\")\n        .replace(\"\\\\mathrm\", \"\")\n    )\n    contents = re.sub(markup_regex, \"\", contents)\n    return contents\n
    "},{"location":"reference/loaders/#loaders.ImageReader","title":"ImageReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with a strong focus on table extraction

    Example
    >>> from kotaemon.loaders import ImageReader\n>>> reader = ImageReader()\n>>> documents = reader.load_data(\"path/to/pdf\")\n

    Parameters:

    - endpoint (Optional[str], default None): URL of the FullOCR endpoint. If not provided, looks for the environment variable OCR_READER_ENDPOINT or uses the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/).
    - use_ocr (required): Whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the tables and the text within table cells will be extracted.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class ImageReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from knowledgehub.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=False\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        extra_info = extra_info or {}\n        result = []\n        for ocr_result in ocr_results:\n            result.append(\n                Document(\n                    content=ocr_result[\"csv_string\"],\n                    metadata=extra_info,\n                )\n            )\n\n        return result\n
    "},{"location":"reference/loaders/#loaders.ImageReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

    - file_path (Path, required): Path to the PDF file.
    - debug_path (Path, required): Path to store debug image output.
    - artifact_path (Path, required): Path to the OCR endpoint's artifacts directory.

    Returns:

    - List[Document]: list of documents extracted from the PDF file.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=False\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    extra_info = extra_info or {}\n    result = []\n    for ocr_result in ocr_results:\n        result.append(\n            Document(\n                content=ocr_result[\"csv_string\"],\n                metadata=extra_info,\n            )\n        )\n\n    return result\n
    "},{"location":"reference/loaders/#loaders.OCRReader","title":"OCRReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with a strong focus on table extraction

    Example
    >>> from kotaemon.loaders import OCRReader\n>>> reader = OCRReader()\n>>> documents = reader.load_data(\"path/to/pdf\")\n

    Parameters:

    - endpoint (Optional[str], default None): URL of the FullOCR endpoint. If not provided, looks for the environment variable OCR_READER_ENDPOINT or uses the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/).
    - use_ocr (default True): Whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the tables and the text within table cells will be extracted.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class OCRReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n        self.use_ocr = use_ocr\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        debug_path = kwargs.pop(\"debug_path\", None)\n        artifact_path = kwargs.pop(\"artifact_path\", None)\n\n        # read PDF through normal reader (unstructured)\n        pdf_page_items = read_pdf_unstructured(file_path)\n        # merge PDF text output with OCR output\n        tables, texts = parse_ocr_output(\n            ocr_results,\n            pdf_page_items,\n            debug_path=debug_path,\n            artifact_path=artifact_path,\n        )\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=strip_special_chars_markdown(table_text),\n                metadata={\n                    \"table_origin\": table_text,\n                    \"type\": \"table\",\n                    \"page_label\": page_id + 1,\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for page_id, table_text in tables\n        ]\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text,\n                    metadata={\"page_label\": page_id + 1, **extra_info},\n                )\n                for page_id, non_table_text in texts\n            ]\n        )\n\n        return documents\n
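    The example above covers the default full-OCR path; here is a sketch of table-only extraction (the endpoint URL is illustrative, and the OCR service must be running):

    from kotaemon.loaders import OCRReader\n\n# use_ocr=False sends table_only=True, so only tables and their cell text return\nreader = OCRReader(endpoint=\"http://127.0.0.1:8000/v2/ai/infer/\", use_ocr=False)\ndocs = reader.load_data(\"scanned_invoice.pdf\")\n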
    "},{"location":"reference/loaders/#loaders.OCRReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

    Parameters:

    - file_path (Path, required): Path to the PDF file.
    - debug_path (Path, required): Path to store debug image output.
    - artifact_path (Path, required): Path to the OCR endpoint's artifacts directory.

    Returns:

    - List[Document]: list of documents extracted from the PDF file.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    debug_path = kwargs.pop(\"debug_path\", None)\n    artifact_path = kwargs.pop(\"artifact_path\", None)\n\n    # read PDF through normal reader (unstructured)\n    pdf_page_items = read_pdf_unstructured(file_path)\n    # merge PDF text output with OCR output\n    tables, texts = parse_ocr_output(\n        ocr_results,\n        pdf_page_items,\n        debug_path=debug_path,\n        artifact_path=artifact_path,\n    )\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=strip_special_chars_markdown(table_text),\n            metadata={\n                \"table_origin\": table_text,\n                \"type\": \"table\",\n                \"page_label\": page_id + 1,\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for page_id, table_text in tables\n    ]\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text,\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, non_table_text in texts\n        ]\n    )\n\n    return documents\n
    "},{"location":"reference/loaders/#loaders.PDFThumbnailReader","title":"PDFThumbnailReader","text":"

    Bases: PDFReader

    PDF parser with thumbnail for each page.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    class PDFThumbnailReader(PDFReader):\n    \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"\n        Initialize PDFReader.\n        \"\"\"\n        super().__init__(return_full_document=False)\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        fs: Optional[AbstractFileSystem] = None,\n    ) -> List[Document]:\n        \"\"\"Parse file.\"\"\"\n        documents = super().load_data(file, extra_info, fs)\n\n        page_numbers_str = []\n        filtered_docs = []\n        is_int_page_number: dict[str, bool] = {}\n\n        for doc in documents:\n            if \"page_label\" in doc.metadata:\n                page_num_str = doc.metadata[\"page_label\"]\n                page_numbers_str.append(page_num_str)\n                try:\n                    _ = int(page_num_str)\n                    is_int_page_number[page_num_str] = True\n                    filtered_docs.append(doc)\n                except ValueError:\n                    is_int_page_number[page_num_str] = False\n                    continue\n\n        documents = filtered_docs\n        page_numbers = list(range(len(page_numbers_str)))\n\n        print(\"Page numbers:\", len(page_numbers))\n        page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n        documents.extend(\n            [\n                Document(\n                    text=\"Page thumbnail\",\n                    metadata={\n                        \"image_origin\": page_thumbnail,\n                        \"type\": \"thumbnail\",\n                        \"page_label\": page_number,\n                        **(extra_info if extra_info is not None else {}),\n                    },\n                )\n                for (page_thumbnail, page_number) in zip(\n                    page_thumbnails, page_numbers_str\n                )\n                if is_int_page_number[page_number]\n            ]\n        )\n\n        return documents\n
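    A usage sketch (the file name is illustrative):

    from kotaemon.loaders import PDFThumbnailReader\n\nreader = PDFThumbnailReader()\ndocs = reader.load_data(\"slides.pdf\")\n# thumbnails are appended as extra Documents tagged with type=\"thumbnail\"\nthumbs = [d for d in docs if d.metadata.get(\"type\") == \"thumbnail\"]\n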
    "},{"location":"reference/loaders/#loaders.PDFThumbnailReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, fs=None)\n

    Parse file.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n    \"\"\"Parse file.\"\"\"\n    documents = super().load_data(file, extra_info, fs)\n\n    page_numbers_str = []\n    filtered_docs = []\n    is_int_page_number: dict[str, bool] = {}\n\n    for doc in documents:\n        if \"page_label\" in doc.metadata:\n            page_num_str = doc.metadata[\"page_label\"]\n            page_numbers_str.append(page_num_str)\n            try:\n                _ = int(page_num_str)\n                is_int_page_number[page_num_str] = True\n                filtered_docs.append(doc)\n            except ValueError:\n                is_int_page_number[page_num_str] = False\n                continue\n\n    documents = filtered_docs\n    page_numbers = list(range(len(page_numbers_str)))\n\n    print(\"Page numbers:\", len(page_numbers))\n    page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n    documents.extend(\n        [\n            Document(\n                text=\"Page thumbnail\",\n                metadata={\n                    \"image_origin\": page_thumbnail,\n                    \"type\": \"thumbnail\",\n                    \"page_label\": page_number,\n                    **(extra_info if extra_info is not None else {}),\n                },\n            )\n            for (page_thumbnail, page_number) in zip(\n                page_thumbnails, page_numbers_str\n            )\n            if is_int_page_number[page_number]\n        ]\n    )\n\n    return documents\n
    "},{"location":"reference/loaders/#loaders.UnstructuredReader","title":"UnstructuredReader","text":"

    Bases: BaseReader

    General unstructured text reader for a variety of files.

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n    \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n    def __init__(self, *args: Any, **kwargs: Any) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args)  # not passing kwargs to parent bc it cannot accept it\n\n        self.api = False  # we default to local\n        if \"url\" in kwargs:\n            self.server_url = str(kwargs[\"url\"])\n            self.api = True  # if url was set, switch to api\n        else:\n            self.server_url = \"http://localhost:8000\"\n\n        if \"api\" in kwargs:\n            self.api = kwargs[\"api\"]\n\n        self.api_key = \"\"\n        if \"api_key\" in kwargs:\n            self.api_key = kwargs[\"api_key\"]\n\n    \"\"\" Loads data using Unstructured.io\n\n        Depending on the construction if url is set or api = True\n        it'll parse file using API call, else parse it locally\n        additional_metadata is extended by the returned metadata if\n        split_documents is True\n\n        Returns list of documents\n    \"\"\"\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        split_documents: Optional[bool] = False,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"If api is set, parse through api\"\"\"\n        file_path_str = str(file)\n        if self.api:\n            from unstructured.partition.api import partition_via_api\n\n            elements = partition_via_api(\n                filename=file_path_str,\n                api_key=self.api_key,\n                api_url=self.server_url + \"/general/v0/general\",\n            )\n        else:\n            \"\"\"Parse file locally\"\"\"\n            from unstructured.partition.auto import partition\n\n            elements = partition(filename=file_path_str)\n\n        \"\"\" Process elements \"\"\"\n        docs = []\n        file_name = Path(file).name\n        file_path = str(Path(file).resolve())\n        if split_documents:\n            for node in elements:\n                metadata = {\"file_name\": file_name, \"file_path\": file_path}\n                if hasattr(node, \"metadata\"):\n                    \"\"\"Load metadata fields\"\"\"\n                    for field, val in vars(node.metadata).items():\n                        if field == \"_known_field_names\":\n                            continue\n                        # removing coordinates because it does not serialize\n                        # and don't want to bother with it\n                        if field == \"coordinates\":\n                            continue\n                        # removing bc it might cause interference\n                        if field == \"parent_id\":\n                            continue\n                        metadata[field] = val\n\n                if extra_info is not None:\n                    metadata.update(extra_info)\n\n                metadata[\"file_name\"] = file_name\n                docs.append(Document(text=node.text, metadata=metadata))\n\n        else:\n            text_chunks = [\" \".join(str(el).split()) for el in elements]\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            # Create a single document by joining all the texts\n            docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n        return docs\n
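A usage sketch, assuming the `unstructured` package is installed locally; `report.pdf` is a placeholder. Passing `url=...` (or `api=True`) at construction would route parsing through the Unstructured API instead:

```python
from pathlib import Path

from kotaemon.loaders import UnstructuredReader

reader = UnstructuredReader()  # local parsing by default
docs = reader.load_data(Path("report.pdf"), split_documents=True)
for doc in docs[:3]:
    # each element becomes its own Document when split_documents=True
    print(doc.metadata.get("file_name"), doc.text[:60])
```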
    "},{"location":"reference/loaders/#loaders.UnstructuredReader.load_data","title":"load_data","text":"
    load_data(\n    file, extra_info=None, split_documents=False, **kwargs\n)\n

    If api is set, parse through api

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    split_documents: Optional[bool] = False,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"If api is set, parse through api\"\"\"\n    file_path_str = str(file)\n    if self.api:\n        from unstructured.partition.api import partition_via_api\n\n        elements = partition_via_api(\n            filename=file_path_str,\n            api_key=self.api_key,\n            api_url=self.server_url + \"/general/v0/general\",\n        )\n    else:\n        \"\"\"Parse file locally\"\"\"\n        from unstructured.partition.auto import partition\n\n        elements = partition(filename=file_path_str)\n\n    \"\"\" Process elements \"\"\"\n    docs = []\n    file_name = Path(file).name\n    file_path = str(Path(file).resolve())\n    if split_documents:\n        for node in elements:\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n            if hasattr(node, \"metadata\"):\n                \"\"\"Load metadata fields\"\"\"\n                for field, val in vars(node.metadata).items():\n                    if field == \"_known_field_names\":\n                        continue\n                    # removing coordinates because it does not serialize\n                    # and dont want to bother with it\n                    if field == \"coordinates\":\n                        continue\n                    # removing bc it might cause interference\n                    if field == \"parent_id\":\n                        continue\n                    metadata[field] = val\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            metadata[\"file_name\"] = file_name\n            docs.append(Document(text=node.text, metadata=metadata))\n\n    else:\n        text_chunks = [\" \".join(str(el).split()) for el in elements]\n        metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n        if extra_info is not None:\n            metadata.update(extra_info)\n\n        # Create a single document by joining all the texts\n        docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n    return docs\n
    "},{"location":"reference/loaders/adobe_loader/","title":"Adobe Loader","text":""},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader","title":"AdobeReader","text":"

    Bases: BaseReader

Read PDF using Adobe's PDF Services. Able to extract text, tables, and figures with high accuracy

    Example
    >> from kotaemon.loaders import AdobeReader\n>> reader = AdobeReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

Args:

- endpoint: URL of the Vision Language Model endpoint. If not provided, will use the default kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT
- max_figures_to_caption: an int that decides how many figures will be captioned. The rest will be ignored (indexed without captions).
    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
class AdobeReader(BaseReader):\n    \"\"\"Read PDF using Adobe's PDF Services.\n    Able to extract text, tables, and figures with high accuracy\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import AdobeReader\n        >> reader = AdobeReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n    Args:\n        endpoint: URL to the Vision Language Model endpoint. If not provided,\n        will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT`\n\n        max_figures_to_caption: an int that decides how many figures will be captioned.\n        The rest will be ignored (are indexed without captions).\n    \"\"\"\n\n    def __init__(\n        self,\n        vlm_endpoint: Optional[str] = None,\n        max_figures_to_caption: int = 100,\n        *args: Any,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params\"\"\"\n        super().__init__(*args)\n        self.table_regex = r\"/Table(\\[\\d+\\])?$\"\n        self.figure_regex = r\"/Figure(\\[\\d+\\])?$\"\n        self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT\n        self.max_figures_to_caption = max_figures_to_caption\n\n    def load_data(\n        self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data by calling Adobe's API\n\n        Args:\n            file (Path): Path to the PDF file\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file,\n                includes 3 types: text, table, and image\n\n        \"\"\"\n        from .utils.adobe import (\n            generate_figure_captions,\n            load_json,\n            parse_figure_paths,\n            parse_table_paths,\n            request_adobe_service,\n        )\n\n        filename = file.name\n        filepath = str(Path(file).resolve())\n        output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n        results_path = os.path.join(output_path, \"structuredData.json\")\n\n        if not os.path.exists(results_path):\n            logger.exception(\"Failed to parse the document.\")\n            return []\n\n        data = load_json(results_path)\n\n        texts = defaultdict(list)\n        tables = []\n        figures = []\n\n        elements = data[\"elements\"]\n        for item_id, item in enumerate(elements):\n            page_number = item.get(\"Page\", -1) + 1\n            item_path = item[\"Path\"]\n            item_text = item.get(\"Text\", \"\")\n\n            file_paths = [\n                Path(output_path) / path for path in item.get(\"filePaths\", [])\n            ]\n            prev_item = elements[item_id - 1]\n            title = prev_item.get(\"Text\", \"\")\n\n            if re.search(self.table_regex, item_path):\n                table_content = parse_table_paths(file_paths)\n                if not table_content:\n                    continue\n                table_caption = (\n                    table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                    + f\"\\n(Table in Page {page_number}. {title})\"\n                )\n                tables.append((page_number, table_content, table_caption))\n\n            elif re.search(self.figure_regex, item_path):\n                figure_caption = (\n                    item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n                )\n                figure_content = parse_figure_paths(file_paths)\n                if not figure_content:\n                    continue\n                figures.append([page_number, figure_content, figure_caption])\n\n            else:\n                if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                    texts[page_number].append(item_text)\n\n        # get figure caption using GPT-4V\n        figure_captions = generate_figure_captions(\n            self.vlm_endpoint,\n            [item[1] for item in figures],\n            self.max_figures_to_caption,\n        )\n        for item, caption in zip(figures, figure_captions):\n            # update figure caption\n            item[2] += \" \" + caption\n\n        # Wrap elements with Document\n        documents = []\n\n        # join plain text elements\n        for page_number, txts in texts.items():\n            documents.append(\n                Document(\n                    text=\"\\n\".join(txts),\n                    metadata={\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                )\n            )\n\n        # table elements\n        for page_number, table_content, table_caption in tables:\n            documents.append(\n                Document(\n                    text=table_content,\n                    metadata={\n                        \"table_origin\": table_content,\n                        \"type\": \"table\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        # figure elements\n        for page_number, figure_content, figure_caption in figures:\n            documents.append(\n                Document(\n                    text=figure_caption,\n                    metadata={\n                        \"image_origin\": figure_content,\n                        \"type\": \"image\",\n                        \"page_label\": page_number,\n                        \"file_name\": filename,\n                        \"file_path\": filepath,\n                    },\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n        return documents\n
    "},{"location":"reference/loaders/adobe_loader/#loaders.adobe_loader.AdobeReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, **kwargs)\n

Load data by calling Adobe's API

Parameters:

- file (Path): Path to the PDF file. Required.

Returns:

- List[Document]: list of documents extracted from the PDF file, of three types: text, table, and image.

    Source code in libs/kotaemon/kotaemon/loaders/adobe_loader.py
def load_data(\n    self, file: Path, extra_info: Optional[Dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data by calling Adobe's API\n\n    Args:\n        file (Path): Path to the PDF file\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file,\n            includes 3 types: text, table, and image\n\n    \"\"\"\n    from .utils.adobe import (\n        generate_figure_captions,\n        load_json,\n        parse_figure_paths,\n        parse_table_paths,\n        request_adobe_service,\n    )\n\n    filename = file.name\n    filepath = str(Path(file).resolve())\n    output_path = request_adobe_service(file_path=str(file), output_path=\"\")\n    results_path = os.path.join(output_path, \"structuredData.json\")\n\n    if not os.path.exists(results_path):\n        logger.exception(\"Failed to parse the document.\")\n        return []\n\n    data = load_json(results_path)\n\n    texts = defaultdict(list)\n    tables = []\n    figures = []\n\n    elements = data[\"elements\"]\n    for item_id, item in enumerate(elements):\n        page_number = item.get(\"Page\", -1) + 1\n        item_path = item[\"Path\"]\n        item_text = item.get(\"Text\", \"\")\n\n        file_paths = [\n            Path(output_path) / path for path in item.get(\"filePaths\", [])\n        ]\n        prev_item = elements[item_id - 1]\n        title = prev_item.get(\"Text\", \"\")\n\n        if re.search(self.table_regex, item_path):\n            table_content = parse_table_paths(file_paths)\n            if not table_content:\n                continue\n            table_caption = (\n                table_content.replace(\"|\", \"\").replace(\"---\", \"\")\n                + f\"\\n(Table in Page {page_number}. {title})\"\n            )\n            tables.append((page_number, table_content, table_caption))\n\n        elif re.search(self.figure_regex, item_path):\n            figure_caption = (\n                item_text + f\"\\n(Figure in Page {page_number}. {title})\"\n            )\n            figure_content = parse_figure_paths(file_paths)\n            if not figure_content:\n                continue\n            figures.append([page_number, figure_content, figure_caption])\n\n        else:\n            if item_text and \"Table\" not in item_path and \"Figure\" not in item_path:\n                texts[page_number].append(item_text)\n\n    # get figure caption using GPT-4V\n    figure_captions = generate_figure_captions(\n        self.vlm_endpoint,\n        [item[1] for item in figures],\n        self.max_figures_to_caption,\n    )\n    for item, caption in zip(figures, figure_captions):\n        # update figure caption\n        item[2] += \" \" + caption\n\n    # Wrap elements with Document\n    documents = []\n\n    # join plain text elements\n    for page_number, txts in texts.items():\n        documents.append(\n            Document(\n                text=\"\\n\".join(txts),\n                metadata={\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n            )\n        )\n\n    # table elements\n    for page_number, table_content, table_caption in tables:\n        documents.append(\n            Document(\n                text=table_content,\n                metadata={\n                    \"table_origin\": table_content,\n                    \"type\": \"table\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n\n    # figure elements\n    for page_number, figure_content, figure_caption in figures:\n        documents.append(\n            Document(\n                text=figure_caption,\n                metadata={\n                    \"image_origin\": figure_content,\n                    \"type\": \"image\",\n                    \"page_label\": page_number,\n                    \"file_name\": filename,\n                    \"file_path\": filepath,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n        )\n    return documents\n
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/","title":"Azureai Document Intelligence Loader","text":""},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader","title":"AzureAIDocumentIntelligenceLoader","text":"

    Bases: BaseReader

Utilize Azure AI Document Intelligence to parse documents

As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff, heif, docx, xlsx, pptx, and html.

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
class AzureAIDocumentIntelligenceLoader(BaseReader):\n    \"\"\"Utilize Azure AI Document Intelligence to parse documents\n\n    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp, tiff,\n    heif, docx, xlsx, pptx and html.\n    \"\"\"\n\n    _dependencies = [\"azure-ai-documentintelligence\", \"PyMuPDF\", \"Pillow\"]\n\n    endpoint: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT\", None),\n        help=\"Endpoint of Azure AI Document Intelligence\",\n    )\n    credential: str = Param(\n        os.environ.get(\"AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL\", None),\n        help=\"Credential of Azure AI Document Intelligence\",\n    )\n    model: str = Param(\n        \"prebuilt-layout\",\n        help=(\n            \"Model to use for document analysis. Default is prebuilt-layout. \"\n            \"As of April 2024, you can view the supported models [here]\"\n            \"(https://learn.microsoft.com/en-us/azure/ai-services/\"\n            \"document-intelligence/concept-model-overview?view=doc-intel-4.0.0\"\n            \"#model-analysis-features)\"\n        ),\n    )\n    output_content_format: str = Param(\n        \"markdown\",\n        help=\"Output content format. Can be 'markdown' or 'text'. Default is markdown\",\n    )\n    vlm_endpoint: str = Param(\n        help=(\n            \"Default VLM endpoint for figure captioning. If not provided, will not \"\n            \"caption the figures\"\n        )\n    )\n    figure_friendly_filetypes: list[str] = Param(\n        [\".pdf\", \".jpeg\", \".jpg\", \".png\", \".bmp\", \".tiff\", \".heif\", \".tif\"],\n        help=(\n            \"File types that we can reliably open and extract figures. \"\n            \"For files like .docx or .html, the visual layout may be different \"\n            \"when viewed from different tools, hence we cannot use Azure DI \"\n            \"location to extract figures.\"\n        ),\n    )\n    cache_dir: str = Param(\n        None,\n        help=\"Directory to cache the downloaded files. Default is None\",\n    )\n\n    @Param.auto(depends_on=[\"endpoint\", \"credential\"])\n    def client_(self):\n        try:\n            from azure.ai.documentintelligence import DocumentIntelligenceClient\n            from azure.core.credentials import AzureKeyCredential\n        except ImportError:\n            raise ImportError(\"Please install azure-ai-documentintelligence\")\n\n        return DocumentIntelligenceClient(\n            self.endpoint, AzureKeyCredential(self.credential)\n        )\n\n    def run(\n        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n        metadata = extra_info or {}\n        file_name = Path(file_path)\n        with open(file_path, \"rb\") as fi:\n            poller = self.client_.begin_analyze_document(\n                self.model,\n                analyze_request=fi,\n                content_type=\"application/octet-stream\",\n                output_content_format=self.output_content_format,\n            )\n            result = poller.result()\n\n        # the total text content of the document in `output_content_format` format\n        text_content = result.content\n        removed_spans: list[dict] = []\n\n        # extract the figures\n        figures = []\n        for figure_desc in result.get(\"figures\", []):\n            if not self.vlm_endpoint:\n                continue\n            if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n                continue\n\n            # read & crop the image\n            page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n            page_width = result.pages[page_number - 1][\"width\"]\n            page_height = result.pages[page_number - 1][\"height\"]\n            polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n            xs = [polygon[i] for i in range(0, len(polygon), 2)]\n            ys = [polygon[i] for i in range(1, len(polygon), 2)]\n            bbox = [\n                min(xs) / page_width,\n                min(ys) / page_height,\n                max(xs) / page_width,\n                max(ys) / page_height,\n            ]\n            img = crop_image(file_path, bbox, page_number - 1)\n\n            # convert the image into base64\n            img_bytes = BytesIO()\n            img.save(img_bytes, format=\"PNG\")\n            img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n            img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n            # caption the image\n            caption = generate_single_figure_caption(\n                figure=img_base64, vlm_endpoint=self.vlm_endpoint\n            )\n\n            # store the image into document\n            figure_metadata = {\n                \"image_origin\": img_base64,\n                \"type\": \"image\",\n                \"page_label\": page_number,\n            }\n            figure_metadata.update(metadata)\n\n            figures.append(\n                Document(\n                    text=caption,\n                    metadata=figure_metadata,\n                )\n            )\n            removed_spans += figure_desc[\"spans\"]\n\n        # extract the tables\n        tables = []\n        for table_desc in result.get(\"tables\", []):\n            if not table_desc[\"spans\"]:\n                continue\n\n            # convert the tables into markdown format\n            boundingRegions = table_desc[\"boundingRegions\"]\n            if boundingRegions:\n                page_number = boundingRegions[0][\"pageNumber\"]\n            else:\n                page_number = 1\n\n            # store the tables into document\n            offset = table_desc[\"spans\"][0][\"offset\"]\n            length = table_desc[\"spans\"][0][\"length\"]\n            table_metadata = {\n                \"type\": \"table\",\n                \"page_label\": page_number,\n                \"table_origin\": text_content[offset : offset + length],\n            }\n            table_metadata.update(metadata)\n\n            tables.append(\n                Document(\n                    text=text_content[offset : offset + length],\n                    metadata=table_metadata,\n                )\n            )\n            removed_spans += table_desc[\"spans\"]\n        # save the text content into markdown format\n        if self.cache_dir is not None:\n            with open(\n                Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n            ) as f:\n                f.write(text_content)\n\n        removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n        for span in removed_spans:\n            text_content = (\n                text_content[: span[\"offset\"]]\n                + text_content[span[\"offset\"] + span[\"length\"] :]\n            )\n\n        return [Document(content=text_content, metadata=metadata)] + figures + tables\n
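A sketch under stated assumptions: the endpoint and key are placeholders (they can also come from the `AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT` / `AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL` environment variables), and the import follows this module's path:

```python
from pathlib import Path

from kotaemon.loaders.azureai_document_intelligence_loader import (
    AzureAIDocumentIntelligenceLoader,
)

loader = AzureAIDocumentIntelligenceLoader(
    endpoint="https://<resource>.cognitiveservices.azure.com/",  # placeholder
    credential="<api-key>",  # placeholder
    output_content_format="markdown",
)
docs = loader.load_data(Path("contract.pdf"))  # placeholder path
text, extras = docs[0], docs[1:]  # full text first, then figures and tables
```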
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.AzureAIDocumentIntelligenceLoader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Extract the input file, allowing multi-modal extraction

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Extract the input file, allowing multi-modal extraction\"\"\"\n    metadata = extra_info or {}\n    file_name = Path(file_path)\n    with open(file_path, \"rb\") as fi:\n        poller = self.client_.begin_analyze_document(\n            self.model,\n            analyze_request=fi,\n            content_type=\"application/octet-stream\",\n            output_content_format=self.output_content_format,\n        )\n        result = poller.result()\n\n    # the total text content of the document in `output_content_format` format\n    text_content = result.content\n    removed_spans: list[dict] = []\n\n    # extract the figures\n    figures = []\n    for figure_desc in result.get(\"figures\", []):\n        if not self.vlm_endpoint:\n            continue\n        if file_path.suffix.lower() not in self.figure_friendly_filetypes:\n            continue\n\n        # read & crop the image\n        page_number = figure_desc[\"boundingRegions\"][0][\"pageNumber\"]\n        page_width = result.pages[page_number - 1][\"width\"]\n        page_height = result.pages[page_number - 1][\"height\"]\n        polygon = figure_desc[\"boundingRegions\"][0][\"polygon\"]\n        xs = [polygon[i] for i in range(0, len(polygon), 2)]\n        ys = [polygon[i] for i in range(1, len(polygon), 2)]\n        bbox = [\n            min(xs) / page_width,\n            min(ys) / page_height,\n            max(xs) / page_width,\n            max(ys) / page_height,\n        ]\n        img = crop_image(file_path, bbox, page_number - 1)\n\n        # convert the image into base64\n        img_bytes = BytesIO()\n        img.save(img_bytes, format=\"PNG\")\n        img_base64 = base64.b64encode(img_bytes.getvalue()).decode(\"utf-8\")\n        img_base64 = f\"data:image/png;base64,{img_base64}\"\n\n        # caption the image\n        caption = generate_single_figure_caption(\n            figure=img_base64, vlm_endpoint=self.vlm_endpoint\n        )\n\n        # store the image into document\n        figure_metadata = {\n            \"image_origin\": img_base64,\n            \"type\": \"image\",\n            \"page_label\": page_number,\n        }\n        figure_metadata.update(metadata)\n\n        figures.append(\n            Document(\n                text=caption,\n                metadata=figure_metadata,\n            )\n        )\n        removed_spans += figure_desc[\"spans\"]\n\n    # extract the tables\n    tables = []\n    for table_desc in result.get(\"tables\", []):\n        if not table_desc[\"spans\"]:\n            continue\n\n        # convert the tables into markdown format\n        boundingRegions = table_desc[\"boundingRegions\"]\n        if boundingRegions:\n            page_number = boundingRegions[0][\"pageNumber\"]\n        else:\n            page_number = 1\n\n        # store the tables into document\n        offset = table_desc[\"spans\"][0][\"offset\"]\n        length = table_desc[\"spans\"][0][\"length\"]\n        table_metadata = {\n            \"type\": \"table\",\n            \"page_label\": page_number,\n            \"table_origin\": text_content[offset : offset + length],\n        }\n        table_metadata.update(metadata)\n\n        tables.append(\n            Document(\n                text=text_content[offset : offset + length],\n                metadata=table_metadata,\n            )\n        )\n        removed_spans += table_desc[\"spans\"]\n    # save the text content into markdown format\n    if self.cache_dir is not None:\n        with open(\n            Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\", encoding=\"utf-8\"\n        ) as f:\n            f.write(text_content)\n\n    removed_spans = sorted(removed_spans, key=lambda x: x[\"offset\"], reverse=True)\n    for span in removed_spans:\n        text_content = (\n            text_content[: span[\"offset\"]]\n            + text_content[span[\"offset\"] + span[\"length\"] :]\n        )\n\n    return [Document(content=text_content, metadata=metadata)] + figures + tables\n
    "},{"location":"reference/loaders/azureai_document_intelligence_loader/#loaders.azureai_document_intelligence_loader.crop_image","title":"crop_image","text":"
    crop_image(file_path, bbox, page_number=0)\n

    Crop the image based on the bounding box

Parameters:

- file_path (Path): path to the image file. Required.
- bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1]). Required.
- page_number (int): page number of the image. Defaults to 0.

Returns:

- Image.Image: the cropped image.

    Source code in libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py
    def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:\n    \"\"\"Crop the image based on the bounding box\n\n    Args:\n        file_path (Path): path to the image file\n        bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1])\n        page_number (int, optional): page number of the image. Defaults to 0.\n\n    Returns:\n        Image.Image: cropped image\n    \"\"\"\n    left, upper, right, lower = bbox\n\n    img: Image.Image\n    suffix = file_path.suffix.lower()\n    if suffix == \".pdf\":\n        try:\n            import fitz\n        except ImportError:\n            raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n        doc = fitz.open(file_path)\n        page = doc.load_page(page_number)\n        pm = page.get_pixmap(dpi=150)\n        img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n    elif suffix in [\".tif\", \".tiff\"]:\n        img = Image.open(file_path)\n        img.seek(page_number)\n    else:\n        img = Image.open(file_path)\n\n    return img.crop(\n        (\n            int(left * img.width),\n            int(upper * img.height),\n            int(right * img.width),\n            int(lower * img.height),\n        )\n    )\n
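For illustration, a sketch that crops the centre quarter of the first page of a PDF (the path is a placeholder); the bbox is fractional `[x0, y0, x1, y1]`:

```python
from pathlib import Path

from kotaemon.loaders.azureai_document_intelligence_loader import crop_image

# page_number is 0-based; bbox values are fractions of page width/height
img = crop_image(Path("scan.pdf"), bbox=[0.25, 0.25, 0.75, 0.75], page_number=0)
img.save("crop.png")
```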
    "},{"location":"reference/loaders/base/","title":"Base","text":""},{"location":"reference/loaders/base/#loaders.base.BaseReader","title":"BaseReader","text":"

    Bases: BaseComponent

    The base class for all readers

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class BaseReader(BaseComponent):\n    \"\"\"The base class for all readers\"\"\"\n\n    ...\n
    "},{"location":"reference/loaders/base/#loaders.base.AutoReader","title":"AutoReader","text":"

    Bases: BaseReader

    General auto reader for a variety of files. (based on llama-hub)

    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class AutoReader(BaseReader):\n    \"\"\"General auto reader for a variety of files. (based on llama-hub)\"\"\"\n\n    def __init__(self, reader_type: Union[str, Type[\"LIBaseReader\"]]) -> None:\n        \"\"\"Init reader using string identifier or class name from llama-hub\"\"\"\n\n        if isinstance(reader_type, str):\n            from llama_index.core import download_loader\n\n            self._reader = download_loader(reader_type)()\n        else:\n            self._reader = reader_type()\n        super().__init__()\n\n    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(file=file, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:\n        return self.load_data(file=file, **kwargs)\n
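A usage sketch; `"PDFReader"` is an illustrative llama-hub identifier (any valid llama-hub reader name, or a reader class passed directly, should work):

```python
from kotaemon.loaders import AutoReader

reader = AutoReader("PDFReader")  # fetches the loader from llama-hub
docs = reader.run("paper.pdf")    # placeholder path; run() proxies load_data()
```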
    "},{"location":"reference/loaders/base/#loaders.base.LIReaderMixin","title":"LIReaderMixin","text":"

    Bases: BaseComponent

    Base wrapper around llama-index reader

    To use the LIBaseReader, you need to implement the _get_wrapped_class method to return the relevant llama-index reader class that you want to wrap.

    Example:

    ```python\nclass DirectoryReader(LIBaseReader):\n    def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n        from llama_index import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n```\n
    Source code in libs/kotaemon/kotaemon/loaders/base.py
    class LIReaderMixin(BaseComponent):\n    \"\"\"Base wrapper around llama-index reader\n\n    To use the LIBaseReader, you need to implement the _get_wrapped_class method to\n    return the relevant llama-index reader class that you want to wrap.\n\n    Example:\n\n        ```python\n        class DirectoryReader(LIBaseReader):\n            def _get_wrapped_class(self) -> Type[\"BaseReader\"]:\n                from llama_index import SimpleDirectoryReader\n\n                return SimpleDirectoryReader\n        ```\n    \"\"\"\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        raise NotImplementedError(\n            \"Please return the relevant llama-index class in in _get_wrapped_class\"\n        )\n\n    def __init__(self, *args, **kwargs):\n        self._reader_class = self._get_wrapped_class()\n        self._reader = self._reader_class(*args, **kwargs)\n        super().__init__()\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\"):\n            return super().__setattr__(name, value)\n\n        return setattr(self._reader, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        return getattr(self._reader, name)\n\n    def load_data(self, *args, **kwargs: Any) -> List[Document]:\n        documents = self._reader.load_data(*args, **kwargs)\n\n        # convert Document to new base class from kotaemon\n        converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents]\n        return converted_documents\n\n    def run(self, *args, **kwargs: Any) -> List[Document]:\n        return self.load_data(*args, **kwargs)\n
    "},{"location":"reference/loaders/composite_loader/","title":"Composite Loader","text":""},{"location":"reference/loaders/composite_loader/#loaders.composite_loader.DirectoryReader","title":"DirectoryReader","text":"

    Bases: LIReaderMixin, BaseReader

    Wrap around llama-index SimpleDirectoryReader

Parameters:

- input_dir (str): Path to the directory.
- input_files (List): List of file paths to read (Optional; overrides input_dir, exclude).
- exclude (List): glob of python file paths to exclude (Optional).
- exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
- encoding (str): Encoding of the files. Default is utf-8.
- errors (str): how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open
- recursive (bool): Whether to recursively search in subdirectories. False by default.
- filename_as_id (bool): Whether to use the filename as the document id. False by default.
- required_exts (Optional[List[str]]): List of required extensions. Default is None.
- file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
- num_files_limit (Optional[int]): Maximum number of files to read. Default is None.
- file_metadata (Optional[Callable[str, Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.

Source code in libs/kotaemon/kotaemon/loaders/composite_loader.py
    class DirectoryReader(LIReaderMixin, BaseReader):\n    \"\"\"Wrap around llama-index SimpleDirectoryReader\n\n    Args:\n        input_dir (str): Path to the directory.\n        input_files (List): List of file paths to read\n            (Optional; overrides input_dir, exclude)\n        exclude (List): glob of python file paths to exclude (Optional)\n        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).\n        encoding (str): Encoding of the files.\n            Default is utf-8.\n        errors (str): how encoding and decoding errors are to be handled,\n              see https://docs.python.org/3/library/functions.html#open\n        recursive (bool): Whether to recursively search in subdirectories.\n            False by default.\n        filename_as_id (bool): Whether to use the filename as the document id.\n            False by default.\n        required_exts (Optional[List[str]]): List of required extensions.\n            Default is None.\n        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file\n            extension to a BaseReader class that specifies how to convert that file\n            to text. If not specified, use default from DEFAULT_FILE_READER_CLS.\n        num_files_limit (Optional[int]): Maximum number of files to read.\n            Default is None.\n        file_metadata (Optional[Callable[str, Dict]]): A function that takes\n            in a filename and returns a Dict of metadata for the Document.\n            Default is None.\n    \"\"\"\n\n    input_dir: Optional[str] = None\n    input_files: Optional[List] = None\n    exclude: Optional[List] = None\n    exclude_hidden: bool = True\n    errors: str = \"ignore\"\n    recursive: bool = False\n    encoding: str = \"utf-8\"\n    filename_as_id: bool = False\n    required_exts: Optional[list[str]] = None\n    file_extractor: Optional[dict[str, \"LIBaseReader\"]] = None\n    num_files_limit: Optional[int] = None\n    file_metadata: Optional[Callable[[str], dict]] = None\n\n    def _get_wrapped_class(self) -> Type[\"LIBaseReader\"]:\n        from llama_index.core import SimpleDirectoryReader\n\n        return SimpleDirectoryReader\n
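A usage sketch (the directory and extensions are placeholders); constructor keyword arguments are forwarded to the wrapped SimpleDirectoryReader:

```python
from kotaemon.loaders import DirectoryReader

reader = DirectoryReader(
    input_dir="docs/",             # placeholder directory
    recursive=True,
    required_exts=[".md", ".txt"],
)
documents = reader.load_data()     # returns kotaemon Documents
```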
    "},{"location":"reference/loaders/docx_loader/","title":"Docx Loader","text":""},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader","title":"DocxReader","text":"

    Bases: BaseReader

Read Docx files, respecting tables, using the python-docx library

Reader behavior:

- All paragraphs are extracted as a Document
- Each table is extracted as a Document, rendered as a CSV string
- The output is a list of Documents, concatenating the above (tables + paragraphs)

Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    class DocxReader(BaseReader):\n    \"\"\"Read Docx files that respect table, using python-docx library\n\n    Reader behavior:\n        - All paragraphs are extracted as a Document\n        - Each table is extracted as a Document, rendered as a CSV string\n        - The output is a list of Documents, concatenating the above\n        (tables + paragraphs)\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        try:\n            import docx  # noqa\n        except ImportError:\n            raise ImportError(\n                \"docx is not installed. \"\n                \"Please install it using `pip install python-docx`\"\n            )\n\n    def _load_single_table(self, table) -> List[List[str]]:\n        \"\"\"Extract content from tables. Return a list of columns: list[str]\n        Some merged cells will share duplicated content.\n        \"\"\"\n        n_row = len(table.rows)\n        n_col = len(table.columns)\n\n        arrays = [[\"\" for _ in range(n_row)] for _ in range(n_col)]\n\n        for i, row in enumerate(table.rows):\n            for j, cell in enumerate(row.cells):\n                arrays[j][i] = cell.text\n\n        return arrays\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using Docx reader\n\n        Args:\n            file_path (Path): Path to .docx file\n\n        Returns:\n            List[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import docx\n\n        file_path = Path(file_path).resolve()\n\n        doc = docx.Document(str(file_path))\n        all_text = \"\\n\".join(\n            [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n        )\n        pages = [all_text]  # 1 page only\n\n        tables = []\n        for t in doc.tables:\n            # return list of columns: list of string\n            arrays = self._load_single_table(t)\n\n            tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=table.to_csv(\n                    index=False\n                ).strip(),  # strip_special_chars_markdown()\n                metadata={\n                    \"table_origin\": table.to_csv(index=False),\n                    \"type\": \"table\",\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for table in tables  # page_id\n        ]\n\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text.strip(),\n                    metadata={\"page_label\": 1, **extra_info},\n                )\n                for _, non_table_text in enumerate(pages)\n            ]\n        )\n\n        return documents\n
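A usage sketch assuming python-docx is installed; `spec.docx` is a placeholder:

```python
from pathlib import Path

from kotaemon.loaders import DocxReader

reader = DocxReader()
docs = reader.load_data(Path("spec.docx"))  # placeholder path
tables = [d for d in docs if d.metadata.get("type") == "table"]  # CSV-rendered
body = [d for d in docs if d.metadata.get("type") != "table"]
```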
    "},{"location":"reference/loaders/docx_loader/#loaders.docx_loader.DocxReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using Docx reader

Parameters:

- file_path (Path): Path to the .docx file. Required.

Returns:

- List[Document]: list of documents extracted from the .docx file.

    Source code in libs/kotaemon/kotaemon/loaders/docx_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using Docx reader\n\n    Args:\n        file_path (Path): Path to .docx file\n\n    Returns:\n        List[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import docx\n\n    file_path = Path(file_path).resolve()\n\n    doc = docx.Document(str(file_path))\n    all_text = \"\\n\".join(\n        [unicodedata.normalize(\"NFKC\", p.text) for p in doc.paragraphs]\n    )\n    pages = [all_text]  # 1 page only\n\n    tables = []\n    for t in doc.tables:\n        # return list of columns: list of string\n        arrays = self._load_single_table(t)\n\n        tables.append(pd.DataFrame({a[0]: a[1:] for a in arrays}))\n\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=table.to_csv(\n                index=False\n            ).strip(),  # strip_special_chars_markdown()\n            metadata={\n                \"table_origin\": table.to_csv(index=False),\n                \"type\": \"table\",\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for table in tables  # page_id\n    ]\n\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text.strip(),\n                metadata={\"page_label\": 1, **extra_info},\n            )\n            for _, non_table_text in enumerate(pages)\n        ]\n    )\n\n    return documents\n
    "},{"location":"reference/loaders/excel_loader/","title":"Excel Loader","text":"

    Pandas Excel reader.

    Pandas parser for .xlsx files.

    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader","title":"PandasExcelReader","text":"

    Bases: BaseReader

Pandas-based Excel parser.

Parses Excel workbooks using the pandas read_excel function. If special parameters are required, use the pandas_config dict.

Args:

- pandas_config (dict): Options for the pandas.read_excel function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information. Set to an empty dict by default, which means defaults will be used.
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class PandasExcelReader(BaseReader):\n    r\"\"\"Pandas-based CSV parser.\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = False,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n        import itertools\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        df_sheets = []\n\n        for key in sheet_names:\n            sheet = []\n            if include_sheetname:\n                sheet.append([key])\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key].fillna(\"\", inplace=True)\n            sheet.extend(dfs[key].values.astype(str).tolist())\n            df_sheets.append(sheet)\n\n        text_list = list(\n            itertools.chain.from_iterable(df_sheets)\n        )  # flatten list of lists\n\n        output = [\n            Document(\n                text=self._row_joiner.join(\n                    self._col_joiner.join(sublist) for sublist in text_list\n                ),\n                metadata=extra_info or {},\n            )\n        ]\n\n        return output\n
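A usage sketch (placeholder file): all sheets are flattened into a single Document whose rows and columns are joined with `row_joiner`/`col_joiner`:

```python
from pathlib import Path

from kotaemon.loaders import PandasExcelReader

reader = PandasExcelReader(row_joiner="\n", col_joiner=" ")
docs = reader.load_data(Path("data.xlsx"), include_sheetname=True)
print(docs[0].text[:200])  # one Document for the whole workbook
```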
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.PandasExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=False,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the Excel file and extract its cell values.

Parameters:

- file (Path): The path to the Excel file to read. Required.
- include_sheetname (bool): Whether to include the sheet name in the output. Defaults to False.
- sheet_name (Union[str, int, None]): The specific sheet to read from. Default is None, which reads all sheets.

Returns:

- List[Document]: A list of Document objects containing the values from the Excel file.

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = False,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse file and extract values from a specific column.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of`Document objects containing the\n            values from the specified column in the Excel file.\n    \"\"\"\n    import itertools\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    df_sheets = []\n\n    for key in sheet_names:\n        sheet = []\n        if include_sheetname:\n            sheet.append([key])\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key].fillna(\"\", inplace=True)\n        sheet.extend(dfs[key].values.astype(str).tolist())\n        df_sheets.append(sheet)\n\n    text_list = list(\n        itertools.chain.from_iterable(df_sheets)\n    )  # flatten list of lists\n\n    output = [\n        Document(\n            text=self._row_joiner.join(\n                self._col_joiner.join(sublist) for sublist in text_list\n            ),\n            metadata=extra_info or {},\n        )\n    ]\n\n    return output\n
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader","title":"ExcelReader","text":"

    Bases: BaseReader

Spreadsheet reader respecting multiple worksheets

Parses Excel workbooks using the pandas read_excel function. If special parameters are required, use the pandas_config dict.

Args:

- pandas_config (dict): Options for the pandas.read_excel function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html for more information. Set to an empty dict by default, which means defaults will be used.
    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
    class ExcelReader(BaseReader):\n    r\"\"\"Spreadsheet exporter respecting multiple worksheets\n\n    Parses CSVs using the separator detection from Pandas `read_csv` function.\n    If special parameters are required, use the `pandas_config` dict.\n\n    Args:\n\n        pandas_config (dict): Options for the `pandas.read_excel` function call.\n            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html\n            for more information. Set to empty dict by default,\n            this means defaults will be used.\n\n    \"\"\"\n\n    def __init__(\n        self,\n        *args: Any,\n        pandas_config: Optional[dict] = None,\n        row_joiner: str = \"\\n\",\n        col_joiner: str = \" \",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args, **kwargs)\n        self._pandas_config = pandas_config or {}\n        self._row_joiner = row_joiner if row_joiner else \"\\n\"\n        self._col_joiner = col_joiner if col_joiner else \" \"\n\n    def load_data(\n        self,\n        file: Path,\n        include_sheetname: bool = True,\n        sheet_name: Optional[Union[str, int, list]] = None,\n        extra_info: Optional[dict] = None,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"Parse file and extract values from a specific column.\n\n        Args:\n            file (Path): The path to the Excel file to read.\n            include_sheetname (bool): Whether to include the sheet name in the output.\n            sheet_name (Union[str, int, None]): The specific sheet to read from,\n                default is None which reads all sheets.\n\n        Returns:\n            List[Document]: A list of`Document objects containing the\n                values from the specified column in the Excel file.\n        \"\"\"\n\n        try:\n            import pandas as pd\n        except ImportError:\n            raise ImportError(\n                \"install pandas using `pip3 install pandas` to use this loader\"\n            )\n\n        if sheet_name is not None:\n            sheet_name = (\n                [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n            )\n\n        # clean up input\n        file = Path(file)\n        extra_info = extra_info or {}\n\n        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n        sheet_names = dfs.keys()\n        output = []\n\n        for idx, key in enumerate(sheet_names):\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n            dfs[key] = dfs[key].astype(\"object\")\n            dfs[key].fillna(\"\", inplace=True)\n\n            rows = dfs[key].values.astype(str).tolist()\n            content = self._row_joiner.join(\n                self._col_joiner.join(row).strip() for row in rows\n            ).strip()\n            if include_sheetname:\n                content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n            metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n            output.append(Document(text=content, metadata=metadata))\n\n        return output\n
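A usage sketch (placeholder file); unlike PandasExcelReader, this yields one Document per worksheet:

```python
from pathlib import Path

from kotaemon.loaders import ExcelReader

reader = ExcelReader()
docs = reader.load_data(Path("report.xlsx"))  # placeholder path
for doc in docs:
    print(doc.metadata["sheet_name"], doc.metadata["page_label"])
```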
    "},{"location":"reference/loaders/excel_loader/#loaders.excel_loader.ExcelReader.load_data","title":"load_data","text":"
    load_data(\n    file,\n    include_sheetname=True,\n    sheet_name=None,\n    extra_info=None,\n    **kwargs\n)\n

Parse the Excel file and extract its cell values.

Parameters:

- file (Path): The path to the Excel file to read. Required.
- include_sheetname (bool): Whether to include the sheet name in the output. Defaults to True.
- sheet_name (Union[str, int, None]): The specific sheet to read from. Default is None, which reads all sheets.

Returns:

- List[Document]: A list of Document objects containing the values from the Excel file.

    Source code in libs/kotaemon/kotaemon/loaders/excel_loader.py
def load_data(\n    self,\n    file: Path,\n    include_sheetname: bool = True,\n    sheet_name: Optional[Union[str, int, list]] = None,\n    extra_info: Optional[dict] = None,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"Parse the file and extract values from each worksheet.\n\n    Args:\n        file (Path): The path to the Excel file to read.\n        include_sheetname (bool): Whether to include the sheet name in the output.\n        sheet_name (Union[str, int, None]): The specific sheet to read from,\n            default is None which reads all sheets.\n\n    Returns:\n        List[Document]: A list of Document objects containing the\n            values from each worksheet of the Excel file.\n    \"\"\"\n\n    try:\n        import pandas as pd\n    except ImportError:\n        raise ImportError(\n            \"install pandas using `pip3 install pandas` to use this loader\"\n        )\n\n    if sheet_name is not None:\n        sheet_name = (\n            [sheet_name] if not isinstance(sheet_name, list) else sheet_name\n        )\n\n    # clean up input\n    file = Path(file)\n    extra_info = extra_info or {}\n\n    dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)\n    sheet_names = dfs.keys()\n    output = []\n\n    for idx, key in enumerate(sheet_names):\n        # drop fully-empty rows, then stringify the remaining cells\n        dfs[key] = dfs[key].dropna(axis=0, how=\"all\")\n        dfs[key] = dfs[key].astype(\"object\")\n        dfs[key].fillna(\"\", inplace=True)\n\n        rows = dfs[key].values.astype(str).tolist()\n        content = self._row_joiner.join(\n            self._col_joiner.join(row).strip() for row in rows\n        ).strip()\n        if include_sheetname:\n            content = f\"(Sheet {key} of file {file.name})\\n{content}\"\n        metadata = {\"page_label\": idx + 1, \"sheet_name\": key, **extra_info}\n        output.append(Document(text=content, metadata=metadata))\n\n    return output\n
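A minimal usage sketch, assuming `ExcelReader` is exported from `kotaemon.loaders` like the other loaders here; the `example.xlsx` file name is hypothetical:

from pathlib import Path\nfrom kotaemon.loaders import ExcelReader\n\n# join cells with \" | \" instead of the default single space\nreader = ExcelReader(col_joiner=\" | \")\ndocs = reader.load_data(Path(\"example.xlsx\"), include_sheetname=True)\nfor doc in docs:\n    # one Document per worksheet, numbered from 1\n    print(doc.metadata[\"sheet_name\"], doc.metadata[\"page_label\"])\n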
    "},{"location":"reference/loaders/html_loader/","title":"Html Loader","text":""},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader","title":"HtmlReader","text":"

    Bases: BaseReader

Read HTML using html2text

Reader behavior:
- HTML is read with html2text.
- All of the text will be split by `page_break_pattern`.
- Each page is extracted as a Document.
- The output is a list of Documents.

Parameters:

page_break_pattern (str): Pattern to split the HTML into pages. Default: None.

Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class HtmlReader(BaseReader):\n    \"\"\"Read HTML using html2text\n\n    Reader behavior:\n        - HTML is read with html2text.\n        - All of the text will be split by `page_break_pattern`\n        - Each page is extracted as a Document\n        - The output is a list of Documents\n\n    Args:\n        page_break_pattern (str): Pattern to split the HTML into pages\n    \"\"\"\n\n    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):\n        try:\n            import html2text  # noqa\n        except ImportError:\n            raise ImportError(\n                \"html2text is not installed. \"\n                \"Please install it using `pip install html2text`\"\n            )\n\n        self._page_break_pattern: Optional[str] = page_break_pattern\n        super().__init__()\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load data using Html reader\n\n        Args:\n            file_path: path to HTML file\n            extra_info: extra information passed to this reader during extracting data\n\n        Returns:\n            list[Document]: list of documents extracted from the HTML file\n        \"\"\"\n        import html2text\n\n        file_path = Path(file_path).resolve()\n\n        with file_path.open(\"r\") as f:\n            html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n        # read HTML\n        all_text = html2text.html2text(html_text)\n        pages = (\n            all_text.split(self._page_break_pattern)\n            if self._page_break_pattern\n            else [all_text]\n        )\n\n        extra_info = extra_info or {}\n\n        # create Document from non-table text\n        documents = [\n            Document(\n                text=page.strip(),\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, page in enumerate(pages)\n        ]\n\n        return documents\n
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.HtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

Load data using Html reader

Parameters:

file_path (Path | str): path to HTML file. Required.
extra_info (Optional[dict]): extra information passed to this reader during extracting data. Default: None.

Returns:

list[Document]: list of documents extracted from the HTML file.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load data using Html reader\n\n    Args:\n        file_path: path to HTML file\n        extra_info: extra information passed to this reader during extracting data\n\n    Returns:\n        list[Document]: list of documents extracted from the HTML file\n    \"\"\"\n    import html2text\n\n    file_path = Path(file_path).resolve()\n\n    with file_path.open(\"r\") as f:\n        html_text = \"\".join([line[:-1] for line in f.readlines()])\n\n    # read HTML\n    all_text = html2text.html2text(html_text)\n    pages = (\n        all_text.split(self._page_break_pattern)\n        if self._page_break_pattern\n        else [all_text]\n    )\n\n    extra_info = extra_info or {}\n\n    # create Document from non-table text\n    documents = [\n        Document(\n            text=page.strip(),\n            metadata={\"page_label\": page_id + 1, **extra_info},\n        )\n        for page_id, page in enumerate(pages)\n    ]\n\n    return documents\n
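A short sketch of page splitting; the `report.html` path and the form-feed break pattern are assumptions:

from kotaemon.loaders import HtmlReader\n\n# one Document per form-feed-separated chunk of the converted text\nreader = HtmlReader(page_break_pattern=\"\\f\")\ndocs = reader.load_data(\"report.html\", extra_info={\"source\": \"report.html\"})\nprint(len(docs), docs[0].metadata[\"page_label\"])  # page_label starts at 1\n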
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader","title":"MhtmlReader","text":"

    Bases: BaseReader

    Parse MHTML files with BeautifulSoup.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
class MhtmlReader(BaseReader):\n    \"\"\"Parse `MHTML` files with `BeautifulSoup`.\"\"\"\n\n    def __init__(\n        self,\n        cache_dir: Optional[str] = getattr(\n            flowsettings, \"KH_MARKDOWN_OUTPUT_DIR\", None\n        ),\n        open_encoding: Optional[str] = None,\n        bs_kwargs: Optional[dict] = None,\n        get_text_separator: str = \"\",\n    ) -> None:\n        \"\"\"Initialize with the file encoding to use and any kwargs\n        to pass to the BeautifulSoup object.\n\n        Args:\n            cache_dir: Path for markdown output.\n            open_encoding: The encoding to use when opening the file.\n            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.\n            get_text_separator: The separator to use when getting the text\n                from the soup.\n        \"\"\"\n        try:\n            import bs4  # noqa:F401\n        except ImportError:\n            raise ImportError(\n                \"beautifulsoup4 package not found, please install it with \"\n                \"`pip install beautifulsoup4`\"\n            )\n\n        self.cache_dir = cache_dir\n        self.open_encoding = open_encoding\n        if bs_kwargs is None:\n            bs_kwargs = {\"features\": \"lxml\"}\n        self.bs_kwargs = bs_kwargs\n        self.get_text_separator = get_text_separator\n\n    def load_data(\n        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n    ) -> list[Document]:\n        \"\"\"Load MHTML document into document objects.\"\"\"\n\n        from bs4 import BeautifulSoup\n\n        extra_info = extra_info or {}\n        metadata: dict = extra_info\n        page = []\n        file_name = Path(file_path)\n        with open(file_path, \"r\", encoding=self.open_encoding) as f:\n            message = email.message_from_string(f.read())\n            parts = message.get_payload()\n\n            if not isinstance(parts, list):\n                parts = [message]\n\n            for part in parts:\n                if part.get_content_type() == \"text/html\":\n                    html = part.get_payload(decode=True).decode()\n\n                    soup = BeautifulSoup(html, **self.bs_kwargs)\n                    text = soup.get_text(self.get_text_separator)\n\n                    if soup.title:\n                        title = str(soup.title.string)\n                    else:\n                        title = \"\"\n\n                    metadata = {\n                        \"source\": str(file_path),\n                        \"title\": title,\n                        **extra_info,\n                    }\n                    lines = [line for line in text.split(\"\\n\") if line.strip()]\n                    text = \"\\n\\n\".join(lines)\n                    if text:\n                        page.append(text)\n        # save the page into markdown format\n        print(self.cache_dir)\n        if self.cache_dir is not None:\n            print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n            with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n                f.write(page[0])\n\n        return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
    "},{"location":"reference/loaders/html_loader/#loaders.html_loader.MhtmlReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load MHTML document into document objects.

    Source code in libs/kotaemon/kotaemon/loaders/html_loader.py
    def load_data(\n    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs\n) -> list[Document]:\n    \"\"\"Load MHTML document into document objects.\"\"\"\n\n    from bs4 import BeautifulSoup\n\n    extra_info = extra_info or {}\n    metadata: dict = extra_info\n    page = []\n    file_name = Path(file_path)\n    with open(file_path, \"r\", encoding=self.open_encoding) as f:\n        message = email.message_from_string(f.read())\n        parts = message.get_payload()\n\n        if not isinstance(parts, list):\n            parts = [message]\n\n        for part in parts:\n            if part.get_content_type() == \"text/html\":\n                html = part.get_payload(decode=True).decode()\n\n                soup = BeautifulSoup(html, **self.bs_kwargs)\n                text = soup.get_text(self.get_text_separator)\n\n                if soup.title:\n                    title = str(soup.title.string)\n                else:\n                    title = \"\"\n\n                metadata = {\n                    \"source\": str(file_path),\n                    \"title\": title,\n                    **extra_info,\n                }\n                lines = [line for line in text.split(\"\\n\") if line.strip()]\n                text = \"\\n\\n\".join(lines)\n                if text:\n                    page.append(text)\n    # save the page into markdown format\n    print(self.cache_dir)\n    if self.cache_dir is not None:\n        print(Path(self.cache_dir) / f\"{file_name.stem}.md\")\n        with open(Path(self.cache_dir) / f\"{file_name.stem}.md\", \"w\") as f:\n            f.write(page[0])\n\n    return [Document(text=\"\\n\\n\".join(page), metadata=metadata)]\n
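A minimal sketch, assuming a saved `page.mhtml` file; when `cache_dir` is set, a markdown copy of the first extracted page is also written there:

from kotaemon.loaders import MhtmlReader\n\nreader = MhtmlReader(cache_dir=\"/tmp/mhtml_cache\")\ndocs = reader.load_data(\"page.mhtml\")\n# a single Document whose metadata carries the source path and <title>\nprint(docs[0].metadata[\"title\"])\n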
    "},{"location":"reference/loaders/mathpix_loader/","title":"Mathpix Loader","text":""},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader","title":"MathpixPDFReader","text":"

    Bases: BaseReader

    Load PDF files using Mathpix service.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
class MathpixPDFReader(BaseReader):\n    \"\"\"Load `PDF` files using `Mathpix` service.\"\"\"\n\n    def __init__(\n        self,\n        processed_file_format: str = \"md\",\n        max_wait_time_seconds: int = 500,\n        should_clean_pdf: bool = True,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize with a file path.\n\n        Args:\n            processed_file_format: a format of the processed file. Default is \"md\".\n            max_wait_time_seconds: a maximum time to wait for the response from\n                the server. Default is 500.\n            should_clean_pdf: a flag to clean the PDF file. Default is True.\n            **kwargs: additional keyword arguments.\n        \"\"\"\n        self.mathpix_api_key = get_from_dict_or_env(\n            kwargs, \"mathpix_api_key\", \"MATHPIX_API_KEY\", default=\"empty\"\n        )\n        self.mathpix_api_id = get_from_dict_or_env(\n            kwargs, \"mathpix_api_id\", \"MATHPIX_API_ID\", default=\"empty\"\n        )\n        self.processed_file_format = processed_file_format\n        self.max_wait_time_seconds = max_wait_time_seconds\n        self.should_clean_pdf = should_clean_pdf\n        super().__init__()\n\n    @property\n    def _mathpix_headers(self) -> Dict[str, str]:\n        return {\"app_id\": self.mathpix_api_id, \"app_key\": self.mathpix_api_key}\n\n    @property\n    def url(self) -> str:\n        return \"https://api.mathpix.com/v3/pdf\"\n\n    @property\n    def data(self) -> dict:\n        options = {\n            \"conversion_formats\": {self.processed_file_format: True},\n            \"enable_tables_fallback\": True,\n        }\n        return {\"options_json\": json.dumps(options)}\n\n    def send_pdf(self, file_path) -> str:\n        with open(file_path, \"rb\") as f:\n            files = {\"file\": f}\n            response = requests.post(\n                self.url, headers=self._mathpix_headers, files=files, data=self.data\n            )\n        response_data = response.json()\n        if \"pdf_id\" in response_data:\n            pdf_id = response_data[\"pdf_id\"]\n            return pdf_id\n        else:\n            raise ValueError(\"Unable to send PDF to Mathpix.\")\n\n    def wait_for_processing(self, pdf_id: str) -> None:\n        \"\"\"Wait for processing to complete.\n\n        Args:\n            pdf_id: a PDF id.\n\n        Returns: None\n        \"\"\"\n        url = self.url + \"/\" + pdf_id\n        for _ in range(0, self.max_wait_time_seconds, 5):\n            response = requests.get(url, headers=self._mathpix_headers)\n            response_data = response.json()\n            status = response_data.get(\"status\", None)\n\n            if status == \"completed\":\n                return\n            elif status == \"error\":\n                raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n            else:\n                print(response_data)\n                print(url)\n                time.sleep(5)\n        raise TimeoutError\n\n    def get_processed_pdf(self, pdf_id: str) -> str:\n        self.wait_for_processing(pdf_id)\n        url = f\"{self.url}/{pdf_id}.{self.processed_file_format}\"\n        response = requests.get(url, headers=self._mathpix_headers)\n        return response.content.decode(\"utf-8\")\n\n    def clean_pdf(self, contents: str) -> str:\n        \"\"\"Clean the PDF file.\n\n        Args:\n            contents: a PDF file contents.\n\n        Returns:\n            the cleaned contents.\n        \"\"\"\n        contents = \"\\n\".join(\n            [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n        )\n        # replace \\section{Title} with # Title\n        contents = contents.replace(\"\\\\section{\", \"# \")\n        # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n        # http:// or https:// followed by anything but a closing paren\n        url_regex = \"http[s]?://[^)]+\"\n        markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n        contents = (\n            contents.replace(r\"\\$\", \"$\")\n            .replace(r\"\\%\", \"%\")\n            .replace(r\"\\(\", \"(\")\n            .replace(r\"\\)\", \")\")\n            .replace(\"$\\\\begin{array}\", \"\")\n            .replace(\"\\\\end{array}$\", \"\")\n            .replace(\"\\\\\\\\\", \"\")\n            .replace(\"\\\\text\", \"\")\n            .replace(\"}\", \"\")\n            .replace(\"{\", \"\")\n            .replace(\"\\\\mathrm\", \"\")\n        )\n        contents = re.sub(markup_regex, \"\", contents)\n        return contents\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            content = kwargs[\"response_content\"]\n        else:\n            # call original API\n            pdf_id = self.send_pdf(file_path)\n            content = self.get_processed_pdf(pdf_id)\n\n        if self.should_clean_pdf:\n            content = self.clean_pdf(content)\n        tables, texts = parse_markdown_text_to_tables(content)\n        documents = []\n        for table in tables:\n            text = strip_special_chars_markdown(table)\n            metadata = {\n                \"table_origin\": table,\n                \"type\": \"table\",\n            }\n            if extra_info:\n                metadata.update(extra_info)\n            documents.append(\n                Document(\n                    text=text,\n                    metadata=metadata,\n                    metadata_template=\"\",\n                    metadata_seperator=\"\",\n                )\n            )\n\n        for text in texts:\n            metadata = {\"source\": file_path.name, \"type\": \"text\"}\n            documents.append(Document(text=text, metadata=metadata))\n\n        return documents\n
    "},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.wait_for_processing","title":"wait_for_processing","text":"
    wait_for_processing(pdf_id)\n

    Wait for processing to complete.

Parameters:

pdf_id (str): a PDF id. Required.

Returns: None

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
    def wait_for_processing(self, pdf_id: str) -> None:\n    \"\"\"Wait for processing to complete.\n\n    Args:\n        pdf_id: a PDF id.\n\n    Returns: None\n    \"\"\"\n    url = self.url + \"/\" + pdf_id\n    for _ in range(0, self.max_wait_time_seconds, 5):\n        response = requests.get(url, headers=self._mathpix_headers)\n        response_data = response.json()\n        status = response_data.get(\"status\", None)\n\n        if status == \"completed\":\n            return\n        elif status == \"error\":\n            raise ValueError(\"Unable to retrieve PDF from Mathpix\")\n        else:\n            print(response_data)\n            print(url)\n            time.sleep(5)\n    raise TimeoutError\n
    "},{"location":"reference/loaders/mathpix_loader/#loaders.mathpix_loader.MathpixPDFReader.clean_pdf","title":"clean_pdf","text":"
    clean_pdf(contents)\n

    Clean the PDF file.

Parameters:

contents (str): the PDF file contents. Required.

Returns:

The cleaned contents.

    Source code in libs/kotaemon/kotaemon/loaders/mathpix_loader.py
def clean_pdf(self, contents: str) -> str:\n    \"\"\"Clean the PDF file.\n\n    Args:\n        contents: a PDF file contents.\n\n    Returns:\n        the cleaned contents.\n    \"\"\"\n    contents = \"\\n\".join(\n        [line for line in contents.split(\"\\n\") if not line.startswith(\"![]\")]\n    )\n    # replace \\section{Title} with # Title\n    contents = contents.replace(\"\\\\section{\", \"# \")\n    # replace the \"\\\" slash that Mathpix adds to escape $, %, (, etc.\n\n    # http:// or https:// followed by anything but a closing paren\n    url_regex = \"http[s]?://[^)]+\"\n    markup_regex = r\"\\[]\\(\\s*({0})\\s*\\)\".format(url_regex)\n    contents = (\n        contents.replace(r\"\\$\", \"$\")\n        .replace(r\"\\%\", \"%\")\n        .replace(r\"\\(\", \"(\")\n        .replace(r\"\\)\", \")\")\n        .replace(\"$\\\\begin{array}\", \"\")\n        .replace(\"\\\\end{array}$\", \"\")\n        .replace(\"\\\\\\\\\", \"\")\n        .replace(\"\\\\text\", \"\")\n        .replace(\"}\", \"\")\n        .replace(\"{\", \"\")\n        .replace(\"\\\\mathrm\", \"\")\n    )\n    contents = re.sub(markup_regex, \"\", contents)\n    return contents\n
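A worked example of the cleaning rules above (the input string is made up; `clean_pdf` is pure string processing, so no API call is involved):

reader = MathpixPDFReader()  # API keys default to \"empty\" when unset\nraw = \"\\\\section{Introduction}\\n![](https://example.com/fig.png)\\nCost: \\\\$5 and 10\\\\%\"\nprint(reader.clean_pdf(raw))\n# # Introduction\n# Cost: $5 and 10%\n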
    "},{"location":"reference/loaders/ocr_loader/","title":"Ocr Loader","text":""},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader","title":"OCRReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with high focus on table extraction

    Example
    >> from kotaemon.loaders import OCRReader\n>> reader = OCRReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

Parameters:

endpoint (Optional[str]): URL to the FullOCR endpoint. If not provided, looks for the environment variable OCR_READER_ENDPOINT or uses the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/). Default: None.
use_ocr: whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. Default: True.

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class OCRReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from kotaemon.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n        self.use_ocr = use_ocr\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        debug_path = kwargs.pop(\"debug_path\", None)\n        artifact_path = kwargs.pop(\"artifact_path\", None)\n\n        # read PDF through normal reader (unstructured)\n        pdf_page_items = read_pdf_unstructured(file_path)\n        # merge PDF text output with OCR output\n        tables, texts = parse_ocr_output(\n            ocr_results,\n            pdf_page_items,\n            debug_path=debug_path,\n            artifact_path=artifact_path,\n        )\n        extra_info = extra_info or {}\n\n        # create output Document with metadata from table\n        documents = [\n            Document(\n                text=strip_special_chars_markdown(table_text),\n                metadata={\n                    \"table_origin\": table_text,\n                    \"type\": \"table\",\n                    \"page_label\": page_id + 1,\n                    **extra_info,\n                },\n                metadata_template=\"\",\n                metadata_seperator=\"\",\n            )\n            for page_id, table_text in tables\n        ]\n        # create Document from non-table text\n        documents.extend(\n            [\n                Document(\n                    text=non_table_text,\n                    metadata={\"page_label\": page_id + 1, **extra_info},\n                )\n                for page_id, non_table_text in texts\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.OCRReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

Parameters:

file_path (Path): Path to PDF file. Required.
debug_path (Path): Path to store debug image output. Optional.
artifact_path (Path): Path to the OCR endpoint's artifacts directory. Optional.

Returns:

List[Document]: list of documents extracted from the PDF file.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=not self.use_ocr\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    debug_path = kwargs.pop(\"debug_path\", None)\n    artifact_path = kwargs.pop(\"artifact_path\", None)\n\n    # read PDF through normal reader (unstructured)\n    pdf_page_items = read_pdf_unstructured(file_path)\n    # merge PDF text output with OCR output\n    tables, texts = parse_ocr_output(\n        ocr_results,\n        pdf_page_items,\n        debug_path=debug_path,\n        artifact_path=artifact_path,\n    )\n    extra_info = extra_info or {}\n\n    # create output Document with metadata from table\n    documents = [\n        Document(\n            text=strip_special_chars_markdown(table_text),\n            metadata={\n                \"table_origin\": table_text,\n                \"type\": \"table\",\n                \"page_label\": page_id + 1,\n                **extra_info,\n            },\n            metadata_template=\"\",\n            metadata_seperator=\"\",\n        )\n        for page_id, table_text in tables\n    ]\n    # create Document from non-table text\n    documents.extend(\n        [\n            Document(\n                text=non_table_text,\n                metadata={\"page_label\": page_id + 1, **extra_info},\n            )\n            for page_id, non_table_text in texts\n        ]\n    )\n\n    return documents\n
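Because `load_data` accepts a `response_content` override, a previously saved OCR payload can be replayed without re-calling the endpoint; `cached_result` below is a stand-in for such a payload:

reader = OCRReader()  # falls back to OCR_READER_ENDPOINT or the default endpoint\ndocs = reader.load_data(Path(\"doc.pdf\"), response_content=cached_result)\n# tables and plain text are distinguishable via metadata\ntables = [d for d in docs if d.metadata.get(\"type\") == \"table\"]\n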
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader","title":"ImageReader","text":"

    Bases: BaseReader

    Read PDF using OCR, with high focus on table extraction

    Example
>> from kotaemon.loaders import ImageReader\n>> reader = ImageReader()\n>> documents = reader.load_data(\"path/to/pdf\")\n

Parameters:

endpoint (Optional[str]): URL to the FullOCR endpoint. If not provided, looks for the environment variable OCR_READER_ENDPOINT or uses the default kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT (http://127.0.0.1:8000/v2/ai/infer/). Default: None.
use_ocr: whether to use OCR to read text (e.g. from images, tables) in the PDF. If False, only the table and text within table cells will be extracted. Required.

Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    class ImageReader(BaseReader):\n    \"\"\"Read PDF using OCR, with high focus on table extraction\n\n    Example:\n        ```python\n        >> from knowledgehub.loaders import OCRReader\n        >> reader = OCRReader()\n        >> documents = reader.load_data(\"path/to/pdf\")\n        ```\n\n    Args:\n        endpoint: URL to FullOCR endpoint. If not provided, will look for\n            environment variable `OCR_READER_ENDPOINT` or use the default\n            `knowledgehub.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`\n            (http://127.0.0.1:8000/v2/ai/infer/)\n        use_ocr: whether to use OCR to read text (e.g: from images, tables) in the PDF\n            If False, only the table and text within table cells will be extracted.\n    \"\"\"\n\n    def __init__(self, endpoint: Optional[str] = None):\n        \"\"\"Init the OCR reader with OCR endpoint (FullOCR pipeline)\"\"\"\n        super().__init__()\n        self.ocr_endpoint = endpoint or os.getenv(\n            \"OCR_READER_ENDPOINT\", DEFAULT_OCR_ENDPOINT\n        )\n\n    def load_data(\n        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n    ) -> List[Document]:\n        \"\"\"Load data using OCR reader\n\n        Args:\n            file_path (Path): Path to PDF file\n            debug_path (Path): Path to store debug image output\n            artifact_path (Path): Path to OCR endpoints artifacts directory\n\n        Returns:\n            List[Document]: list of documents extracted from the PDF file\n        \"\"\"\n        file_path = Path(file_path).resolve()\n\n        # call the API from FullOCR endpoint\n        if \"response_content\" in kwargs:\n            # overriding response content if specified\n            ocr_results = kwargs[\"response_content\"]\n        else:\n            # call original API\n            resp = tenacious_api_post(\n                url=self.ocr_endpoint, file_path=file_path, table_only=False\n            )\n            ocr_results = resp.json()[\"result\"]\n\n        extra_info = extra_info or {}\n        result = []\n        for ocr_result in ocr_results:\n            result.append(\n                Document(\n                    content=ocr_result[\"csv_string\"],\n                    metadata=extra_info,\n                )\n            )\n\n        return result\n
    "},{"location":"reference/loaders/ocr_loader/#loaders.ocr_loader.ImageReader.load_data","title":"load_data","text":"
    load_data(file_path, extra_info=None, **kwargs)\n

    Load data using OCR reader

Parameters:

file_path (Path): Path to PDF file. Required.
debug_path (Path): Path to store debug image output. Optional.
artifact_path (Path): Path to the OCR endpoint's artifacts directory. Optional.

Returns:

List[Document]: list of documents extracted from the PDF file.

    Source code in libs/kotaemon/kotaemon/loaders/ocr_loader.py
    def load_data(\n    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs\n) -> List[Document]:\n    \"\"\"Load data using OCR reader\n\n    Args:\n        file_path (Path): Path to PDF file\n        debug_path (Path): Path to store debug image output\n        artifact_path (Path): Path to OCR endpoints artifacts directory\n\n    Returns:\n        List[Document]: list of documents extracted from the PDF file\n    \"\"\"\n    file_path = Path(file_path).resolve()\n\n    # call the API from FullOCR endpoint\n    if \"response_content\" in kwargs:\n        # overriding response content if specified\n        ocr_results = kwargs[\"response_content\"]\n    else:\n        # call original API\n        resp = tenacious_api_post(\n            url=self.ocr_endpoint, file_path=file_path, table_only=False\n        )\n        ocr_results = resp.json()[\"result\"]\n\n    extra_info = extra_info or {}\n    result = []\n    for ocr_result in ocr_results:\n        result.append(\n            Document(\n                content=ocr_result[\"csv_string\"],\n                metadata=extra_info,\n            )\n        )\n\n    return result\n
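A minimal sketch (the endpoint URL and file path are assumptions, and `content` is assumed to be exposed as the field the Document was constructed with); each OCR result becomes one Document holding the returned CSV string:

reader = ImageReader(endpoint=\"http://127.0.0.1:8000/v2/ai/infer/\")\ndocs = reader.load_data(Path(\"scan.png\"))\nprint(docs[0].content)  # the csv_string field of the first OCR result\n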
    "},{"location":"reference/loaders/pdf_loader/","title":"Pdf Loader","text":""},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader","title":"PDFThumbnailReader","text":"

    Bases: PDFReader

    PDF parser with thumbnail for each page.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    class PDFThumbnailReader(PDFReader):\n    \"\"\"PDF parser with thumbnail for each page.\"\"\"\n\n    def __init__(self) -> None:\n        \"\"\"\n        Initialize PDFReader.\n        \"\"\"\n        super().__init__(return_full_document=False)\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        fs: Optional[AbstractFileSystem] = None,\n    ) -> List[Document]:\n        \"\"\"Parse file.\"\"\"\n        documents = super().load_data(file, extra_info, fs)\n\n        page_numbers_str = []\n        filtered_docs = []\n        is_int_page_number: dict[str, bool] = {}\n\n        for doc in documents:\n            if \"page_label\" in doc.metadata:\n                page_num_str = doc.metadata[\"page_label\"]\n                page_numbers_str.append(page_num_str)\n                try:\n                    _ = int(page_num_str)\n                    is_int_page_number[page_num_str] = True\n                    filtered_docs.append(doc)\n                except ValueError:\n                    is_int_page_number[page_num_str] = False\n                    continue\n\n        documents = filtered_docs\n        page_numbers = list(range(len(page_numbers_str)))\n\n        print(\"Page numbers:\", len(page_numbers))\n        page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n        documents.extend(\n            [\n                Document(\n                    text=\"Page thumbnail\",\n                    metadata={\n                        \"image_origin\": page_thumbnail,\n                        \"type\": \"thumbnail\",\n                        \"page_label\": page_number,\n                        **(extra_info if extra_info is not None else {}),\n                    },\n                )\n                for (page_thumbnail, page_number) in zip(\n                    page_thumbnails, page_numbers_str\n                )\n                if is_int_page_number[page_number]\n            ]\n        )\n\n        return documents\n
    "},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.PDFThumbnailReader.load_data","title":"load_data","text":"
    load_data(file, extra_info=None, fs=None)\n

    Parse file.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
    def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    fs: Optional[AbstractFileSystem] = None,\n) -> List[Document]:\n    \"\"\"Parse file.\"\"\"\n    documents = super().load_data(file, extra_info, fs)\n\n    page_numbers_str = []\n    filtered_docs = []\n    is_int_page_number: dict[str, bool] = {}\n\n    for doc in documents:\n        if \"page_label\" in doc.metadata:\n            page_num_str = doc.metadata[\"page_label\"]\n            page_numbers_str.append(page_num_str)\n            try:\n                _ = int(page_num_str)\n                is_int_page_number[page_num_str] = True\n                filtered_docs.append(doc)\n            except ValueError:\n                is_int_page_number[page_num_str] = False\n                continue\n\n    documents = filtered_docs\n    page_numbers = list(range(len(page_numbers_str)))\n\n    print(\"Page numbers:\", len(page_numbers))\n    page_thumbnails = get_page_thumbnails(file, page_numbers)\n\n    documents.extend(\n        [\n            Document(\n                text=\"Page thumbnail\",\n                metadata={\n                    \"image_origin\": page_thumbnail,\n                    \"type\": \"thumbnail\",\n                    \"page_label\": page_number,\n                    **(extra_info if extra_info is not None else {}),\n                },\n            )\n            for (page_thumbnail, page_number) in zip(\n                page_thumbnails, page_numbers_str\n            )\n            if is_int_page_number[page_number]\n        ]\n    )\n\n    return documents\n
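A short sketch separating the text Documents from the appended thumbnails (`paper.pdf` is a placeholder):

reader = PDFThumbnailReader()\ndocs = reader.load_data(Path(\"paper.pdf\"))\nthumbs = [d for d in docs if d.metadata.get(\"type\") == \"thumbnail\"]\ntexts = [d for d in docs if d.metadata.get(\"type\") != \"thumbnail\"]\nprint(len(texts), len(thumbs))  # thumbnails carry base64 images in image_origin\n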
    "},{"location":"reference/loaders/pdf_loader/#loaders.pdf_loader.get_page_thumbnails","title":"get_page_thumbnails","text":"
    get_page_thumbnails(file_path, pages, dpi=80)\n

    Get image thumbnails of the pages in the PDF file.

Parameters:

file_path (Path): path to the PDF file. Required.
pages (list[int]): list of page numbers to extract. Required.
dpi (int): resolution used when rendering thumbnails. Default: 80.

Returns:

list[str]: list of base64-encoded page thumbnails.

    Source code in libs/kotaemon/kotaemon/loaders/pdf_loader.py
def get_page_thumbnails(\n    file_path: Path, pages: list[int], dpi: int = 80\n) -> List[str]:\n    \"\"\"Get image thumbnails of the pages in the PDF file.\n\n    Args:\n        file_path (Path): path to the PDF file\n        pages (list[int]): list of page numbers to extract\n\n    Returns:\n        list[str]: list of base64-encoded page thumbnails\n    \"\"\"\n\n    img: Image.Image\n    suffix = file_path.suffix.lower()\n    assert suffix == \".pdf\", \"This function only supports PDF files.\"\n    try:\n        import fitz\n    except ImportError:\n        raise ImportError(\"Please install PyMuPDF: 'pip install PyMuPDF'\")\n\n    doc = fitz.open(file_path)\n\n    output_imgs = []\n    for page_number in pages:\n        page = doc.load_page(page_number)\n        pm = page.get_pixmap(dpi=dpi)\n        img = Image.frombytes(\"RGB\", [pm.width, pm.height], pm.samples)\n        # thumbnails are returned as base64 strings, not PIL images\n        output_imgs.append(convert_image_to_base64(img))\n\n    return output_imgs\n
    "},{"location":"reference/loaders/txt_loader/","title":"Txt Loader","text":""},{"location":"reference/loaders/unstructured_loader/","title":"Unstructured Loader","text":"

    Unstructured file reader.

    A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.

To use the .doc and .xls parsers, install:

sudo apt-get install -y libmagic-dev poppler-utils libreoffice
pip install xlrd

    "},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader","title":"UnstructuredReader","text":"

    Bases: BaseReader

    General unstructured text reader for a variety of files.

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
class UnstructuredReader(BaseReader):\n    \"\"\"General unstructured text reader for a variety of files.\"\"\"\n\n    def __init__(self, *args: Any, **kwargs: Any) -> None:\n        \"\"\"Init params.\"\"\"\n        super().__init__(*args)  # not passing kwargs to parent because it cannot accept them\n\n        self.api = False  # we default to local\n        if \"url\" in kwargs:\n            self.server_url = str(kwargs[\"url\"])\n            self.api = True  # if url was set, switch to api\n        else:\n            self.server_url = \"http://localhost:8000\"\n\n        if \"api\" in kwargs:\n            self.api = kwargs[\"api\"]\n\n        self.api_key = \"\"\n        if \"api_key\" in kwargs:\n            self.api_key = kwargs[\"api_key\"]\n\n    \"\"\" Loads data using Unstructured.io\n\n        Depending on the construction, if url is set or api = True,\n        it'll parse the file using an API call, else parse it locally.\n        additional_metadata is extended by the returned metadata if\n        split_documents is True.\n\n        Returns a list of documents.\n    \"\"\"\n\n    def load_data(\n        self,\n        file: Path,\n        extra_info: Optional[Dict] = None,\n        split_documents: Optional[bool] = False,\n        **kwargs,\n    ) -> List[Document]:\n        \"\"\"If api is set, parse through api\"\"\"\n        file_path_str = str(file)\n        if self.api:\n            from unstructured.partition.api import partition_via_api\n\n            elements = partition_via_api(\n                filename=file_path_str,\n                api_key=self.api_key,\n                api_url=self.server_url + \"/general/v0/general\",\n            )\n        else:\n            \"\"\"Parse file locally\"\"\"\n            from unstructured.partition.auto import partition\n\n            elements = partition(filename=file_path_str)\n\n        \"\"\" Process elements \"\"\"\n        docs = []\n        file_name = Path(file).name\n        file_path = str(Path(file).resolve())\n        if split_documents:\n            for node in elements:\n                metadata = {\"file_name\": file_name, \"file_path\": file_path}\n                if hasattr(node, \"metadata\"):\n                    \"\"\"Load metadata fields\"\"\"\n                    for field, val in vars(node.metadata).items():\n                        if field == \"_known_field_names\":\n                            continue\n                        # removing coordinates because it does not serialize\n                        # and we don't want to bother with it\n                        if field == \"coordinates\":\n                            continue\n                        # removing because it might cause interference\n                        if field == \"parent_id\":\n                            continue\n                        metadata[field] = val\n\n                if extra_info is not None:\n                    metadata.update(extra_info)\n\n                metadata[\"file_name\"] = file_name\n                docs.append(Document(text=node.text, metadata=metadata))\n\n        else:\n            text_chunks = [\" \".join(str(el).split()) for el in elements]\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            # Create a single document by joining all the texts\n            docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n        return docs\n
    "},{"location":"reference/loaders/unstructured_loader/#loaders.unstructured_loader.UnstructuredReader.load_data","title":"load_data","text":"
    load_data(\n    file, extra_info=None, split_documents=False, **kwargs\n)\n

    If api is set, parse through api

    Source code in libs/kotaemon/kotaemon/loaders/unstructured_loader.py
def load_data(\n    self,\n    file: Path,\n    extra_info: Optional[Dict] = None,\n    split_documents: Optional[bool] = False,\n    **kwargs,\n) -> List[Document]:\n    \"\"\"If api is set, parse through api\"\"\"\n    file_path_str = str(file)\n    if self.api:\n        from unstructured.partition.api import partition_via_api\n\n        elements = partition_via_api(\n            filename=file_path_str,\n            api_key=self.api_key,\n            api_url=self.server_url + \"/general/v0/general\",\n        )\n    else:\n        \"\"\"Parse file locally\"\"\"\n        from unstructured.partition.auto import partition\n\n        elements = partition(filename=file_path_str)\n\n    \"\"\" Process elements \"\"\"\n    docs = []\n    file_name = Path(file).name\n    file_path = str(Path(file).resolve())\n    if split_documents:\n        for node in elements:\n            metadata = {\"file_name\": file_name, \"file_path\": file_path}\n            if hasattr(node, \"metadata\"):\n                \"\"\"Load metadata fields\"\"\"\n                for field, val in vars(node.metadata).items():\n                    if field == \"_known_field_names\":\n                        continue\n                    # removing coordinates because it does not serialize\n                    # and we don't want to bother with it\n                    if field == \"coordinates\":\n                        continue\n                    # removing because it might cause interference\n                    if field == \"parent_id\":\n                        continue\n                    metadata[field] = val\n\n            if extra_info is not None:\n                metadata.update(extra_info)\n\n            metadata[\"file_name\"] = file_name\n            docs.append(Document(text=node.text, metadata=metadata))\n\n    else:\n        text_chunks = [\" \".join(str(el).split()) for el in elements]\n        metadata = {\"file_name\": file_name, \"file_path\": file_path}\n\n        if extra_info is not None:\n            metadata.update(extra_info)\n\n        # Create a single document by joining all the texts\n        docs.append(Document(text=\"\\n\\n\".join(text_chunks), metadata=metadata))\n\n    return docs\n
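A sketch of both modes; the API URL and key are placeholders:

# local parsing (the default)\nreader = UnstructuredReader()\ndocs = reader.load_data(Path(\"report.docx\"), split_documents=True)\n\n# parse through a hosted Unstructured API instead\napi_reader = UnstructuredReader(url=\"https://api.example.com\", api_key=\"YOUR_KEY\")\ndocs = api_reader.load_data(Path(\"report.docx\"))\n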
    "},{"location":"reference/loaders/utils/","title":"Utils","text":""},{"location":"reference/loaders/utils/adobe/","title":"Adobe","text":""},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.request_adobe_service","title":"request_adobe_service","text":"
    request_adobe_service(file_path, output_path='')\n

Main function to call the Adobe service and unzip the results.

Parameters:

file_path (str): path to the PDF file. Required.
output_path (str): path to store the results. Default: ''.

Returns:

output_path (str): path to the results.

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
def request_adobe_service(file_path: str, output_path: str = \"\") -> str:\n    \"\"\"Main function to call the adobe service, and unzip the results.\n    Args:\n        file_path (str): path to the pdf file\n        output_path (str): path to store the results\n\n    Returns:\n        output_path (str): path to the results\n\n    \"\"\"\n    try:\n        from adobe.pdfservices.operation.auth.credentials import Credentials\n        from adobe.pdfservices.operation.exception.exceptions import (\n            SdkException,\n            ServiceApiException,\n            ServiceUsageException,\n        )\n        from adobe.pdfservices.operation.execution_context import ExecutionContext\n        from adobe.pdfservices.operation.io.file_ref import FileRef\n        from adobe.pdfservices.operation.pdfops.extract_pdf_operation import (\n            ExtractPDFOperation,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import (  # noqa: E501\n            ExtractElementType,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import (  # noqa: E501\n            ExtractPDFOptions,\n        )\n        from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import (  # noqa: E501\n            ExtractRenditionsElementType,\n        )\n    except ImportError:\n        raise ImportError(\n            \"pdfservices-sdk is not installed. \"\n            \"Please install it by running `pip install pdfservices-sdk\"\n            \"@git+https://github.com/niallcm/pdfservices-python-sdk.git\"\n            \"@bump-and-unfreeze-requirements`\"\n        )\n\n    if not output_path:\n        output_path = tempfile.mkdtemp()\n\n    try:\n        # Initial setup, create credentials instance.\n        credentials = (\n            Credentials.service_principal_credentials_builder()\n            .with_client_id(config(\"PDF_SERVICES_CLIENT_ID\", default=\"\"))\n            .with_client_secret(config(\"PDF_SERVICES_CLIENT_SECRET\", default=\"\"))\n            .build()\n        )\n\n        # Create an ExecutionContext using credentials\n        # and create a new operation instance.\n        execution_context = ExecutionContext.create(credentials)\n        extract_pdf_operation = ExtractPDFOperation.create_new()\n\n        # Set operation input from a source file.\n        source = FileRef.create_from_local_file(file_path)\n        extract_pdf_operation.set_input(source)\n\n        # Build ExtractPDF options and set them into the operation\n        extract_pdf_options: ExtractPDFOptions = (\n            ExtractPDFOptions.builder()\n            .with_elements_to_extract(\n                [ExtractElementType.TEXT, ExtractElementType.TABLES]\n            )\n            .with_elements_to_extract_renditions(\n                [\n                    ExtractRenditionsElementType.TABLES,\n                    ExtractRenditionsElementType.FIGURES,\n                ]\n            )\n            .build()\n        )\n        extract_pdf_operation.set_options(extract_pdf_options)\n\n        # Execute the operation.\n        result: FileRef = extract_pdf_operation.execute(execution_context)\n\n        # Save the result to the specified location.\n        zip_file_path = os.path.join(\n            output_path, \"ExtractTextTableWithFigureTableRendition.zip\"\n        )\n        result.save_as(zip_file_path)\n        # Open the ZIP file\n        with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n            # Extract all contents to the destination folder\n            zip_ref.extractall(output_path)\n    except (ServiceApiException, ServiceUsageException, SdkException):\n        logging.exception(\"Exception encountered while executing operation\")\n\n    return output_path\n
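A hedged call sketch; `PDF_SERVICES_CLIENT_ID` and `PDF_SERVICES_CLIENT_SECRET` must already be available to `config`, and `contract.pdf` is a placeholder:

# results (JSON plus table/figure renditions) are unzipped into the returned directory\nout_dir = request_adobe_service(\"contract.pdf\")\nprint(out_dir)  # a temporary directory when output_path is not given\n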
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.make_markdown_table","title":"make_markdown_table","text":"
    make_markdown_table(table_as_list)\n

Convert a table from Python list representation to markdown format. The input list consists of the table's rows; the first row is the header.

Parameters:

table_as_list (List[str]): list of table rows. Example: [["Name", "Age", "Height"], ["Jake", 20, 5'10], ["Mary", 21, 5'7]]. Required.

Returns: markdown representation of the table

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def make_markdown_table(table_as_list: List[str]) -> str:\n    \"\"\"\n    Convert table from python list representation to markdown format.\n    The input list consists of rows of tables, the first row is the header.\n\n    Args:\n        table_as_list: list of table rows\n            Example: [[\"Name\", \"Age\", \"Height\"],\n                    [\"Jake\", 20, 5'10],\n                    [\"Mary\", 21, 5'7]]\n    Returns:\n        markdown representation of the table\n    \"\"\"\n    markdown = \"\\n\" + str(\"| \")\n\n    for e in table_as_list[0]:\n        to_add = \" \" + str(e) + str(\" |\")\n        markdown += to_add\n    markdown += \"\\n\"\n\n    markdown += \"| \"\n    for i in range(len(table_as_list[0])):\n        markdown += str(\"--- | \")\n    markdown += \"\\n\"\n\n    for entry in table_as_list[1:]:\n        markdown += str(\"| \")\n        for e in entry:\n            to_add = str(e) + str(\" | \")\n            markdown += to_add\n        markdown += \"\\n\"\n\n    return markdown + \"\\n\"\n
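A worked example (the exact whitespace inside the generated cells is not normalized):

rows = [[\"Name\", \"Age\"], [\"Jake\", 20], [\"Mary\", 21]]\nprint(make_markdown_table(rows))\n# | Name | Age |\n# | --- | --- |\n# | Jake | 20 |\n# | Mary | 21 |\n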
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_json","title":"load_json","text":"
    load_json(input_path)\n

    Load json file

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def load_json(input_path: Union[str | Path]) -> dict:\n    \"\"\"Load json file\"\"\"\n    with open(input_path, \"r\") as fi:\n        data = json.load(fi)\n\n    return data\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.load_excel","title":"load_excel","text":"
    load_excel(input_path)\n

    Load excel file and convert to markdown

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def load_excel(input_path: Union[str | Path]) -> str:\n    \"\"\"Load excel file and convert to markdown\"\"\"\n\n    df = pd.read_excel(input_path).fillna(\"\")\n    # Convert dataframe to a list of rows\n    row_list = [df.columns.values.tolist()] + df.values.tolist()\n\n    for item_id, item in enumerate(row_list[0]):\n        if \"Unnamed\" in item:\n            row_list[0][item_id] = \"\"\n\n    for row in row_list:\n        for item_id, item in enumerate(row):\n            row[item_id] = str(item).replace(\"_x000D_\", \" \").replace(\"\\n\", \" \").strip()\n\n    markdown_str = make_markdown_table(row_list)\n    return markdown_str\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.encode_image_base64","title":"encode_image_base64","text":"
    encode_image_base64(image_path)\n

    Convert image to base64

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def encode_image_base64(image_path: Union[str | Path]) -> Union[bytes, str]:\n    \"\"\"Convert image to base64\"\"\"\n\n    with open(image_path, \"rb\") as image_file:\n        return base64.b64encode(image_file.read()).decode(\"utf-8\")\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_table_paths","title":"parse_table_paths","text":"
    parse_table_paths(file_paths)\n

    Read the table stored in an excel file given the file path

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def parse_table_paths(file_paths: List[Path]) -> str:\n    \"\"\"Read the table stored in an excel file given the file path\"\"\"\n\n    content = \"\"\n    for path in file_paths:\n        if path.suffix == \".xlsx\":\n            content = load_excel(path)\n            break\n    return content\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.parse_figure_paths","title":"parse_figure_paths","text":"
    parse_figure_paths(file_paths)\n

    Read and convert an image to base64 given the image path

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:\n    \"\"\"Read and convert an image to base64 given the image path\"\"\"\n\n    content = \"\"\n    for path in file_paths:\n        if path.suffix == \".png\":\n            base64_image = encode_image_base64(path)\n            content = f\"data:image/png;base64,{base64_image}\"  # type: ignore\n            break\n    return content\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_single_figure_caption","title":"generate_single_figure_caption","text":"
    generate_single_figure_caption(vlm_endpoint, figure)\n

    Summarize a single figure using GPT-4V

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
    def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:\n    \"\"\"Summarize a single figure using GPT-4V\"\"\"\n    if figure:\n        output = generate_gpt4v(\n            endpoint=vlm_endpoint,\n            prompt=\"Provide a short 2 sentence summary of this image?\",\n            images=figure,\n        )\n        if \"sorry\" in output.lower():\n            output = \"\"\n    else:\n        output = \"\"\n    return output\n
    "},{"location":"reference/loaders/utils/adobe/#loaders.utils.adobe.generate_figure_captions","title":"generate_figure_captions","text":"
    generate_figure_captions(\n    vlm_endpoint, figures, max_figures_to_process\n)\n

Summarize several figures using GPT-4V.

Parameters:

vlm_endpoint (str): endpoint to the vision language model service. Required.
figures (List): list of base64 images. Required.
max_figures_to_process (int): the maximum number of figures to summarize; the rest are ignored. Required.

Returns:

results (List[str]): list of all figure captions, with empty strings for the ignored figures.

    Source code in libs/kotaemon/kotaemon/loaders/utils/adobe.py
def generate_figure_captions(\n    vlm_endpoint: str, figures: List, max_figures_to_process: int\n) -> List:\n    \"\"\"Summarize several figures using GPT-4V.\n    Args:\n        vlm_endpoint (str): endpoint to the vision language model service\n        figures (List): list of base64 images\n        max_figures_to_process (int): the maximum number of figures to summarize;\n        the rest are ignored.\n\n    Returns:\n        results (List[str]): list of all figure captions and empty strings for\n        ignored figures.\n    \"\"\"\n    to_gen_figures = figures[:max_figures_to_process]\n    other_figures = figures[max_figures_to_process:]\n\n    with ThreadPoolExecutor() as executor:\n        futures = [\n            # pass the arguments positionally instead of closing over the loop\n            # variable, which avoids the lambda late-binding pitfall\n            executor.submit(generate_single_figure_caption, vlm_endpoint, figure)\n            for figure in to_gen_figures\n        ]\n\n    results = [future.result() for future in futures]\n    return results + [\"\"] * len(other_figures)\n
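A usage sketch; `vlm_endpoint` and `figures` (a list of base64 images) are assumed to exist:

# caption at most 5 figures; the rest map to empty strings\ncaptions = generate_figure_captions(vlm_endpoint, figures, max_figures_to_process=5)\nassert len(captions) == len(figures)\n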
    "},{"location":"reference/loaders/utils/box/","title":"Box","text":""},{"location":"reference/loaders/utils/box/#loaders.utils.box.bbox_to_points","title":"bbox_to_points","text":"
    bbox_to_points(box)\n

    Convert bounding box to list of points

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def bbox_to_points(box: List[int]):\n    \"\"\"Convert bounding box to list of points\"\"\"\n    x1, y1, x2, y2 = box\n    return [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.points_to_bbox","title":"points_to_bbox","text":"
    points_to_bbox(points)\n

    Convert list of points to bounding box

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def points_to_bbox(points: List[Tuple[int, int]]):\n    \"\"\"Convert list of points to bounding box\"\"\"\n    all_x = [p[0] for p in points]\n    all_y = [p[1] for p in points]\n    return [min(all_x), min(all_y), max(all_x), max(all_y)]\n
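`bbox_to_points` and `points_to_bbox` are inverses of each other for axis-aligned boxes. A quick round-trip sketch:

```python
box = [10, 20, 110, 220]           # x1, y1, x2, y2
points = bbox_to_points(box)       # [(10, 20), (110, 20), (110, 220), (10, 220)]
assert points_to_bbox(points) == box
```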
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_points","title":"scale_points","text":"
    scale_points(points, scale_factor=1.0)\n

    Scale points by a scale factor

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def scale_points(points: List[Tuple[int, int]], scale_factor: float = 1.0):\n    \"\"\"Scale points by a scale factor\"\"\"\n    return [(int(pos[0] * scale_factor), int(pos[1] * scale_factor)) for pos in points]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.union_points","title":"union_points","text":"
    union_points(points)\n

    Return union bounding box of list of points

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def union_points(points: List[Tuple[int, int]]):\n    \"\"\"Return union bounding box of list of points\"\"\"\n    all_x = [p[0] for p in points]\n    all_y = [p[1] for p in points]\n    bbox = (min(all_x), min(all_y), max(all_x), max(all_y))\n    return bbox\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.scale_box","title":"scale_box","text":"
    scale_box(box, scale_factor=1.0)\n

    Scale box by a scale factor

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def scale_box(box: List[int], scale_factor: float = 1.0):\n    \"\"\"Scale box by a scale factor\"\"\"\n    return [int(pos * scale_factor) for pos in box]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_h","title":"box_h","text":"
    box_h(box)\n

    Return box height

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_h(box: List[int]):\n    \"Return box height\"\n    return box[3] - box[1]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_w","title":"box_w","text":"
    box_w(box)\n

    Return box width

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_w(box: List[int]):\n    \"Return box width\"\n    return box[2] - box[0]\n
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.box_area","title":"box_area","text":"
    box_area(box)\n

    Return box area

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def box_area(box: List[int]):\n    \"Return box area\"\n    x1, y1, x2, y2 = box\n    return (x2 - x1) * (y2 - y1)\n
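All the box helpers above operate on plain `[x1, y1, x2, y2]` lists. A quick sketch of their behavior:

```python
box = [10, 20, 110, 220]
assert box_w(box) == 100
assert box_h(box) == 200
assert box_area(box) == 20000
assert scale_box(box, scale_factor=0.5) == [5, 10, 55, 110]
```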
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.get_rect_iou","title":"get_rect_iou","text":"
    get_rect_iou(gt_box, pd_box, iou_type=0)\n

    Intersection over union on layout rectangle

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| gt_box | List[tuple] | a list containing the bounding box coordinates of the ground truth | required |
| pd_box | List[tuple] | a list containing the bounding box coordinates of the prediction | required |
| iou_type | int | 0: intersection / union, normal IOU; 1: intersection / min(areas), useful when boxes are under/over-segmented | 0 |

Input format for each box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)], i.e. the four corners listed clockwise from the top left.

Returns:

| Type | Description |
| --- | --- |
| int | Intersection over union value |

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:\n    \"\"\"Intersection over union on layout rectangle\n\n    Args:\n        gt_box: List[tuple]\n            A list contains bounding box coordinates of ground truth\n        pd_box: List[tuple]\n            A list contains bounding box coordinates of prediction\n        iou_type: int\n            0: intersection / union, normal IOU\n            1: intersection / min(areas), useful when boxes are under/over-segmented\n\n        Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n        Annotation for each element in bbox:\n        (x1, y1)        (x2, y1)\n            +-------+\n            |       |\n            |       |\n            +-------+\n        (x1, y2)        (x2, y2)\n\n    Returns:\n        Intersection over union value\n    \"\"\"\n\n    assert iou_type in [0, 1], \"Only support 0: origin iou, 1: intersection / min(area)\"\n\n    # determine the (x, y)-coordinates of the intersection rectangle\n    # gt_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n    # pd_box: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]\n    x_left = max(gt_box[0][0], pd_box[0][0])\n    y_top = max(gt_box[0][1], pd_box[0][1])\n    x_right = min(gt_box[2][0], pd_box[2][0])\n    y_bottom = min(gt_box[2][1], pd_box[2][1])\n\n    # compute the area of intersection rectangle\n    interArea = max(0, x_right - x_left) * max(0, y_bottom - y_top)\n\n    # compute the area of both the prediction and ground-truth\n    # rectangles\n    gt_area = (gt_box[2][0] - gt_box[0][0]) * (gt_box[2][1] - gt_box[0][1])\n    pd_area = (pd_box[2][0] - pd_box[0][0]) * (pd_box[2][1] - pd_box[0][1])\n\n    # compute the intersection over union by taking the intersection\n    # area and dividing it by the sum of prediction + ground-truth\n    # areas - the intersection area\n    if iou_type == 0:\n        iou = interArea / float(gt_area + pd_area - interArea)\n    elif iou_type == 1:\n        iou = interArea / max(min(gt_area, pd_area), 1)\n\n    # return the intersection over union value\n    return iou\n
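A worked example with two 10x10 boxes that overlap in a 5x5 region (both inputs use the four-corner format described above):

```python
gt = bbox_to_points([0, 0, 10, 10])   # ground truth box
pd = bbox_to_points([5, 5, 15, 15])   # predicted box

# intersection = 25, union = 100 + 100 - 25 = 175
iou = get_rect_iou(gt, pd, iou_type=0)       # 25 / 175 ~= 0.143
iou_min = get_rect_iou(gt, pd, iou_type=1)   # 25 / min(100, 100) = 0.25
```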
    "},{"location":"reference/loaders/utils/box/#loaders.utils.box.sort_funsd_reading_order","title":"sort_funsd_reading_order","text":"
    sort_funsd_reading_order(lines, box_key_name='box')\n

    Sort cell list to create the right reading order using their locations

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| lines | List[dict] | list of cells to sort | required |

Returns:

| Type | Description |
| --- | --- |
| List[dict] | a list of cell lists in the right reading order that contain no key or start with a key and contain no other key |

    Source code in libs/kotaemon/kotaemon/loaders/utils/box.py
    def sort_funsd_reading_order(lines: List[dict], box_key_name: str = \"box\"):\n    \"\"\"Sort cell list to create the right reading order using their locations\n\n    Args:\n        lines: list of cells to sort\n\n    Returns:\n        a list of cell lists in the right reading order that contain\n        no key or start with a key and contain no other key\n    \"\"\"\n    sorted_list = []\n\n    if len(lines) == 0:\n        return lines\n\n    while len(lines) > 1:\n        topleft_line = lines[0]\n        for line in lines[1:]:\n            topleft_line_pos = topleft_line[box_key_name]\n            topleft_line_center_y = (topleft_line_pos[1] + topleft_line_pos[3]) / 2\n            x1, y1, x2, y2 = line[box_key_name]\n            box_center_x = (x1 + x2) / 2\n            box_center_y = (y1 + y2) / 2\n            cell_h = y2 - y1\n            if box_center_y <= topleft_line_center_y - cell_h / 2:\n                topleft_line = line\n                continue\n            if (\n                box_center_x < topleft_line_pos[2]\n                and box_center_y < topleft_line_pos[3]\n            ):\n                topleft_line = line\n                continue\n        sorted_list.append(topleft_line)\n        lines.remove(topleft_line)\n\n    sorted_list.append(lines[0])\n\n    return sorted_list\n
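A small sketch: cells are re-ordered top-to-bottom, left-to-right based on their box centers (note that the function mutates the input list):

```python
cells = [
    {"text": "bottom", "box": [0, 50, 100, 60]},
    {"text": "top", "box": [0, 0, 100, 10]},
]
ordered = sort_funsd_reading_order(cells)
assert [c["text"] for c in ordered] == ["top", "bottom"]
```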
    "},{"location":"reference/loaders/utils/gpt4v/","title":"Gpt4V","text":""},{"location":"reference/loaders/utils/pdf_ocr/","title":"Pdf Ocr","text":""},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.read_pdf_unstructured","title":"read_pdf_unstructured","text":"
    read_pdf_unstructured(input_path)\n

    Convert PDF from specified path to list of text items with location information

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_path | Union[Path, str] | path to input file | required |

Returns:

| Type | Description |
| --- | --- |
| Dict | page_number: list of text boxes |

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def read_pdf_unstructured(input_path: Union[Path, str]):\n    \"\"\"Convert PDF from specified path to list of text items with\n    location information\n\n    Args:\n        input_path: path to input file\n\n    Returns:\n        Dict page_number: list of text boxes\n    \"\"\"\n    try:\n        from unstructured.partition.auto import partition\n    except ImportError as e:\n        raise ImportError(\n            \"Please install unstructured PDF reader `pip install unstructured[pdf]`: \"\n            f\"{e}\"\n        )\n\n    page_items = defaultdict(list)\n    items = partition(input_path)\n    for item in items:\n        page_number = item.metadata.page_number\n        bbox = points_to_bbox(item.metadata.coordinates.points)\n        coord_system = item.metadata.coordinates.system\n        max_w, max_h = coord_system.width, coord_system.height\n        page_items[page_number - 1].append(\n            {\n                \"text\": item.text,\n                \"box\": bbox,\n                \"location\": bbox_to_points(bbox),\n                \"page_shape\": (max_w, max_h),\n            }\n        )\n\n    return page_items\n
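A usage sketch (requires `pip install "unstructured[pdf]"`; the file name is hypothetical):

```python
pages = read_pdf_unstructured("report.pdf")   # hypothetical file
for page_id, boxes in pages.items():          # keys are 0-based page indices
    for item in boxes:
        print(page_id, item["box"], item["text"][:40])
```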
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_ocr_and_pdf_texts","title":"merge_ocr_and_pdf_texts","text":"
    merge_ocr_and_pdf_texts(\n    ocr_list, pdf_text_list, debug_info=None\n)\n

Merge PDF and OCR text using IOU overlapping location.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ocr_list | List[dict] | list of OCR items {"text", "box", "location"} | required |
| pdf_text_list | List[dict] | list of PDF items {"text", "box", "location"} | required |

Returns:

| Type | Description |
| --- | --- |
| List[dict] | combined list of PDF text and non-overlapping OCR text |

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def merge_ocr_and_pdf_texts(\n    ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None\n):\n    \"\"\"Merge PDF and OCR text using IOU overlapping location\n    Args:\n        ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n        pdf_text_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n    Returns:\n        Combined list of PDF text and non-overlap OCR text\n    \"\"\"\n    not_matched_ocr = []\n\n    # check for debug info\n    if debug_info is not None:\n        cv2, debug_im = debug_info\n\n    for ocr_item in ocr_list:\n        matched = False\n        for pdf_item in pdf_text_list:\n            if (\n                get_rect_iou(ocr_item[\"location\"], pdf_item[\"location\"], iou_type=1)\n                > IOU_THRES\n            ):\n                matched = True\n                break\n\n        color = (255, 0, 0)\n        if not matched:\n            ocr_item[\"matched\"] = False\n            not_matched_ocr.append(ocr_item)\n            color = (0, 255, 255)\n\n        if debug_info is not None:\n            cv2.rectangle(\n                debug_im,\n                ocr_item[\"location\"][0],\n                ocr_item[\"location\"][2],\n                color=color,\n                thickness=1,\n            )\n\n    if debug_info is not None:\n        for pdf_item in pdf_text_list:\n            cv2.rectangle(\n                debug_im,\n                pdf_item[\"location\"][0],\n                pdf_item[\"location\"][2],\n                color=(0, 255, 0),\n                thickness=2,\n            )\n\n    return pdf_text_list + not_matched_ocr\n
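A sketch of the merge behavior: PDF text wins wherever an OCR box overlaps it beyond the module-level `IOU_THRES` constant, and only non-overlapping OCR items are kept:

```python
pdf_items = [{"text": "Hello", "box": [0, 0, 50, 10],
              "location": bbox_to_points([0, 0, 50, 10])}]
ocr_items = [
    {"text": "Hello", "box": [1, 1, 49, 9],
     "location": bbox_to_points([1, 1, 49, 9])},    # overlaps the PDF item -> dropped
    {"text": "World", "box": [0, 20, 50, 30],
     "location": bbox_to_points([0, 20, 50, 30])},  # no overlap -> kept
]
merged = merge_ocr_and_pdf_texts(ocr_items, pdf_items)
# [m["text"] for m in merged] == ["Hello", "World"]
```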
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.merge_table_cell_and_ocr","title":"merge_table_cell_and_ocr","text":"
    merge_table_cell_and_ocr(\n    table_list, ocr_list, pdf_list, debug_info=None\n)\n

Merge table items with OCR text using IOU overlapping location.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| table_list | List[dict] | list of table items {"type": ("table", "cell", "text"), "text", "box", "location"} | required |
| ocr_list | List[dict] | list of OCR items {"text", "box", "location"} | required |
| pdf_list | List[dict] | list of PDF items {"text", "box", "location"} | required |

Returns:

| Name | Description |
| --- | --- |
| all_table_cells | list of tables, each table represented by a list of cells with combined text from OCR |
| not_matched_items | list of PDF text items not overlapped by any table region |

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
def merge_table_cell_and_ocr(\n    table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None\n):\n    \"\"\"Merge table items with OCR text using IOU overlapping location\n    Args:\n        table_list: List of table items\n            \"type\": (\"table\", \"cell\", \"text\"), \"text\", \"box\", \"location\"}\n        ocr_list: List of OCR items {\"text\", \"box\", \"location\"}\n        pdf_list: List of PDF items {\"text\", \"box\", \"location\"}\n\n    Returns:\n        all_table_cells: List of tables, each table is represented\n            by a list of cells with combined text from OCR\n        not_matched_items: List of PDF text which is not overlapped by table region\n    \"\"\"\n    # check for debug info\n    if debug_info is not None:\n        cv2, debug_im = debug_info\n\n    cell_list = [item for item in table_list if item[\"type\"] == \"cell\"]\n    table_list = [item for item in table_list if item[\"type\"] == \"table\"]\n\n    # sort table by area\n    table_list = sorted(table_list, key=lambda item: box_area(item[\"bbox\"]))\n\n    all_tables = []\n    matched_pdf_ids = []\n    matched_cell_ids = []\n\n    for table in table_list:\n        if debug_info is not None:\n            cv2.rectangle(\n                debug_im,\n                table[\"location\"][0],\n                table[\"location\"][2],\n                color=[0, 0, 255],\n                thickness=5,\n            )\n\n        cur_table_cells = []\n        for cell_id, cell in enumerate(cell_list):\n            if cell_id in matched_cell_ids:\n                continue\n\n            if get_rect_iou(\n                table[\"location\"], cell[\"location\"], iou_type=1\n            ) > IOU_THRES and box_area(table[\"bbox\"]) > box_area(cell[\"bbox\"]):\n                color = [128, 0, 128]\n                # cell matched to table\n                for item_list, item_type in [(pdf_list, \"pdf\"), (ocr_list, \"ocr\")]:\n                    cell[\"ocr\"] = []\n                    for item_id, item in enumerate(item_list):\n                        if item_type == \"pdf\" and item_id in matched_pdf_ids:\n                            continue\n                        if (\n                            get_rect_iou(item[\"location\"], cell[\"location\"], iou_type=1)\n                            > IOU_THRES\n                        ):\n                            cell[\"ocr\"].append(item)\n                            if item_type == \"pdf\":\n                                matched_pdf_ids.append(item_id)\n\n                    if len(cell[\"ocr\"]) > 0:\n                        # check if union of matched ocr does\n                        # not extend over cell boundary,\n                        # if True, continue to use OCR_list to match\n                        all_box_points_in_cell = []\n                        for item in cell[\"ocr\"]:\n                            all_box_points_in_cell.extend(item[\"location\"])\n                        union_box = union_points(all_box_points_in_cell)\n                        cell_okay = (\n                            box_h(union_box) <= box_h(cell[\"bbox\"]) * PADDING_THRES\n                            and box_w(union_box) <= box_w(cell[\"bbox\"]) * PADDING_THRES\n                        )\n                    else:\n                        cell_okay = False\n\n                    if cell_okay:\n                        if item_type == \"pdf\":\n                            color = [255, 0, 255]\n                        break\n\n                if debug_info is not None:\n                    cv2.rectangle(\n                        debug_im,\n                        cell[\"location\"][0],\n                        cell[\"location\"][2],\n                        color=color,\n                        thickness=3,\n                    )\n\n                matched_cell_ids.append(cell_id)\n                cur_table_cells.append(cell)\n\n        all_tables.append(cur_table_cells)\n\n    not_matched_items = [\n        item for _id, item in enumerate(pdf_list) if _id not in matched_pdf_ids\n    ]\n    if debug_info is not None:\n        for item in not_matched_items:\n            cv2.rectangle(\n                debug_im,\n                item[\"location\"][0],\n                item[\"location\"][2],\n                color=[128, 128, 128],\n                thickness=3,\n            )\n\n    return all_tables, not_matched_items\n
    "},{"location":"reference/loaders/utils/pdf_ocr/#loaders.utils.pdf_ocr.parse_ocr_output","title":"parse_ocr_output","text":"
    parse_ocr_output(\n    ocr_page_items,\n    pdf_page_items,\n    artifact_path=None,\n    debug_path=None,\n)\n

Main function to combine OCR output and PDF text to form a list of table / non-table regions.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ocr_page_items | List[dict] | list of OCR items by page | required |
| pdf_page_items | Dict[int, List[dict]] | dict of PDF texts (page number as key) | required |
| artifact_path | Optional[str] | path to the folder containing the page images; used together with debug_path | None |
| debug_path | Optional[str] | if specified, use OpenCV to plot a debug image and save it to debug_path | None |

    Source code in libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py
    def parse_ocr_output(\n    ocr_page_items: List[dict],\n    pdf_page_items: Dict[int, List[dict]],\n    artifact_path: Optional[str] = None,\n    debug_path: Optional[str] = None,\n):\n    \"\"\"Main function to combine OCR output and PDF text to\n    form list of table / non-table regions\n    Args:\n        ocr_page_items: List of OCR items by page\n        pdf_page_items: Dict of PDF texts (page number as key)\n        debug_path: If specified, use OpenCV to plot debug image and save to debug_path\n    \"\"\"\n    all_tables = []\n    all_texts = []\n\n    for page_id, page in enumerate(ocr_page_items):\n        ocr_list = page[\"json\"][\"ocr\"]\n        table_list = page[\"json\"][\"table\"]\n        page_shape = page[\"image_shape\"]\n        pdf_item_list = pdf_page_items[page_id]\n\n        # create bbox additional information\n        for item in ocr_list:\n            item[\"box\"] = points_to_bbox(item[\"location\"])\n\n        # re-scale pdf items according to new image size\n        for item in pdf_item_list:\n            scale_factor = page_shape[0] / item[\"page_shape\"][0]\n            item[\"box\"] = scale_box(item[\"box\"], scale_factor=scale_factor)\n            item[\"location\"] = scale_points(item[\"location\"], scale_factor=scale_factor)\n\n        # if using debug mode, openCV must be installed\n        if debug_path and artifact_path is not None:\n            try:\n                import cv2\n            except ImportError:\n                raise ImportError(\n                    \"Please install openCV first to use OCRReader debug mode\"\n                )\n            image_path = Path(artifact_path) / page[\"image\"]\n            image = cv2.imread(str(image_path))\n            debug_info = (cv2, image)\n        else:\n            debug_info = None\n\n        new_pdf_list = merge_ocr_and_pdf_texts(\n            ocr_list, pdf_item_list, debug_info=debug_info\n        )\n\n        # sort by reading order\n        ocr_list = sort_funsd_reading_order(ocr_list)\n        new_pdf_list = sort_funsd_reading_order(new_pdf_list)\n\n        all_table_cells, non_table_text_list = merge_table_cell_and_ocr(\n            table_list, ocr_list, new_pdf_list, debug_info=debug_info\n        )\n\n        table_texts = [table_cells_to_markdown(cells) for cells in all_table_cells]\n        all_tables.extend([(page_id, text) for text in table_texts])\n        all_texts.append(\n            (page_id, \" \".join(item[\"text\"] for item in non_table_text_list))\n        )\n\n        # export debug image to debug_path\n        if debug_path:\n            cv2.imwrite(str(Path(debug_path) / \"page_{}.png\".format(page_id)), image)\n\n    return all_tables, all_texts\n
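The expected input shape, reconstructed from the code above (a sketch under assumptions, not a spec; `pdf_page_items` typically comes from `read_pdf_unstructured`):

```python
ocr_page_items = [
    {
        "json": {
            "ocr": [{"text": "...", "location": [(0, 0), (50, 0), (50, 10), (0, 10)]}],
            "table": [],                 # table/cell items from the OCR service
        },
        "image_shape": (1654, 2339),     # rendered page size, hypothetical
        "image": "page_0.png",           # page image, used only in debug mode
    },
]
pdf_page_items = read_pdf_unstructured("report.pdf")  # hypothetical file
tables, texts = parse_ocr_output(ocr_page_items, pdf_page_items)
# tables: [(page_id, markdown_table_str), ...]
# texts:  [(page_id, concatenated_non_table_text), ...]
```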
    "},{"location":"reference/loaders/utils/table/","title":"Table","text":""},{"location":"reference/loaders/utils/table/#loaders.utils.table.check_col_conflicts","title":"check_col_conflicts","text":"
    check_col_conflicts(col_a, col_b, thres=0.15)\n

Check if two columns A and B have non-empty content in the same rows (to be used with merge_cols)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| col_a | List[str] | column A (list of str) | required |
| col_b | List[str] | column B (list of str) | required |
| thres | float | percentage of overlap allowed | 0.15 |

Returns: True if the number of overlapping rows is greater than the threshold.

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def check_col_conflicts(\n    col_a: List[str], col_b: List[str], thres: float = 0.15\n) -> bool:\n    \"\"\"Check if 2 columns A and B has non-empty content in the same row\n    (to be used with merge_cols)\n\n    Args:\n        col_a: column A (list of str)\n        col_b: column B (list of str)\n        thres: percentage of overlapping allowed\n    Returns:\n        if number of overlapping greater than threshold\n    \"\"\"\n    num_rows = len([cell for cell in col_a if cell])\n    assert len(col_a) == len(col_b)\n    conflict_count = 0\n    for cell_a, cell_b in zip(col_a, col_b):\n        if cell_a and cell_b:\n            conflict_count += 1\n    return conflict_count > num_rows * thres\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.merge_cols","title":"merge_cols","text":"
    merge_cols(col_a, col_b)\n

Merge columns A and B if they do not have conflicting rows

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| col_a | List[str] | column A (list of str) | required |
| col_b | List[str] | column B (list of str) | required |

Returns: the merged column.

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def merge_cols(col_a: List[str], col_b: List[str]) -> List[str]:\n    \"\"\"Merge column A and B if they do not have conflict rows\n\n    Args:\n        col_a: column A (list of str)\n        col_b: column B (list of str)\n    Returns:\n        merged column\n    \"\"\"\n    for r_id in range(len(col_a)):\n        if col_b[r_id]:\n            col_a[r_id] = col_a[r_id] + \" \" + col_b[r_id]\n    return col_a\n
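The two helpers work together: a pair of columns is merged only when they are mutually sparse. A quick sketch (note that `merge_cols` mutates `col_a` in place):

```python
col_a = ["x", "", "y", ""]
col_b = ["", "1", "", "2"]
assert not check_col_conflicts(col_a, col_b)          # no row has both cells filled
assert merge_cols(col_a, col_b) == ["x", " 1", "y", " 2"]
```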
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.add_index_col","title":"add_index_col","text":"
    add_index_col(csv_rows)\n

    Add index column as the first column of the table csv_rows

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| csv_rows | List[List[str]] | input table | required |

Returns: output table with an index column.

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def add_index_col(csv_rows: List[List[str]]) -> List[List[str]]:\n    \"\"\"Add index column as the first column of the table csv_rows\n\n    Args:\n        csv_rows: input table\n    Returns:\n        output table with index column\n    \"\"\"\n    new_csv_rows = [[\"row id\"] + [\"\"] * len(csv_rows[0])]\n    for r_id, row in enumerate(csv_rows):\n        new_csv_rows.append([str(r_id + 1)] + row)\n    return new_csv_rows\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.compress_csv","title":"compress_csv","text":"
    compress_csv(csv_rows)\n

    Compress table csv_rows by merging sparse columns (merge_cols)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| csv_rows | List[List[str]] | input table | required |

Returns: the compressed table.

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def compress_csv(csv_rows: List[List[str]]) -> List[List[str]]:\n    \"\"\"Compress table csv_rows by merging sparse columns (merge_cols)\n\n    Args:\n        csv_rows: input table\n    Returns:\n        output: compressed table\n    \"\"\"\n    csv_cols = [[r[c_id] for r in csv_rows] for c_id in range(len(csv_rows[0]))]\n    to_remove_col_ids = []\n    last_c_id = 0\n    for c_id in range(1, len(csv_cols)):\n        if not check_col_conflicts(csv_cols[last_c_id], csv_cols[c_id]):\n            to_remove_col_ids.append(c_id)\n            csv_cols[last_c_id] = merge_cols(csv_cols[last_c_id], csv_cols[c_id])\n        else:\n            last_c_id = c_id\n\n    csv_cols = [r for c_id, r in enumerate(csv_cols) if c_id not in to_remove_col_ids]\n    csv_rows = [[c[r_id] for c in csv_cols] for r_id in range(len(csv_cols[0]))]\n    return csv_rows\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.get_table_from_ocr","title":"get_table_from_ocr","text":"
    get_table_from_ocr(ocr_list, table_list)\n

Get the lists of text lines belonging to the table regions specified by table_list

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ocr_list | List[dict] | list of OCR output in Casia format (Flax) | required |
| table_list | List[dict] | list of table output in Casia format (Flax) | required |

Returns: a list with one entry per table region, each entry being the list of OCR text lines that fall inside that region.

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def get_table_from_ocr(ocr_list: List[dict], table_list: List[dict]):\n    \"\"\"Get list of text lines belong to table regions specified by table_list\n\n    Args:\n        ocr_list: list of OCR output in Casia format (Flax)\n        table_list: list of table output in Casia format (Flax)\n\n    Returns:\n        _type_: _description_\n    \"\"\"\n    table_texts = []\n    for table in table_list:\n        if table[\"type\"] != \"table\":\n            continue\n        cur_table_texts = []\n        for ocr in ocr_list:\n            _iou = get_rect_iou(table[\"location\"], ocr[\"location\"], iou_type=1)\n            if _iou > 0.8:\n                cur_table_texts.append(ocr[\"text\"])\n        table_texts.append(cur_table_texts)\n\n    return table_texts\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.make_markdown_table","title":"make_markdown_table","text":"
    make_markdown_table(array)\n

    Convert table rows in list format to markdown string

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| array | List[List[str]] | Python list with the rows of the table as lists, the first element being the header | required |

Example input:

    [["Name", "Age", "Height"],
    ["Jake", 20, 5'10],
    ["Mary", 21, 5'7]]

Returns: String to put into a .md file

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def make_markdown_table(array: List[List[str]]) -> str:\n    \"\"\"Convert table rows in list format to markdown string\n\n    Args:\n        Python list with rows of table as lists\n        First element as header.\n        Example Input:\n                [[\"Name\", \"Age\", \"Height\"],\n                [\"Jake\", 20, 5'10],\n                [\"Mary\", 21, 5'7]]\n    Returns:\n        String to put into a .md file\n    \"\"\"\n    array = compress_csv(array)\n    array = add_index_col(array)\n    markdown = \"\\n\" + str(\"| \")\n\n    for e in array[0]:\n        to_add = \" \" + str(e) + str(\" |\")\n        markdown += to_add\n    markdown += \"\\n\"\n\n    markdown += \"| \"\n    for i in range(len(array[0])):\n        markdown += str(\"--- | \")\n    markdown += \"\\n\"\n\n    for entry in array[1:]:\n        markdown += str(\"| \")\n        for e in entry:\n            to_add = str(e) + str(\" | \")\n            markdown += to_add\n        markdown += \"\\n\"\n\n    return markdown + \"\\n\"\n
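Calling the function on the docstring's example input produces (roughly) the following; note that `add_index_col` prepends a "row id" column, so the original header becomes data row 1:

```python
md = make_markdown_table([
    ["Name", "Age", "Height"],
    ["Jake", "20", "5'10"],
    ["Mary", "21", "5'7"],
])
# |  row id |  |  |  |
# | --- | --- | --- | --- |
# | 1 | Name | Age | Height |
# | 2 | Jake | 20 | 5'10 |
# | 3 | Mary | 21 | 5'7 |
```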
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_csv_string_to_list","title":"parse_csv_string_to_list","text":"
    parse_csv_string_to_list(csv_str)\n

    Convert CSV string to list of rows

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| csv_str | str | input CSV string | required |

Returns:

| Type | Description |
| --- | --- |
| List[List[str]] | Output table in list format |

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def parse_csv_string_to_list(csv_str: str) -> List[List[str]]:\n    \"\"\"Convert CSV string to list of rows\n\n    Args:\n        csv_str: input CSV string\n\n    Returns:\n        Output table in list format\n    \"\"\"\n    io = StringIO(csv_str)\n    csv_reader = csv.reader(io, delimiter=\",\")\n    rows = [row for row in csv_reader]\n    return rows\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.format_cell","title":"format_cell","text":"
    format_cell(cell, length_limit=None)\n

Format cell content by removing redundant characters and enforcing a length limit

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| cell | str | input cell text | required |
| length_limit | Optional[int] | limit of text length | None |

Returns:

| Type | Description |
| --- | --- |
| str | new cell text |

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def format_cell(cell: str, length_limit: Optional[int] = None) -> str:\n    \"\"\"Format cell content by remove redundant character and enforce length limit\n\n    Args:\n        cell: input cell text\n        length_limit: limit of text length.\n\n    Returns:\n        new cell text\n    \"\"\"\n    cell = cell.replace(\"\\n\", \" \")\n    if length_limit:\n        cell = cell[:length_limit]\n    return cell\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.extract_tables_from_csv_string","title":"extract_tables_from_csv_string","text":"
    extract_tables_from_csv_string(csv_content, table_texts)\n

Extract a list of tables from the FullOCR output (csv_content) using the specified table_texts

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| csv_content | str | CSV output from the FullOCR pipeline | required |
| table_texts | List[List[str]] | list of table texts extracted from get_table_from_ocr() | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[List[str], str] | list of tables and the non-table content |

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def extract_tables_from_csv_string(\n    csv_content: str, table_texts: List[List[str]]\n) -> Tuple[List[str], str]:\n    \"\"\"Extract list of table from FullOCR output\n    (csv_content) with the specified table_texts\n\n    Args:\n        csv_content: CSV output from FullOCR pipeline\n        table_texts: list of table texts extracted\n        from get_table_from_ocr()\n\n    Returns:\n        List of tables and non-text content\n    \"\"\"\n    rows = parse_csv_string_to_list(csv_content)\n    used_row_ids = []\n    table_csv_list = []\n    for table in table_texts:\n        cur_rows = []\n        for row_id, row in enumerate(rows):\n            scores = [\n                any(cell in cell_reference for cell in table)\n                for cell_reference in row\n                if cell_reference\n            ]\n            score = sum(scores) / len(scores)\n            if score > 0.5 and row_id not in used_row_ids:\n                used_row_ids.append(row_id)\n                cur_rows.append([format_cell(cell) for cell in row])\n        if cur_rows:\n            table_csv_list.append(make_markdown_table(cur_rows))\n        else:\n            print(\"table not matched\", table)\n\n    non_table_rows = [\n        row for row_id, row in enumerate(rows) if row_id not in used_row_ids\n    ]\n    non_table_text = \"\\n\".join(\n        \" \".join(format_cell(cell) for cell in row) for row in non_table_rows\n    )\n    return table_csv_list, non_table_text\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.strip_special_chars_markdown","title":"strip_special_chars_markdown","text":"
    strip_special_chars_markdown(text)\n

    Strip special characters from input text in markdown table format

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def strip_special_chars_markdown(text: str) -> str:\n    \"\"\"Strip special characters from input text in markdown table format\"\"\"\n    return text.replace(\"|\", \"\").replace(\":---:\", \"\").replace(\"---\", \"\")\n
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.parse_markdown_text_to_tables","title":"parse_markdown_text_to_tables","text":"
    parse_markdown_text_to_tables(text)\n

    Convert markdown text to list of non-table spans and table spans

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | input markdown text | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[List[str], List[str]] | list of table spans and list of non-table spans |

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
def parse_markdown_text_to_tables(text: str) -> Tuple[List[str], List[str]]:\n    \"\"\"Convert markdown text to list of non-table spans and table spans\n\n    Args:\n        text: input markdown text\n\n    Returns:\n        list of table spans and non-table spans\n    \"\"\"\n    # init empty tables and texts list\n    tables = []\n    texts = []\n\n    # split input by line break\n    lines = text.split(\"\\n\")\n    cur_table = []\n    cur_text: List[str] = []\n    for line in lines:\n        line = line.strip()\n        if line.startswith(\"|\"):\n            if len(cur_text) > 0:\n                texts.append(cur_text)\n                cur_text = []\n            cur_table.append(line)\n        else:\n            # add new table to the list\n            if len(cur_table) > 0:\n                tables.append(cur_table)\n                cur_table = []\n            cur_text.append(line)\n\n    # flush the trailing span: the loop above only flushes on transitions,\n    # so the final table or text block would otherwise be dropped\n    if len(cur_table) > 0:\n        tables.append(cur_table)\n    if len(cur_text) > 0:\n        texts.append(cur_text)\n\n    table_texts = [\"\\n\".join(table) for table in tables]\n    non_table_texts = [\"\\n\".join(text) for text in texts]\n    return table_texts, non_table_texts\n
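A quick sketch of the span splitting (the final span relies on the trailing flush noted in the listing above):

```python
md = "intro\n| a | b |\n| --- | --- |\n| 1 | 2 |\noutro"
tables, texts = parse_markdown_text_to_tables(md)
# tables == ["| a | b |\n| --- | --- |\n| 1 | 2 |"]
# texts == ["intro", "outro"]
```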
    "},{"location":"reference/loaders/utils/table/#loaders.utils.table.table_cells_to_markdown","title":"table_cells_to_markdown","text":"
    table_cells_to_markdown(cells)\n

    Convert list of cells with attached text to Markdown table

    Source code in libs/kotaemon/kotaemon/loaders/utils/table.py
    def table_cells_to_markdown(cells: List[dict]):\n    \"\"\"Convert list of cells with attached text to Markdown table\"\"\"\n\n    if len(cells) == 0:\n        return \"\"\n\n    all_row_ids = []\n    all_col_ids = []\n    for cell in cells:\n        all_row_ids.extend(cell[\"rows\"])\n        all_col_ids.extend(cell[\"columns\"])\n\n    num_rows, num_cols = max(all_row_ids) + 1, max(all_col_ids) + 1\n    table_rows = [[\"\" for c in range(num_cols)] for r in range(num_rows)]\n\n    # start filling in the grid\n    for cell in cells:\n        cell_text = \" \".join(item[\"text\"] for item in cell[\"ocr\"])\n        start_row_id, end_row_id = cell[\"rows\"]\n        start_col_id, end_col_id = cell[\"columns\"]\n        span_cell = end_row_id != start_row_id or end_col_id != start_col_id\n\n        # do not repeat long text in span cell to prevent context length issue\n        if span_cell and len(cell_text.replace(\" \", \"\")) < 20 and start_row_id > 0:\n            for row in range(start_row_id, end_row_id + 1):\n                for col in range(start_col_id, end_col_id + 1):\n                    table_rows[row][col] += cell_text + \" \"\n        else:\n            table_rows[start_row_id][start_col_id] += cell_text + \" \"\n\n    return make_markdown_table(table_rows)\n
    "},{"location":"reference/parsers/","title":"Parsers","text":""},{"location":"reference/parsers/#parsers.RegexExtractor","title":"RegexExtractor","text":"

    Bases: BaseComponent

    Simple class for extracting text from a document using a regex pattern.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pattern | List[str] | The regex pattern(s) to use. | required |
| output_map | dict | A mapping from extracted text to the desired output. Defaults to None. | required |

Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n    \"\"\"\n    Simple class for extracting text from a document using a regex pattern.\n\n    Args:\n        pattern (List[str]): The regex pattern(s) to use.\n        output_map (dict, optional): A mapping from extracted text to the\n            desired output. Defaults to None.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n    pattern: list[str]\n    output_map: dict[str, str] | Callable[[str], str] = Param(\n        default_callback=lambda *_: {}\n    )\n\n    def __init__(self, pattern: str | list[str], **kwargs):\n        if isinstance(pattern, str):\n            pattern = [pattern]\n        super().__init__(pattern=pattern, **kwargs)\n\n    @staticmethod\n    def run_raw_static(pattern: str, text: str) -> list[str]:\n        \"\"\"\n        Finds all non-overlapping occurrences of a pattern in a string.\n\n        Parameters:\n            pattern (str): The regular expression pattern to search for.\n            text (str): The input string to search in.\n\n        Returns:\n            List[str]: A list of all non-overlapping occurrences of the pattern in the\n                string.\n        \"\"\"\n        return re.findall(pattern, text)\n\n    @staticmethod\n    def map_output(text, output_map) -> str:\n        \"\"\"\n        Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n        Parameters:\n            text (str): The input text to be mapped.\n            output_map (dict): A dictionary containing mapping of input text to output\n                values.\n\n        Returns:\n            str: The corresponding value from the `output_map` if `text` is found in the\n                dictionary, otherwise returns the original `text`.\n        \"\"\"\n        if not output_map:\n            return text\n\n        if isinstance(output_map, dict):\n            return output_map.get(text, text)\n\n        return output_map(text)\n\n    def run_raw(self, text: str) -> ExtractorOutput:\n        \"\"\"\n        Matches the raw text against the pattern and runs the output mapping, returning\n            an instance of ExtractorOutput.\n\n        Args:\n            text (str): The raw text to be processed.\n\n        Returns:\n            ExtractorOutput: The processed output as a list of ExtractorOutput.\n        \"\"\"\n        output: list[str] = sum(\n            [self.run_raw_static(p, text) for p in self.pattern], []\n        )\n        output = [self.map_output(text, self.output_map) for text in output]\n\n        return ExtractorOutput(\n            text=output[0] if output else \"\",\n            matches=output,\n            metadata={\"origin\": \"RegexExtractor\"},\n        )\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[ExtractorOutput]:\n        \"\"\"Match the input against a pattern and return the output for each input\n\n        Parameters:\n            text: contains the input string to be processed\n\n        Returns:\n            A list contains the output ExtractorOutput for each input\n\n        Example:\n            ```pycon\n            >>> document1 = Document(...)\n            >>> document2 = Document(...)\n            >>> document_batch = [document1, document2]\n            >>> batch_output = self(document_batch)\n            >>> print(batch_output)\n            [output1_document1, output1_document2]\n            ```\n        \"\"\"\n        # TODO: this conversion seems common\n        input_: list[str] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in text:\n            if isinstance(item, str):\n                input_.append(item)\n            elif isinstance(item, Document):\n                input_.append(item.text)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        output = []\n        for each_input in input_:\n            output.append(self.run_raw(each_input))\n\n        return output\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod","text":"
    run_raw_static(pattern, text)\n

    Finds all non-overlapping occurrences of a pattern in a string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pattern | str | The regular expression pattern to search for. | required |
| text | str | The input string to search in. | required |

Returns:

| Type | Description |
| --- | --- |
| list[str] | A list of all non-overlapping occurrences of the pattern in the string. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n    \"\"\"\n    Finds all non-overlapping occurrences of a pattern in a string.\n\n    Parameters:\n        pattern (str): The regular expression pattern to search for.\n        text (str): The input string to search in.\n\n    Returns:\n        List[str]: A list of all non-overlapping occurrences of the pattern in the\n            string.\n    \"\"\"\n    return re.findall(pattern, text)\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.map_output","title":"map_output staticmethod","text":"
    map_output(text, output_map)\n

    Maps the given text to its corresponding value in the output_map dictionary.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The input text to be mapped. | required |
| output_map | dict | A dictionary containing the mapping of input text to output values. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | The corresponding value from the output_map if text is found in the dictionary, otherwise the original text. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef map_output(text, output_map) -> str:\n    \"\"\"\n    Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n    Parameters:\n        text (str): The input text to be mapped.\n        output_map (dict): A dictionary containing mapping of input text to output\n            values.\n\n    Returns:\n        str: The corresponding value from the `output_map` if `text` is found in the\n            dictionary, otherwise returns the original `text`.\n    \"\"\"\n    if not output_map:\n        return text\n\n    if isinstance(output_map, dict):\n        return output_map.get(text, text)\n\n    return output_map(text)\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run_raw","title":"run_raw","text":"
    run_raw(text)\n

Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The raw text to be processed. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| ExtractorOutput | ExtractorOutput | The processed output as an ExtractorOutput instance. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n    \"\"\"\n    Matches the raw text against the pattern and runs the output mapping, returning\n        an instance of ExtractorOutput.\n\n    Args:\n        text (str): The raw text to be processed.\n\n    Returns:\n        ExtractorOutput: The processed output as a list of ExtractorOutput.\n    \"\"\"\n    output: list[str] = sum(\n        [self.run_raw_static(p, text) for p in self.pattern], []\n    )\n    output = [self.map_output(text, self.output_map) for text in output]\n\n    return ExtractorOutput(\n        text=output[0] if output else \"\",\n        matches=output,\n        metadata={\"origin\": \"RegexExtractor\"},\n    )\n
    "},{"location":"reference/parsers/#parsers.RegexExtractor.run","title":"run","text":"
    run(text)\n

    Match the input against a pattern and return the output for each input

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str \| list[str] \| Document \| list[Document] | contains the input string to be processed | required |

Returns:

| Type | Description |
| --- | --- |
| list[ExtractorOutput] | A list containing the output ExtractorOutput for each input |

    Example
    >>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n    \"\"\"Match the input against a pattern and return the output for each input\n\n    Parameters:\n        text: contains the input string to be processed\n\n    Returns:\n        A list contains the output ExtractorOutput for each input\n\n    Example:\n        ```pycon\n        >>> document1 = Document(...)\n        >>> document2 = Document(...)\n        >>> document_batch = [document1, document2]\n        >>> batch_output = self(document_batch)\n        >>> print(batch_output)\n        [output1_document1, output1_document2]\n        ```\n    \"\"\"\n    # TODO: this conversion seems common\n    input_: list[str] = []\n    if not isinstance(text, list):\n        text = [text]\n\n    for item in text:\n        if isinstance(item, str):\n            input_.append(item)\n        elif isinstance(item, Document):\n            input_.append(item.text)\n        else:\n            raise ValueError(\n                f\"Invalid input type {type(item)}, should be str or Document\"\n            )\n\n    output = []\n    for each_input in input_:\n        output.append(self.run_raw(each_input))\n\n    return output\n
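A short usage sketch (the import path is an assumption based on this page's module, and the `ExtractorOutput` fields follow the listing above):

```python
from kotaemon.parsers import RegexExtractor  # assumed import path

extractor = RegexExtractor(pattern=r"\d{4}")
results = extractor("Released in 1999, remastered in 2012.")
# results[0].matches == ["1999", "2012"]
# results[0].text == "1999"   (the first match)
```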
    "},{"location":"reference/parsers/regex_extractor/","title":"Regex Extractor","text":""},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor","title":"RegexExtractor","text":"

    Bases: BaseComponent

    Simple class for extracting text from a document using a regex pattern.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pattern | List[str] | The regex pattern(s) to use. | required |
| output_map | dict | A mapping from extracted text to the desired output. Defaults to None. | required |

Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
class RegexExtractor(BaseComponent):\n    \"\"\"\n    Simple class for extracting text from a document using a regex pattern.\n\n    Args:\n        pattern (List[str]): The regex pattern(s) to use.\n        output_map (dict, optional): A mapping from extracted text to the\n            desired output. Defaults to None.\n    \"\"\"\n\n    class Config:\n        middleware_switches = {\"theflow.middleware.CachingMiddleware\": False}\n\n    pattern: list[str]\n    output_map: dict[str, str] | Callable[[str], str] = Param(\n        default_callback=lambda *_: {}\n    )\n\n    def __init__(self, pattern: str | list[str], **kwargs):\n        if isinstance(pattern, str):\n            pattern = [pattern]\n        super().__init__(pattern=pattern, **kwargs)\n\n    @staticmethod\n    def run_raw_static(pattern: str, text: str) -> list[str]:\n        \"\"\"\n        Finds all non-overlapping occurrences of a pattern in a string.\n\n        Parameters:\n            pattern (str): The regular expression pattern to search for.\n            text (str): The input string to search in.\n\n        Returns:\n            List[str]: A list of all non-overlapping occurrences of the pattern in the\n                string.\n        \"\"\"\n        return re.findall(pattern, text)\n\n    @staticmethod\n    def map_output(text, output_map) -> str:\n        \"\"\"\n        Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n        Parameters:\n            text (str): The input text to be mapped.\n            output_map (dict): A dictionary containing mapping of input text to output\n                values.\n\n        Returns:\n            str: The corresponding value from the `output_map` if `text` is found in the\n                dictionary, otherwise returns the original `text`.\n        \"\"\"\n        if not output_map:\n            return text\n\n        if isinstance(output_map, dict):\n            return output_map.get(text, text)\n\n        return output_map(text)\n\n    def run_raw(self, text: str) -> ExtractorOutput:\n        \"\"\"\n        Matches the raw text against the pattern and runs the output mapping, returning\n            an instance of ExtractorOutput.\n\n        Args:\n            text (str): The raw text to be processed.\n\n        Returns:\n            ExtractorOutput: The processed output as a list of ExtractorOutput.\n        \"\"\"\n        output: list[str] = sum(\n            [self.run_raw_static(p, text) for p in self.pattern], []\n        )\n        output = [self.map_output(text, self.output_map) for text in output]\n\n        return ExtractorOutput(\n            text=output[0] if output else \"\",\n            matches=output,\n            metadata={\"origin\": \"RegexExtractor\"},\n        )\n\n    def run(\n        self, text: str | list[str] | Document | list[Document]\n    ) -> list[ExtractorOutput]:\n        \"\"\"Match the input against a pattern and return the output for each input\n\n        Parameters:\n            text: contains the input string to be processed\n\n        Returns:\n            A list contains the output ExtractorOutput for each input\n\n        Example:\n            ```pycon\n            >>> document1 = Document(...)\n            >>> document2 = Document(...)\n            >>> document_batch = [document1, document2]\n            >>> batch_output = self(document_batch)\n            >>> print(batch_output)\n            [output1_document1, output1_document2]\n            ```\n        \"\"\"\n        # TODO: this conversion seems common\n        input_: list[str] = []\n        if not isinstance(text, list):\n            text = [text]\n\n        for item in text:\n            if isinstance(item, str):\n                input_.append(item)\n            elif isinstance(item, Document):\n                input_.append(item.text)\n            else:\n                raise ValueError(\n                    f\"Invalid input type {type(item)}, should be str or Document\"\n                )\n\n        output = []\n        for each_input in input_:\n            output.append(self.run_raw(each_input))\n\n        return output\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw_static","title":"run_raw_static staticmethod","text":"
    run_raw_static(pattern, text)\n

    Finds all non-overlapping occurrences of a pattern in a string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pattern | str | The regular expression pattern to search for. | required |
| text | str | The input string to search in. | required |

Returns:

| Type | Description |
| --- | --- |
| list[str] | A list of all non-overlapping occurrences of the pattern in the string. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef run_raw_static(pattern: str, text: str) -> list[str]:\n    \"\"\"\n    Finds all non-overlapping occurrences of a pattern in a string.\n\n    Parameters:\n        pattern (str): The regular expression pattern to search for.\n        text (str): The input string to search in.\n\n    Returns:\n        List[str]: A list of all non-overlapping occurrences of the pattern in the\n            string.\n    \"\"\"\n    return re.findall(pattern, text)\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.map_output","title":"map_output staticmethod","text":"
    map_output(text, output_map)\n

    Maps the given text to its corresponding value in the output_map dictionary.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The input text to be mapped. | required |
| output_map | dict | A dictionary containing the mapping of input text to output values. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| str | str | The corresponding value from the output_map if text is found in the dictionary, otherwise the original text. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    @staticmethod\ndef map_output(text, output_map) -> str:\n    \"\"\"\n    Maps the given `text` to its corresponding value in the `output_map` dictionary.\n\n    Parameters:\n        text (str): The input text to be mapped.\n        output_map (dict): A dictionary containing mapping of input text to output\n            values.\n\n    Returns:\n        str: The corresponding value from the `output_map` if `text` is found in the\n            dictionary, otherwise returns the original `text`.\n    \"\"\"\n    if not output_map:\n        return text\n\n    if isinstance(output_map, dict):\n        return output_map.get(text, text)\n\n    return output_map(text)\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run_raw","title":"run_raw","text":"
    run_raw(text)\n

Matches the raw text against the pattern and runs the output mapping, returning an instance of ExtractorOutput.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | The raw text to be processed. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| ExtractorOutput | ExtractorOutput | The processed output as an ExtractorOutput instance. |

    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
def run_raw(self, text: str) -> ExtractorOutput:\n    \"\"\"\n    Matches the raw text against the pattern and runs the output mapping, returning\n        an instance of ExtractorOutput.\n\n    Args:\n        text (str): The raw text to be processed.\n\n    Returns:\n        ExtractorOutput: The processed output as a list of ExtractorOutput.\n    \"\"\"\n    output: list[str] = sum(\n        [self.run_raw_static(p, text) for p in self.pattern], []\n    )\n    output = [self.map_output(text, self.output_map) for text in output]\n\n    return ExtractorOutput(\n        text=output[0] if output else \"\",\n        matches=output,\n        metadata={\"origin\": \"RegexExtractor\"},\n    )\n
    "},{"location":"reference/parsers/regex_extractor/#parsers.regex_extractor.RegexExtractor.run","title":"run","text":"
    run(text)\n

    Match the input against a pattern and return the output for each input

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str \| list[str] \| Document \| list[Document] | contains the input string to be processed | required |

Returns:

| Type | Description |
| --- | --- |
| list[ExtractorOutput] | A list containing the output ExtractorOutput for each input |

    Example
    >>> document1 = Document(...)\n>>> document2 = Document(...)\n>>> document_batch = [document1, document2]\n>>> batch_output = self(document_batch)\n>>> print(batch_output)\n[output1_document1, output1_document2]\n
    Source code in libs/kotaemon/kotaemon/parsers/regex_extractor.py
    def run(\n    self, text: str | list[str] | Document | list[Document]\n) -> list[ExtractorOutput]:\n    \"\"\"Match the input against a pattern and return the output for each input\n\n    Parameters:\n        text: contains the input string to be processed\n\n    Returns:\n        A list contains the output ExtractorOutput for each input\n\n    Example:\n        ```pycon\n        >>> document1 = Document(...)\n        >>> document2 = Document(...)\n        >>> document_batch = [document1, document2]\n        >>> batch_output = self(document_batch)\n        >>> print(batch_output)\n        [output1_document1, output1_document2]\n        ```\n    \"\"\"\n    # TODO: this conversion seems common\n    input_: list[str] = []\n    if not isinstance(text, list):\n        text = [text]\n\n    for item in text:\n        if isinstance(item, str):\n            input_.append(item)\n        elif isinstance(item, Document):\n            input_.append(item.text)\n        else:\n            raise ValueError(\n                f\"Invalid input type {type(item)}, should be str or Document\"\n            )\n\n    output = []\n    for each_input in input_:\n        output.append(self.run_raw(each_input))\n\n    return output\n
    "},{"location":"reference/storages/","title":"Storages","text":""},{"location":"reference/storages/#storages.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

    A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charge of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
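    Because every concrete store on this page implements this interface, downstream code can be written against BaseDocumentStore and stay backend-agnostic. A sketch, with import paths assumed from the module paths above:

    from typing import List

    from kotaemon.base import Document          # assumed import path
    from kotaemon.storages import BaseDocumentStore

    def ingest(store: BaseDocumentStore, docs: List[Document]) -> int:
        """Add docs to any document store and report the new size."""
        store.add(docs)      # ids default to each doc.doc_id
        return store.count()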
    "},{"location":"reference/storages/#storages.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    Document or list of documents

    required ids Optional[Union[List[str], str]]

    List of ids of the documents. Optional, if not set will use doc.doc_id

    None Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

    Elasticsearch document store supporting full-text search (BM25)

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch document store supporting full-text search (BM25)\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
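    A usage sketch, assuming a reachable Elasticsearch server at the default URL and the import paths used elsewhere on this page:

    from kotaemon.base import Document          # assumed import path
    from kotaemon.storages import ElasticsearchDocumentStore

    store = ElasticsearchDocumentStore(
        collection_name="docstore",
        elasticsearch_url="http://localhost:9200",
        k1=2.0,   # BM25 term-frequency saturation
        b=0.75,   # BM25 length normalization
    )
    store.add([Document(text="BM25 ranks documents by term statistics")])
    hits = store.query("term statistics", top_k=5)  # BM25 full-text search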
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None refresh_indices bool

    request Elasticsearch to update its index (default to True)

    True Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

    Parameters:

    Name Type Description Default query dict

    Elasticsearch query format

    required

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

    Parameters:

    Name Type Description Default query str

    query text

    required top_k int

    number of top documents to return. Defaults to 10.

    10

    Returns:

    Type Description List[Document]

    List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

    Simple memory document store that stores documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exist\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
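    A usage sketch (import paths assumed). Note that query() is a no-op here and always returns an empty list; use one of the search-capable backends for retrieval:

    from kotaemon.base import Document          # assumed import path
    from kotaemon.storages import InMemoryDocumentStore

    store = InMemoryDocumentStore()
    doc = Document(text="hello")
    store.add(doc)                   # keyed by doc.doc_id
    store.add(doc, exist_ok=True)    # overwrite instead of raising
    assert store.count() == 1
    store.save("docstore.json")      # JSON round-trip via to_dict/from_dict
    store.load("docstore.json")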
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    allow overwriting when a duplicate doc-id is found in the docstore instead of raising an error (default to False)

    required Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/#storages.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

    LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                .where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
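    A usage sketch, assuming the import paths used elsewhere on this page and that lancedb plus the tantivy FTS dependency are installed:

    from kotaemon.base import Document          # assumed import path
    from kotaemon.storages import LanceDBDocumentStore

    store = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")
    store.add([Document(text="full-text search backed by tantivy")])
    hits = store.query("tantivy", top_k=5)  # FTS over the indexed "text" column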
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/#storages.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

    Improves InMemoryDocumentStore by auto-saving whenever the corpus is changed

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
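    A usage sketch (import paths assumed). Every mutation rewrites the JSON file, so a second instance pointed at the same path sees the same corpus:

    from kotaemon.base import Document          # assumed import path
    from kotaemon.storages import SimpleFileDocumentStore

    store = SimpleFileDocumentStore(path="./docstore", collection_name="default")
    store.add(Document(text="persisted automatically"))  # writes ./docstore/default.json

    store2 = SimpleFileDocumentStore(path="./docstore")
    assert store2.count() == 1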
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    Name Type Description Default docs Union[Document, List[Document]]

    list of documents to add

    required ids Optional[Union[List[str], str]]

    specify the ids of documents to add or use existing doc.doc_id

    None exist_ok

    allow overwriting when a duplicate doc-id is found in the docstore instead of raising an error (default to False)

    required Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/#storages.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/#storages.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
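    query() returns a 3-tuple, so backend-agnostic code can be written against this interface. A sketch (import path assumed):

    from kotaemon.storages import BaseVectorStore  # assumed import path

    def nearest_ids(vs: BaseVectorStore, embedding: list[float], k: int = 5) -> list[str]:
        # query() returns (matched embeddings, similarity scores, ids)
        _embeddings, _scores, ids = vs.query(embedding, top_k=k)
        return ids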
    "},{"location":"reference/storages/#storages.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

    Parameters:

    Name Type Description Default embeddings list[list[float]] | list[DocumentWithEmbedding]

    List of embeddings

    required metadatas Optional[list[dict]]

    List of metadata of the embeddings

    None ids Optional[list[str]]

    List of ids of the embeddings

    None kwargs

    meant for vectorstore-specific parameters

    required

    Returns:

    Type Description list[str]

    List of ids of the embeddings

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids list[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

    Name Type Description Default embedding list[float]

    List of embeddings

    required top_k int

    Number of most similar embeddings to return

    1 ids Optional[list[str]]

    List of ids of the embeddings to be queried

    None

    Returns:

    Type Description tuple[list[list[float]], list[float], list[str]]

    the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
    "},{"location":"reference/storages/#storages.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
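    add() and query() are inherited from the LlamaIndexVectorStore base and follow the BaseVectorStore contract. A usage sketch (import path assumed; requires `pip install chromadb`):

    from kotaemon.storages import ChromaVectorStore  # assumed import path

    vs = ChromaVectorStore(path="./chroma", collection_name="default")
    vs.add(
        embeddings=[[0.1, 0.2, 0.3]],
        metadatas=[{"file_id": "f1"}],
        ids=["vec-1"],
    )
    embeddings, scores, ids = vs.query([0.1, 0.2, 0.3], top_k=1)
    vs.delete(["vec-1"])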
    "},{"location":"reference/storages/#storages.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/#storages.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n\n        \"\"\"save a simpleVectorStore to a dictionary.\n\n        Args:\n            save_path: Path of saving vector to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n        \"\"\"Create a SimpleKVStore from a load directory.\n\n        Args:\n            load_path: Path of loading vector.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
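    A save/load round trip (import path assumed); persistence is delegated to the wrapped SimpleVectorStore:

    from kotaemon.storages import InMemoryVectorStore  # assumed import path

    vs = InMemoryVectorStore()
    vs.add(embeddings=[[0.0, 1.0]], ids=["a"])
    vs.save("vectors.json")   # persist via the underlying SimpleVectorStore
    vs.load("vectors.json")   # restore from the same path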
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

    Save the underlying SimpleVectorStore to disk.

    Parameters:

    Name Type Description Default save_path str

    Path where the vector store is saved on disk.

    required fs Optional[AbstractFileSystem]

    An fsspec filesystem to use; defaults to the local filesystem

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n    \"\"\"Save the underlying SimpleVectorStore to disk.\n\n    Args:\n        save_path: Path where the vector store is saved on disk.\n        fs: An fsspec filesystem to use; defaults to the local filesystem\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

    Load the underlying SimpleVectorStore from a persist path.

    Parameters:

    Name Type Description Default load_path str

    Path to load the vector store from.

    required fs Optional[AbstractFileSystem]

    An fsspec filesystem to use; defaults to the local filesystem

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n    \"\"\"Load the underlying SimpleVectorStore from a persist path.\n\n    Args:\n        load_path: Path to load the vector store from.\n        fs: An fsspec filesystem to use; defaults to the local filesystem\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
    "},{"location":"reference/storages/#storages.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
    "},{"location":"reference/storages/#storages.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
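    A usage sketch (import path assumed; requires lancedb and the tantivy FTS dependency). Note that only the file_id metadata key is preserved, per the _metadata_keys assignment above:

    from kotaemon.storages import LanceDBVectorStore  # assumed import path

    vs = LanceDBVectorStore(path="./lancedb", collection_name="default")
    vs.add(
        embeddings=[[0.1, 0.2]],
        metadatas=[{"file_id": "f1"}],  # the only metadata key kept
        ids=["v1"],
    )
    vs.delete(["v1"])  # delegates to delete_nodes on the LlamaIndex wrapper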
    "},{"location":"reference/storages/#storages.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/#storages.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/#storages.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
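    MilvusVectorStore defers client construction because the wrapped LlamaIndex store needs the vector dimension at init time; _lazy_init infers it from the first batch passed to add(). A hedged usage sketch (the import path is an assumption; note that _lazy_init reads a `path` keyword before inspecting the URI, so one is supplied explicitly; requires `pip install llama-index-vector-stores-milvus`):

    from kotaemon.storages import MilvusVectorStore  # assumed import path

    vs = MilvusVectorStore(
        uri="./milvus.db",          # local file; use "http://localhost:19530" for a server
        collection_name="default",
        path="./data",              # consumed by _lazy_init when resolving the uri
    )
    # the first add() infers dim=128 and builds the underlying client
    vs.add(embeddings=[[0.1] * 128], ids=["v1"])
    embeddings, scores, ids = vs.query([0.1] * 128, top_k=1)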
    "},{"location":"reference/storages/#storages.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
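As a hedged sketch (the URL and payloads are assumptions; note that Qdrant point ids must be UUIDs or unsigned integers):

```python
# Hedged sketch for QdrantVectorStore against a local Qdrant server.
import uuid

from kotaemon.storages import QdrantVectorStore

store = QdrantVectorStore(collection_name="demo", url="http://localhost:6333")

ids = [str(uuid.uuid4()), str(uuid.uuid4())]
store.add(embeddings=[[0.1, 0.2], [0.2, 0.1]], ids=ids)

print(store.count())        # exact point count reported by Qdrant
store.delete(ids=[ids[0]])  # remove a single point by id
store.drop()                # delete the whole collection
```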
    "},{"location":"reference/storages/#storages.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

Parameters:

- ids (List[str], required): List of ids of the embeddings to be deleted
- kwargs (default: {}): meant for vectorstore-specific parameters

Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/#storages.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
    "},{"location":"reference/storages/#storages.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

Similar to InMemoryVectorStore but backed by a file by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
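A minimal sketch, assuming the `kotaemon.storages` import path: every mutation persists to `<path>/<collection_name>`, and a new instance at the same path reloads the data.

```python
# Hedged sketch for SimpleFileVectorStore; path and vectors are illustrative.
from kotaemon.storages import SimpleFileVectorStore

store = SimpleFileVectorStore(path="./vectorstore", collection_name="default")
store.add(embeddings=[[0.0, 1.0], [1.0, 0.0]], ids=["x", "y"])  # auto-persisted

# A fresh instance picks the persisted data back up from disk.
reloaded = SimpleFileVectorStore(path="./vectorstore", collection_name="default")
embs, scores, ids = reloaded.query(embedding=[0.0, 1.0], top_k=1)
```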
    "},{"location":"reference/storages/docstores/","title":"Docstores","text":""},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
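For orientation, a hedged sketch of a minimal subclass (it deliberately mirrors the InMemoryDocumentStore shown later on this page; the import paths are assumptions):

```python
# Toy BaseDocumentStore subclass: dictionary-backed, no full-text search.
from typing import List, Optional, Union

from kotaemon.base import Document
from kotaemon.storages import BaseDocumentStore


class DictDocumentStore(BaseDocumentStore):
    def __init__(self):
        self._docs: dict = {}

    def add(self, docs: Union[Document, List[Document]], ids=None, **kwargs):
        docs = docs if isinstance(docs, list) else [docs]
        ids = ids if ids else [d.doc_id for d in docs]
        ids = ids if isinstance(ids, list) else [ids]
        for doc_id, doc in zip(ids, docs):
            self._docs[doc_id] = doc

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        ids = ids if isinstance(ids, list) else [ids]
        return [self._docs[i] for i in ids]

    def get_all(self) -> List[Document]:
        return list(self._docs.values())

    def count(self) -> int:
        return len(self._docs)

    def query(self, query: str, top_k: int = 10, doc_ids: Optional[list] = None):
        return []  # this toy store implements no search

    def delete(self, ids: Union[List[str], str]):
        ids = ids if isinstance(ids, list) else [ids]
        for i in ids:
            self._docs.pop(i, None)

    def drop(self):
        self._docs = {}
```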
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

Parameters:

- docs (Union[Document, List[Document]], required): Document or list of documents
- ids (Optional[Union[List[str], str]], default: None): List of ids of the documents. Optional, if not set will use doc.doc_id

Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

Elasticsearch-backed document store with BM25 full-text search

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch-backed document store with BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
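A hedged usage sketch (assumes an Elasticsearch server at http://localhost:9200 and `pip install elasticsearch`; texts and names are illustrative):

```python
# Hedged sketch for ElasticsearchDocumentStore.
from kotaemon.base import Document
from kotaemon.storages import ElasticsearchDocumentStore

store = ElasticsearchDocumentStore(collection_name="docstore")
store.add([Document(text="hello world"), Document(text="goodbye world")])

hits = store.query("hello", top_k=5)  # BM25 full-text search over `content`
print([d.text for d in hits])
```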
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

Parameters:

- docs (Union[Document, List[Document]], required): list of documents to add
- ids (Optional[Union[List[str], str]], default: None): specify the ids of documents to add or use existing doc.doc_id
- refresh_indices (bool, default: True): request Elasticsearch to update its index (default to True)

Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

Parameters:

- query (dict, required): Elasticsearch query format

Returns:

- List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
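For example, any query body accepted by the ES client can be passed straight through (continuing the `store` from the sketch above; the match text is illustrative):

```python
# Raw Elasticsearch query dict passed directly to the client.
raw_query = {
    "query": {"match": {"content": "retrieval"}},
    "size": 5,
}
docs = store.query_raw(raw_query)
```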
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

Parameters:

- query (str, required): query text
- top_k (int, default: 10): number of top documents to return. Defaults to 10.

Returns:

- List[Document]: List of result documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
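When `doc_ids` is given, the BM25 match is wrapped in a bool query that also filters on `_id` (the ids below are illustrative):

```python
# Restrict the BM25 search to a subset of documents.
docs = store.query("retrieval", top_k=3, doc_ids=["doc-1", "doc-2"])
```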
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

Simple in-memory document store that stores documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exist\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
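A short sketch of the explicit save/load round-trip (paths and ids are illustrative; the import paths are assumptions):

```python
# In-memory docstore with manual persistence.
from kotaemon.base import Document
from kotaemon.storages import InMemoryDocumentStore

store = InMemoryDocumentStore()
store.add(Document(text="hello"), ids="doc-1")
store.save("docstore.json")

restored = InMemoryDocumentStore()
restored.load("docstore.json")
print(restored.count())  # 1
```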
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

Parameters:

- docs (Union[Document, List[Document]], required): list of documents to add
- ids (Optional[Union[List[str], str]], default: None): specify the ids of documents to add or use existing doc.doc_id
- exist_ok (bool, passed via kwargs, default: False): when False, adding a duplicate doc-id raises a ValueError

Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/docstores/#storages.docstores.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy-py'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                .where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
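A hedged usage sketch (requires `pip install lancedb` plus the tantivy tokenizer package; names and texts are illustrative):

```python
# LanceDB-backed docstore with full-text search over the `text` column.
from kotaemon.base import Document
from kotaemon.storages import LanceDBDocumentStore

store = LanceDBDocumentStore(path="lancedb", collection_name="docstore")
store.add(docs=[Document(text="full-text search with LanceDB")], ids=["doc-1"])

hits = store.query("lancedb", top_k=5)  # FTS via the tantivy-based index
print([d.text for d in hits])
```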
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/#storages.docstores.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

Improves InMemoryDocumentStore by auto-saving whenever the corpus changes

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
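A short sketch (import path and names are assumptions): the store auto-saves to `<path>/<collection_name>.json`, and a fresh instance reloads that file on construction.

```python
# File-backed docstore with automatic persistence.
from kotaemon.base import Document
from kotaemon.storages import SimpleFileDocumentStore

store = SimpleFileDocumentStore(path="./docstore", collection_name="default")
store.add(Document(text="persisted automatically"), ids="doc-1")

# A fresh instance reloads the JSON file on construction.
store2 = SimpleFileDocumentStore(path="./docstore", collection_name="default")
print(store2.get("doc-1")[0].text)
```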
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

Parameters:

- docs (Union[Document, List[Document]], required): list of documents to add
- ids (Optional[Union[List[str], str]], default: None): specify the ids of documents to add or use existing doc.doc_id
- exist_ok (bool, passed via kwargs, default: False): when False, adding a duplicate doc-id raises a ValueError

Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/#storages.docstores.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/docstores/base/","title":"Base","text":""},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore","title":"BaseDocumentStore","text":"

    Bases: ABC

A document store is in charge of storing and managing documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    class BaseDocumentStore(ABC):\n    \"\"\"A document store is in charged of storing and managing documents\"\"\"\n\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: Document or list of documents\n            ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        ...\n\n    @abstractmethod\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search document store using search query\"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.add","title":"add abstractmethod","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

Parameters:

- docs (Union[Document, List[Document]], required): Document or list of documents
- ids (Optional[Union[List[str], str]], default: None): List of ids of the documents. Optional, if not set will use doc.doc_id

Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: Document or list of documents\n        ids: List of ids of the documents. Optional, if not set will use doc.doc_id\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get","title":"get abstractmethod","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.get_all","title":"get_all abstractmethod","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.count","title":"count abstractmethod","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.query","title":"query abstractmethod","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search document store using search query

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search document store using search query\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.delete","title":"delete abstractmethod","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/base/#storages.docstores.base.BaseDocumentStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the document store\"\"\"\n    ...\n
    "},{"location":"reference/storages/docstores/elasticsearch/","title":"Elasticsearch","text":""},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore","title":"ElasticsearchDocumentStore","text":"

    Bases: BaseDocumentStore

Elasticsearch-backed document store with BM25 full-text search

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
class ElasticsearchDocumentStore(BaseDocumentStore):\n    \"\"\"Elasticsearch-backed document store with BM25 full-text search\"\"\"\n\n    def __init__(\n        self,\n        collection_name: str = \"docstore\",\n        elasticsearch_url: str = \"http://localhost:9200\",\n        k1: float = 2.0,\n        b: float = 0.75,\n        **kwargs,\n    ):\n        try:\n            from elasticsearch import Elasticsearch\n            from elasticsearch.helpers import bulk\n        except ImportError:\n            raise ImportError(\n                \"To use ElasticsearchDocstore please install `pip install elasticsearch`\"\n            )\n\n        self.elasticsearch_url = elasticsearch_url\n        self.index_name = collection_name\n        self.k1 = k1\n        self.b = b\n\n        # Create an Elasticsearch client instance\n        self.client = Elasticsearch(elasticsearch_url, **kwargs)\n        self.es_bulk = bulk\n        # Define the index settings and mappings\n        settings = {\n            \"analysis\": {\"analyzer\": {\"default\": {\"type\": \"standard\"}}},\n            \"similarity\": {\n                \"custom_bm25\": {\n                    \"type\": \"BM25\",\n                    \"k1\": k1,\n                    \"b\": b,\n                }\n            },\n        }\n        mappings = {\n            \"properties\": {\n                \"content\": {\n                    \"type\": \"text\",\n                    \"similarity\": \"custom_bm25\",  # Use the custom BM25 similarity\n                }\n            }\n        }\n\n        # Create the index with the specified settings and mappings\n        if not self.client.indices.exists(index=self.index_name):\n            self.client.indices.create(\n                index=self.index_name, mappings=mappings, settings=settings\n            )\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or use existing doc.doc_id\n            refresh_indices: request Elasticsearch to update its index (default to True)\n        \"\"\"\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        requests = []\n        for doc_id, doc in zip(doc_ids, docs):\n            text = doc.text\n            metadata = doc.metadata\n            request = {\n                \"_op_type\": \"index\",\n                \"_index\": self.index_name,\n                \"content\": text,\n                \"metadata\": metadata,\n                \"_id\": doc_id,\n            }\n            requests.append(request)\n\n        success, failed = self.es_bulk(self.client, requests)\n        print(\"Added/Updated documents to index\", success)\n        print(\"Failed documents to index\", failed)\n\n        if refresh_indices:\n            self.client.indices.refresh(index=self.index_name)\n\n    def query_raw(self, query: dict) -> List[Document]:\n        \"\"\"Query Elasticsearch store using query format of ES client\n\n        Args:\n            query (dict): Elasticsearch query format\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        res = self.client.search(index=self.index_name, body=query)\n        docs = []\n        for r in res[\"hits\"][\"hits\"]:\n            docs.append(\n                Document(\n                    id_=r[\"_id\"],\n                    text=r[\"_source\"][\"content\"],\n                    metadata=r[\"_source\"][\"metadata\"],\n                )\n            )\n        return docs\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n        Args:\n            query (str): query text\n            top_k (int, optional): number of\n                top documents to return. Defaults to 10.\n\n        Returns:\n            List[Document]: List of result documents\n        \"\"\"\n        query_dict: dict = {\"match\": {\"content\": query}}\n        if doc_ids is not None:\n            query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n        query_dict = {\"query\": query_dict, \"size\": top_k}\n        return self.query_raw(query_dict)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n        query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n        return self.query_raw(query_dict)\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        count = int(\n            self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n        )\n        return count\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n        return self.query_raw(query_dict)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        query = {\"query\": {\"terms\": {\"_id\": ids}}}\n        self.client.delete_by_query(index=self.index_name, body=query)\n        self.client.indices.refresh(index=self.index_name)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.client.indices.delete(index=self.index_name)\n        self.client.indices.refresh(index=self.index_name)\n\n    def __persist_flow__(self):\n        return {\n            \"index_name\": self.index_name,\n            \"elasticsearch_url\": self.elasticsearch_url,\n            \"k1\": self.k1,\n            \"b\": self.b,\n        }\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Add document into document store

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | docs | Union[Document, List[Document]] | list of documents to add | required |
    | ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
    | refresh_indices | bool | request Elasticsearch to update its index | True |

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or use existing doc.doc_id\n        refresh_indices: request Elasticsearch to update its index (default to True)\n    \"\"\"\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    requests = []\n    for doc_id, doc in zip(doc_ids, docs):\n        text = doc.text\n        metadata = doc.metadata\n        request = {\n            \"_op_type\": \"index\",\n            \"_index\": self.index_name,\n            \"content\": text,\n            \"metadata\": metadata,\n            \"_id\": doc_id,\n        }\n        requests.append(request)\n\n    success, failed = self.es_bulk(self.client, requests)\n    print(\"Added/Updated documents to index\", success)\n    print(\"Failed documents to index\", failed)\n\n    if refresh_indices:\n        self.client.indices.refresh(index=self.index_name)\n
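    Putting the constructor and add together gives the typical ingestion flow. A minimal sketch, assuming an Elasticsearch server reachable at localhost:9200 and that Document and the store are importable as shown (both import paths are assumptions inferred from the source paths above):

    from kotaemon.base import Document  # assumed import path
    from kotaemon.storages.docstores.elasticsearch import ElasticsearchDocumentStore

    # Connecting creates the index (with the custom BM25 similarity) if it is missing
    store = ElasticsearchDocumentStore(
        collection_name="my-docs",
        elasticsearch_url="http://localhost:9200",
        k1=2.0,  # BM25 term-frequency saturation
        b=0.75,  # BM25 length normalization
    )

    # ids default to each doc.doc_id; refresh_indices=True makes them searchable at once
    docs = [
        Document(text="BM25 is a ranking function.", metadata={"source": "notes"}),
        Document(text="Elasticsearch indexes text fields.", metadata={"source": "wiki"}),
    ]
    store.add(docs)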
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query_raw","title":"query_raw","text":"
    query_raw(query)\n

    Query Elasticsearch store using query format of ES client

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | query | dict | Elasticsearch query format | required |

    Returns:

    | Type | Description |
    | --- | --- |
    | List[Document] | List of result documents |

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query_raw(self, query: dict) -> List[Document]:\n    \"\"\"Query Elasticsearch store using query format of ES client\n\n    Args:\n        query (dict): Elasticsearch query format\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    res = self.client.search(index=self.index_name, body=query)\n    docs = []\n    for r in res[\"hits\"][\"hits\"]:\n        docs.append(\n            Document(\n                id_=r[\"_id\"],\n                text=r[\"_source\"][\"content\"],\n                metadata=r[\"_source\"][\"metadata\"],\n            )\n        )\n    return docs\n
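    query_raw accepts any Elasticsearch query body, which is useful when plain BM25 matching is not enough. A sketch of a bool query combining a match clause with a metadata filter, reusing the store from the sketch above (the field values are illustrative):

    raw_query = {
        "query": {
            "bool": {
                "must": [{"match": {"content": "ranking function"}}],
                "filter": [{"match": {"metadata.source": "notes"}}],
            }
        },
        "size": 5,
    }
    hits = store.query_raw(raw_query)  # -> List[Document]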
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Search Elasticsearch docstore using search query (BM25)

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | query | str | query text | required |
    | top_k | int | number of top documents to return | 10 |

    Returns:

    | Type | Description |
    | --- | --- |
    | List[Document] | List of result documents |

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Search Elasticsearch docstore using search query (BM25)\n\n    Args:\n        query (str): query text\n        top_k (int, optional): number of\n            top documents to return. Defaults to 10.\n\n    Returns:\n        List[Document]: List of result documents\n    \"\"\"\n    query_dict: dict = {\"match\": {\"content\": query}}\n    if doc_ids is not None:\n        query_dict = {\"bool\": {\"must\": [query_dict, {\"terms\": {\"_id\": doc_ids}}]}}\n    query_dict = {\"query\": query_dict, \"size\": top_k}\n    return self.query_raw(query_dict)\n
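    query builds this kind of body for you: a match on the content field, wrapped in a bool/terms filter when doc_ids is given. Continuing the sketch:

    hits = store.query("ranking function", top_k=3)

    # Restrict the BM25 search to a known subset of document ids
    subset = store.query("ranking", top_k=3, doc_ids=[d.doc_id for d in docs])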
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n    query_dict = {\"query\": {\"terms\": {\"_id\": ids}}, \"size\": 10000}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    count = int(\n        self.client.cat.count(index=self.index_name, format=\"json\")[0][\"count\"]\n    )\n    return count\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    query_dict = {\"query\": {\"match_all\": {}}, \"size\": MAX_DOCS_TO_GET}\n    return self.query_raw(query_dict)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    query = {\"query\": {\"terms\": {\"_id\": ids}}}\n    self.client.delete_by_query(index=self.index_name, body=query)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/elasticsearch/#storages.docstores.elasticsearch.ElasticsearchDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.client.indices.delete(index=self.index_name)\n    self.client.indices.refresh(index=self.index_name)\n
    "},{"location":"reference/storages/docstores/in_memory/","title":"In Memory","text":""},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore","title":"InMemoryDocumentStore","text":"

    Bases: BaseDocumentStore

    Simple in-memory document store that stores documents in a dictionary

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    class InMemoryDocumentStore(BaseDocumentStore):\n    \"\"\"Simple memory document store that store document in a dictionary\"\"\"\n\n    def __init__(self):\n        self._store = {}\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n        if ids and not isinstance(ids, list):\n            ids = [ids]\n        if not isinstance(docs, list):\n            docs = [docs]\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n        for doc_id, doc in zip(doc_ids, docs):\n            if doc_id in self._store and not exist_ok:\n                raise ValueError(f\"Document with id {doc_id} already exist\")\n            self._store[doc_id] = doc\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def get_all(self) -> List[Document]:\n        \"\"\"Get all documents\"\"\"\n        return list(self._store.values())\n\n    def count(self) -> int:\n        \"\"\"Count number of documents\"\"\"\n        return len(self._store)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            del self._store[doc_id]\n\n    def save(self, path: Union[str, Path]):\n        \"\"\"Save document to path\"\"\"\n        store = {key: value.to_dict() for key, value in self._store.items()}\n        with open(path, \"w\") as f:\n            json.dump(store, f)\n\n    def load(self, path: Union[str, Path]):\n        \"\"\"Load document store from path\"\"\"\n        with open(path) as f:\n            store = json.load(f)\n        # TODO: save and load aren't lossless. A Document-subclass will lose\n        # information. Need to edit the `to_dict` and `from_dict` methods in\n        # the Document class.\n        # For better query support, utilize SQLite as the default document store.\n        # Also, for portability, use SQLAlchemy for document store.\n        self._store = {key: Document.from_dict(value) for key, value in store.items()}\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        \"\"\"Perform full-text search on document store\"\"\"\n        return []\n\n    def __persist_flow__(self):\n        return {}\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self._store = {}\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | docs | Union[Document, List[Document]] | list of documents to add | required |
    | ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
    | exist_ok | bool | raise error when duplicate doc-id found in the docstore | False |

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    exist_ok: bool = kwargs.pop(\"exist_ok\", False)\n\n    if ids and not isinstance(ids, list):\n        ids = [ids]\n    if not isinstance(docs, list):\n        docs = [docs]\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n\n    for doc_id, doc in zip(doc_ids, docs):\n        if doc_id in self._store and not exist_ok:\n            raise ValueError(f\"Document with id {doc_id} already exist\")\n        self._store[doc_id] = doc\n
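    A short sketch of the duplicate-id behavior (Document as in the earlier sketches; the import path is an assumption):

    from kotaemon.storages.docstores.in_memory import InMemoryDocumentStore

    mem = InMemoryDocumentStore()
    doc = Document(text="hello")
    mem.add(doc)

    # Re-adding the same id raises ValueError unless exist_ok=True is passed
    try:
        mem.add(doc)
    except ValueError:
        mem.add(doc, exist_ok=True)  # overwrites silently instead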
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.get_all","title":"get_all","text":"
    get_all()\n

    Get all documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def get_all(self) -> List[Document]:\n    \"\"\"Get all documents\"\"\"\n    return list(self._store.values())\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.count","title":"count","text":"
    count()\n

    Count number of documents

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def count(self) -> int:\n    \"\"\"Count number of documents\"\"\"\n    return len(self._store)\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        del self._store[doc_id]\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.save","title":"save","text":"
    save(path)\n

    Save document to path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def save(self, path: Union[str, Path]):\n    \"\"\"Save document to path\"\"\"\n    store = {key: value.to_dict() for key, value in self._store.items()}\n    with open(path, \"w\") as f:\n        json.dump(store, f)\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.load","title":"load","text":"
    load(path)\n

    Load document store from path

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def load(self, path: Union[str, Path]):\n    \"\"\"Load document store from path\"\"\"\n    with open(path) as f:\n        store = json.load(f)\n    # TODO: save and load aren't lossless. A Document-subclass will lose\n    # information. Need to edit the `to_dict` and `from_dict` methods in\n    # the Document class.\n    # For better query support, utilize SQLite as the default document store.\n    # Also, for portability, use SQLAlchemy for document store.\n    self._store = {key: Document.from_dict(value) for key, value in store.items()}\n
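    save and load give the in-memory store simple JSON persistence. A roundtrip sketch, reusing mem from the sketch above (note the TODO in the source: Document subclasses may lose information on the way):

    mem.save("docstore.json")

    restored = InMemoryDocumentStore()
    restored.load("docstore.json")
    assert restored.count() == mem.count()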
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.query","title":"query","text":"
    query(query, top_k=10, doc_ids=None)\n

    Perform full-text search on document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def query(\n    self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n) -> List[Document]:\n    \"\"\"Perform full-text search on document store\"\"\"\n    return []\n
    "},{"location":"reference/storages/docstores/in_memory/#storages.docstores.in_memory.InMemoryDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/in_memory.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self._store = {}\n
    "},{"location":"reference/storages/docstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore","title":"LanceDBDocumentStore","text":"

    Bases: BaseDocumentStore

    LanceDB document store which supports full-text search queries

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    class LanceDBDocumentStore(BaseDocumentStore):\n    \"\"\"LanceDB document store which supports full-text search queries\"\"\"\n\n    def __init__(self, path: str = \"lancedb\", collection_name: str = \"docstore\"):\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        self.db_uri = path\n        self.collection_name = collection_name\n        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        refresh_indices: bool = True,\n        **kwargs,\n    ):\n        \"\"\"Load documents into lancedb storage.\"\"\"\n        doc_ids = ids if ids else [doc.doc_id for doc in docs]\n        data: list[dict[str, str]] | None = [\n            {\n                \"id\": doc_id,\n                \"text\": doc.text,\n                \"attributes\": json.dumps(doc.metadata),\n            }\n            for doc_id, doc in zip(doc_ids, docs)\n        ]\n\n        if self.collection_name not in self.db_connection.table_names():\n            if data:\n                document_collection = self.db_connection.create_table(\n                    self.collection_name, data=data, mode=\"overwrite\"\n                )\n        else:\n            # add data to existing table\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if data:\n                document_collection.add(data)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def query(\n        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None\n    ) -> List[Document]:\n        if doc_ids:\n            id_filter = \", \".join([f\"'{_id}'\" for _id in doc_ids])\n            query_filter = f\"id in ({id_filter})\"\n        else:\n            query_filter = None\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            if query_filter:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .where(query_filter, prefilter=True)\n                    .limit(top_k)\n                    .to_list()\n                )\n            else:\n                docs = (\n                    document_collection.search(query, query_type=\"fts\")\n                    .limit(top_k)\n                    .to_list()\n                )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        try:\n            document_collection = self.db_connection.open_table(self.collection_name)\n            query_filter = f\"id in ({id_filter})\"\n            docs = (\n                document_collection.search()\n                
.where(query_filter)\n                .limit(MAX_DOCS_TO_GET)\n                .to_list()\n            )\n        except (ValueError, FileNotFoundError):\n            docs = []\n        return [\n            Document(\n                id_=doc[\"id\"],\n                text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n                metadata=json.loads(doc[\"attributes\"]),\n            )\n            for doc in docs\n        ]\n\n    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n        \"\"\"Delete document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        document_collection = self.db_connection.open_table(self.collection_name)\n        id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n        query_filter = f\"id in ({id_filter})\"\n        document_collection.delete(query_filter)\n\n        if refresh_indices:\n            document_collection.create_fts_index(\n                \"text\",\n                tokenizer_name=\"en_stem\",\n                replace=True,\n            )\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        self.db_connection.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def get_all(self) -> List[Document]:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"db_uri\": self.db_uri,\n            \"collection_name\": self.collection_name,\n        }\n
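    A usage sketch, assuming lancedb and its tantivy dependency are installed and reusing the docs list from the Elasticsearch sketch; the table is created lazily on the first add and the full-text index is rebuilt whenever refresh_indices is true (the import path is an assumption):

    from kotaemon.storages.docstores.lancedb import LanceDBDocumentStore

    lance = LanceDBDocumentStore(path="./lancedb", collection_name="docstore")
    lance.add(docs)  # note: expects a list; a bare Document is not coerced here
    hits = lance.query("ranking", top_k=5)
    fetched = lance.get([d.doc_id for d in docs])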
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.add","title":"add","text":"
    add(docs, ids=None, refresh_indices=True, **kwargs)\n

    Load documents into lancedb storage.

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    refresh_indices: bool = True,\n    **kwargs,\n):\n    \"\"\"Load documents into lancedb storage.\"\"\"\n    doc_ids = ids if ids else [doc.doc_id for doc in docs]\n    data: list[dict[str, str]] | None = [\n        {\n            \"id\": doc_id,\n            \"text\": doc.text,\n            \"attributes\": json.dumps(doc.metadata),\n        }\n        for doc_id, doc in zip(doc_ids, docs)\n    ]\n\n    if self.collection_name not in self.db_connection.table_names():\n        if data:\n            document_collection = self.db_connection.create_table(\n                self.collection_name, data=data, mode=\"overwrite\"\n            )\n    else:\n        # add data to existing table\n        document_collection = self.db_connection.open_table(self.collection_name)\n        if data:\n            document_collection.add(data)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    try:\n        document_collection = self.db_connection.open_table(self.collection_name)\n        query_filter = f\"id in ({id_filter})\"\n        docs = (\n            document_collection.search()\n            .where(query_filter)\n            .limit(MAX_DOCS_TO_GET)\n            .to_list()\n        )\n    except (ValueError, FileNotFoundError):\n        docs = []\n    return [\n        Document(\n            id_=doc[\"id\"],\n            text=doc[\"text\"] if doc[\"text\"] else \"<empty>\",\n            metadata=json.loads(doc[\"attributes\"]),\n        )\n        for doc in docs\n    ]\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.delete","title":"delete","text":"
    delete(ids, refresh_indices=True)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):\n    \"\"\"Delete document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    document_collection = self.db_connection.open_table(self.collection_name)\n    id_filter = \", \".join([f\"'{_id}'\" for _id in ids])\n    query_filter = f\"id in ({id_filter})\"\n    document_collection.delete(query_filter)\n\n    if refresh_indices:\n        document_collection.create_fts_index(\n            \"text\",\n            tokenizer_name=\"en_stem\",\n            replace=True,\n        )\n
    "},{"location":"reference/storages/docstores/lancedb/#storages.docstores.lancedb.LanceDBDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/lancedb.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    self.db_connection.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/docstores/simple_file/","title":"Simple File","text":""},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore","title":"SimpleFileDocumentStore","text":"

    Bases: InMemoryDocumentStore

    Improves InMemoryDocumentStore by auto-saving whenever the corpus is changed

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    class SimpleFileDocumentStore(InMemoryDocumentStore):\n    \"\"\"Improve InMemoryDocumentStore by auto saving whenever the corpus is changed\"\"\"\n\n    def __init__(self, path: str | Path, collection_name: str = \"default\"):\n        super().__init__()\n        self._path = path\n        self._collection_name = collection_name\n\n        Path(path).mkdir(parents=True, exist_ok=True)\n        self._save_path = Path(path) / f\"{collection_name}.json\"\n        if self._save_path.is_file():\n            self.load(self._save_path)\n\n    def get(self, ids: Union[List[str], str]) -> List[Document]:\n        \"\"\"Get document by id\"\"\"\n        if not isinstance(ids, list):\n            ids = [ids]\n\n        for doc_id in ids:\n            if doc_id not in self._store:\n                self.load(self._save_path)\n                break\n\n        return [self._store[doc_id] for doc_id in ids]\n\n    def add(\n        self,\n        docs: Union[Document, List[Document]],\n        ids: Optional[Union[List[str], str]] = None,\n        **kwargs,\n    ):\n        \"\"\"Add document into document store\n\n        Args:\n            docs: list of documents to add\n            ids: specify the ids of documents to add or\n                use existing doc.doc_id\n            exist_ok: raise error when duplicate doc-id\n                found in the docstore (default to False)\n        \"\"\"\n        super().add(docs=docs, ids=ids, **kwargs)\n        self.save(self._save_path)\n\n    def delete(self, ids: Union[List[str], str]):\n        \"\"\"Delete document by id\"\"\"\n        super().delete(ids=ids)\n        self.save(self._save_path)\n\n    def drop(self):\n        \"\"\"Drop the document store\"\"\"\n        super().drop()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        from theflow.utils.modules import serialize\n\n        return {\n            \"path\": serialize(self._path),\n            \"collection_name\": self._collection_name,\n        }\n
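    Because add and delete both end with save, the JSON file under path always mirrors the in-memory state, and a new instance over the same path picks the corpus back up. A sketch (import path assumed):

    from kotaemon.storages.docstores.simple_file import SimpleFileDocumentStore

    sf = SimpleFileDocumentStore(path="./docstore", collection_name="default")
    sf.add(Document(text="persisted automatically"))
    # ./docstore/default.json is already up to date; a fresh instance reloads it
    sf2 = SimpleFileDocumentStore(path="./docstore", collection_name="default")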
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.get","title":"get","text":"
    get(ids)\n

    Get document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def get(self, ids: Union[List[str], str]) -> List[Document]:\n    \"\"\"Get document by id\"\"\"\n    if not isinstance(ids, list):\n        ids = [ids]\n\n    for doc_id in ids:\n        if doc_id not in self._store:\n            self.load(self._save_path)\n            break\n\n    return [self._store[doc_id] for doc_id in ids]\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.add","title":"add","text":"
    add(docs, ids=None, **kwargs)\n

    Add document into document store

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | docs | Union[Document, List[Document]] | list of documents to add | required |
    | ids | Optional[Union[List[str], str]] | specify the ids of documents to add or use existing doc.doc_id | None |
    | exist_ok | bool | raise error when duplicate doc-id found in the docstore | False |

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def add(\n    self,\n    docs: Union[Document, List[Document]],\n    ids: Optional[Union[List[str], str]] = None,\n    **kwargs,\n):\n    \"\"\"Add document into document store\n\n    Args:\n        docs: list of documents to add\n        ids: specify the ids of documents to add or\n            use existing doc.doc_id\n        exist_ok: raise error when duplicate doc-id\n            found in the docstore (default to False)\n    \"\"\"\n    super().add(docs=docs, ids=ids, **kwargs)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.delete","title":"delete","text":"
    delete(ids)\n

    Delete document by id

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def delete(self, ids: Union[List[str], str]):\n    \"\"\"Delete document by id\"\"\"\n    super().delete(ids=ids)\n    self.save(self._save_path)\n
    "},{"location":"reference/storages/docstores/simple_file/#storages.docstores.simple_file.SimpleFileDocumentStore.drop","title":"drop","text":"
    drop()\n

    Drop the document store

    Source code in libs/kotaemon/kotaemon/storages/docstores/simple_file.py
    def drop(self):\n    \"\"\"Drop the document store\"\"\"\n    super().drop()\n    self._save_path.unlink(missing_ok=True)\n
    "},{"location":"reference/storages/vectorstores/","title":"Vectorstores","text":""},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
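    Any concrete vector store must honor this contract. A deliberately naive, dict-backed sketch to make the shapes concrete; it does brute-force cosine similarity and accepts raw float lists only (the real interface also accepts DocumentWithEmbedding; the import path is an assumption based on the source path above):

    import math

    from kotaemon.storages.vectorstores.base import BaseVectorStore


    class ToyVectorStore(BaseVectorStore):
        """Illustrative only, not for production use."""

        def __init__(self):
            self._vectors: dict[str, list[float]] = {}

        def add(self, embeddings, metadatas=None, ids=None) -> list[str]:
            start = len(self._vectors)
            ids = ids or [str(i) for i in range(start, start + len(embeddings))]
            self._vectors.update(zip(ids, embeddings))
            return ids

        def delete(self, ids: list[str], **kwargs):
            for id_ in ids:
                self._vectors.pop(id_, None)

        def query(self, embedding, top_k=1, ids=None, **kwargs):
            def cosine(a, b):
                norm = math.hypot(*a) * math.hypot(*b)
                return sum(x * y for x, y in zip(a, b)) / norm if norm else 0.0

            pool = ids if ids is not None else list(self._vectors)
            top = sorted(pool, key=lambda i: cosine(self._vectors[i], embedding), reverse=True)[:top_k]
            # matched embeddings, similarity scores, ids -- exactly the documented tuple
            return [self._vectors[i] for i in top], [cosine(self._vectors[i], embedding) for i in top], top

        def drop(self):
            self._vectors = {}


    vs = ToyVectorStore()
    ids = vs.add([[1.0, 0.0], [0.0, 1.0]])
    embs, scores, hit_ids = vs.query([1.0, 0.1], top_k=1)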
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | embeddings | list[list[float]] \| list[DocumentWithEmbedding] | List of embeddings | required |
    | metadatas | Optional[list[dict]] | List of metadata of the embeddings | None |
    | ids | Optional[list[str]] | List of ids of the embeddings | None |
    | kwargs | | meant for vectorstore-specific parameters | required |

    Returns:

    | Type | Description |
    | --- | --- |
    | list[str] | List of ids of the embeddings |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | ids | list[str] | List of ids of the embeddings to be deleted | required |
    | kwargs | | meant for vectorstore-specific parameters | {} |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | embedding | list[float] | the query embedding | required |
    | top_k | int | Number of most similar embeddings to return | 1 |
    | ids | Optional[list[str]] | List of ids of the embeddings to be queried | None |

    Returns:

    | Type | Description |
    | --- | --- |
    | tuple[list[list[float]], list[float], list[str]] | the matched embeddings, the similarity scores, and the ids |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
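    A sketch of working with a local persistent collection (requires chromadb; the import path is an assumption):

    from kotaemon.storages.vectorstores.chroma import ChromaVectorStore

    vs = ChromaVectorStore(path="./chroma", collection_name="default")
    vs.add(
        embeddings=[[0.1, 0.2], [0.3, 0.4]],
        metadatas=[{"source": "a"}, {"source": "b"}],
        ids=["a", "b"],
    )
    embs, scores, ids = vs.query(embedding=[0.1, 0.2], top_k=1)
    vs.delete(ids=["b"])  # remove individual embeddings
    vs.drop()             # remove the whole collection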
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | ids | List[str] | List of ids of the embeddings to be deleted | required |
    | kwargs | | meant for vectorstore-specific parameters | {} |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n\n        \"\"\"save a simpleVectorStore to a dictionary.\n\n        Args:\n            save_path: Path of saving vector to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n        \"\"\"Create a SimpleKVStore from a load directory.\n\n        Args:\n            load_path: Path of loading vector.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

    Save the SimpleVectorStore to a file on disk.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | save_path | str | Path of saving vector to disk. | required |
    | fs | Optional[AbstractFileSystem] | An abstract super-class for pythonic file-systems | None |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n\n    \"\"\"save a simpleVectorStore to a dictionary.\n\n    Args:\n        save_path: Path of saving vector to disk.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

    Load the SimpleVectorStore from a persisted path.

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | load_path | str | Path of loading vector. | required |
    | fs | Optional[AbstractFileSystem] | An abstract super-class for pythonic file-systems | None |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n    \"\"\"Create a SimpleKVStore from a load directory.\n\n    Args:\n        load_path: Path of loading vector.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
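    Together, save and load make the otherwise volatile store persistable. A roundtrip sketch (import path assumed):

    from kotaemon.storages.vectorstores.in_memory import InMemoryVectorStore

    vs = InMemoryVectorStore()
    vs.add(embeddings=[[1.0, 0.0]], ids=["a"])
    vs.save("vectors.json")

    fresh = InMemoryVectorStore()
    fresh.load("vectors.json")
    embs, scores, ids = fresh.query(embedding=[1.0, 0.0], top_k=1)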
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tantivy'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | ids | List[str] | List of ids of the embeddings to be deleted | required |
    | kwargs | | meant for vectorstore-specific parameters | {} |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
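    Because the underlying LlamaIndex Milvus wrapper needs the vector dimension up front, the client here is initialized lazily from the first embedding it sees; nothing connects until add or query runs. A sketch against a local Milvus Lite file, where a path kwarg pointing at an existing directory is joined with the uri as in _lazy_init above (requires llama-index-vector-stores-milvus; import path assumed):

    import os

    from kotaemon.storages.vectorstores.milvus import MilvusVectorStore

    os.makedirs("./data", exist_ok=True)
    vs = MilvusVectorStore(uri="milvus.db", collection_name="default", path="./data")
    # the first add infers dim=2 and creates the client lazily
    vs.add(embeddings=[[0.1, 0.2], [0.3, 0.4]], ids=["a", "b"])
    embs, scores, ids = vs.query(embedding=[0.1, 0.2], top_k=2)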
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
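    A sketch against a running Qdrant server (requires llama-index-vector-stores-qdrant; the URL and import path are assumptions):

    from kotaemon.storages.vectorstores.qdrant import QdrantVectorStore

    vs = QdrantVectorStore(collection_name="default", url="http://localhost:6333")
    vs.add(embeddings=[[0.1, 0.2]], ids=["a"])
    vs.delete(ids=["a"])  # point-level delete via PointIdsList
    print(vs.count())     # exact count from the collection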
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    | Name | Type | Description | Default |
    | --- | --- | --- | --- |
    | ids | List[str] | List of ids of the embeddings to be deleted | required |
    | kwargs | | meant for vectorstore-specific parameters | {} |

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
    "},{"location":"reference/storages/vectorstores/#storages.vectorstores.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Similar to InMemoryVectorStore but backed by a file on disk by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
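    Every add and delete persists immediately, so the file at path/collection_name always reflects the current state and a new instance over the same path reloads it. A sketch (import path assumed):

    from kotaemon.storages.vectorstores.simple_file import SimpleFileVectorStore

    vs = SimpleFileVectorStore(path="./vectors", collection_name="default")
    vs.add(embeddings=[[0.5, 0.5]], ids=["a"])
    # ./vectors/default is already persisted; a fresh instance picks it up
    vs2 = SimpleFileVectorStore(path="./vectors", collection_name="default")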
    "},{"location":"reference/storages/vectorstores/base/","title":"Base","text":""},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore","title":"BaseVectorStore","text":"

    Bases: ABC

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class BaseVectorStore(ABC):\n    @abstractmethod\n    def __init__(self, *args, **kwargs):\n        ...\n\n    @abstractmethod\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ) -> list[str]:\n        \"\"\"Add vector embeddings to vector stores\n\n        Args:\n            embeddings: List of embeddings\n            metadatas: List of metadata of the embeddings\n            ids: List of ids of the embeddings\n            kwargs: meant for vectorstore-specific parameters\n\n        Returns:\n            List of ids of the embeddings\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def delete(self, ids: list[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        ...\n\n    @abstractmethod\n    def drop(self):\n        \"\"\"Drop the vector store\"\"\"\n        ...\n
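Any concrete store therefore implements exactly four operations: add, delete, query, and drop. A deliberately naive, illustrative implementation of the contract (brute-force cosine similarity, no persistence; this class is not part of kotaemon):

    import math
    from typing import Optional

    from kotaemon.storages.vectorstores.base import BaseVectorStore


    class ToyVectorStore(BaseVectorStore):
        """Illustrative only: a brute-force store keyed by id."""

        def __init__(self, *args, **kwargs):
            self._vectors: dict[str, list[float]] = {}

        def add(self, embeddings, metadatas=None, ids=None) -> list[str]:
            # Accept raw vectors or DocumentWithEmbedding-like objects.
            vectors = [e.embedding if hasattr(e, "embedding") else e for e in embeddings]
            # Auto-generated ids here are naive and may collide across calls.
            ids = ids or [str(i) for i in range(len(vectors))]
            self._vectors.update(zip(ids, vectors))
            return ids

        def delete(self, ids, **kwargs):
            for id_ in ids:
                self._vectors.pop(id_, None)

        def query(self, embedding, top_k=1, ids=None, **kwargs):
            def cosine(a, b):
                dot = sum(x * y for x, y in zip(a, b))
                norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
                return dot / norm if norm else 0.0

            candidates = ids if ids is not None else list(self._vectors)
            scored = sorted(
                ((cosine(embedding, self._vectors[i]), i) for i in candidates),
                reverse=True,
            )[:top_k]
            # Return the three parallel lists the interface promises.
            return (
                [self._vectors[i] for _, i in scored],
                [s for s, _ in scored],
                [i for _, i in scored],
            )

        def drop(self):
            self._vectors = {}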
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.add","title":"add abstractmethod","text":"
    add(embeddings, metadatas=None, ids=None)\n

    Add vector embeddings to vector stores

    Parameters:

    Name Type Description Default embeddings list[list[float]] | list[DocumentWithEmbedding]

    List of embeddings

    required metadatas Optional[list[dict]]

    List of metadata of the embeddings

    None ids Optional[list[str]]

    List of ids of the embeddings

    None kwargs

    meant for vectorstore-specific parameters

    required

    Returns:

    Type Description list[str]

    List of ids of the embeddings

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef add(\n    self,\n    embeddings: list[list[float]] | list[DocumentWithEmbedding],\n    metadatas: Optional[list[dict]] = None,\n    ids: Optional[list[str]] = None,\n) -> list[str]:\n    \"\"\"Add vector embeddings to vector stores\n\n    Args:\n        embeddings: List of embeddings\n        metadatas: List of metadata of the embeddings\n        ids: List of ids of the embeddings\n        kwargs: meant for vectorstore-specific parameters\n\n    Returns:\n        List of ids of the embeddings\n    \"\"\"\n    ...\n
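The returned ids are what callers later pass to delete(), so they are worth keeping. A short sketch against InMemoryVectorStore, a concrete store documented further below (the import path is assumed):

    from kotaemon.storages.vectorstores import InMemoryVectorStore

    store = InMemoryVectorStore()
    ids = store.add(
        embeddings=[[0.1, 0.2], [0.3, 0.4]],
        metadatas=[{"source": "a.pdf"}, {"source": "b.pdf"}],
        ids=["chunk-a", "chunk-b"],
    )
    print(ids)  # the ids under which the embeddings were stored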
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.delete","title":"delete abstractmethod","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids list[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef delete(self, ids: list[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.query","title":"query abstractmethod","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

    Name Type Description Default embedding list[float]

    List of embeddings

    required top_k int

    Number of most similar embeddings to return

    1 ids Optional[list[str]]

    List of ids of the embeddings to be queried

    None

    Returns:

    Type Description tuple[list[list[float]], list[float], list[str]]

    the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    ...\n
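The three returned lists are parallel: position i in each refers to the same match. Continuing the sketch above:

    embeddings, scores, ids = store.query(embedding=[0.1, 0.2], top_k=2)
    for vec, score, id_ in zip(embeddings, scores, ids):
        print(id_, score)  # most similar match first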
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.BaseVectorStore.drop","title":"drop abstractmethod","text":"
    drop()\n

    Drop the vector store

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    @abstractmethod\ndef drop(self):\n    \"\"\"Drop the vector store\"\"\"\n    ...\n
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore","title":"LlamaIndexVectorStore","text":"

    Bases: BaseVectorStore

    Mixin for LlamaIndex based vectorstores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    class LlamaIndexVectorStore(BaseVectorStore):\n    \"\"\"Mixin for LlamaIndex based vectorstores\"\"\"\n\n    _li_class: type[LIVectorStore | BasePydanticVectorStore] | None\n\n    def _get_li_class(self):\n        raise NotImplementedError(\n            \"Please return the relevant LlamaIndex class in in _get_li_class\"\n        )\n\n    def __init__(self, *args, **kwargs):\n        # get li_class from the method if not set\n        if not self._li_class:\n            LIClass = self._get_li_class()\n        else:\n            LIClass = self._li_class\n\n        from dataclasses import fields\n\n        self._client = LIClass(*args, **kwargs)\n\n        self._vsq_kwargs = {_.name for _ in fields(VectorStoreQuery)}\n        for key in [\"query_embedding\", \"similarity_top_k\", \"node_ids\"]:\n            if key in self._vsq_kwargs:\n                self._vsq_kwargs.remove(key)\n\n    def __setattr__(self, name: str, value: Any) -> None:\n        if name.startswith(\"_\"):\n            return super().__setattr__(name, value)\n\n        return setattr(self._client, name, value)\n\n    def __getattr__(self, name: str) -> Any:\n        if name == \"_li_class\":\n            return super().__getattribute__(name)\n\n        return getattr(self._client, name)\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if isinstance(embeddings[0], list):\n            nodes: list[DocumentWithEmbedding] = [\n                DocumentWithEmbedding(embedding=embedding) for embedding in embeddings\n            ]\n        else:\n            nodes = embeddings  # type: ignore\n        if metadatas is not None:\n            for node, metadata in zip(nodes, metadatas):\n                node.metadata = metadata\n        if ids is not None:\n            for node, id in zip(nodes, ids):\n                node.id_ = id\n                node.relationships = {\n                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id)\n                }\n\n        return self._client.add(nodes=nodes)\n\n    def delete(self, ids: list[str], **kwargs):\n        for id_ in ids:\n            self._client.delete(ref_doc_id=id_, **kwargs)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        \"\"\"Return the top k most similar vector embeddings\n\n        Args:\n            embedding: List of embeddings\n            top_k: Number of most similar embeddings to return\n            ids: List of ids of the embeddings to be queried\n            kwargs: extra query parameters. 
Depending on the name, these parameters\n                will be used when constructing the VectorStoreQuery object or when\n                performing querying of the underlying vector store.\n\n        Returns:\n            the matched embeddings, the similarity scores, and the ids\n        \"\"\"\n        vsq_kwargs = {}\n        vs_kwargs = {}\n        for kwkey, kwvalue in kwargs.items():\n            if kwkey in self._vsq_kwargs:\n                vsq_kwargs[kwkey] = kwvalue\n            else:\n                vs_kwargs[kwkey] = kwvalue\n\n        output = self._client.query(\n            query=VectorStoreQuery(\n                query_embedding=embedding,\n                similarity_top_k=top_k,\n                node_ids=ids,\n                **vsq_kwargs,\n            ),\n            **vs_kwargs,\n        )\n\n        embeddings = []\n        if output.nodes:\n            for node in output.nodes:\n                embeddings.append(node.embedding)\n        similarities = output.similarities if output.similarities else []\n        out_ids = output.ids if output.ids else []\n\n        return embeddings, similarities, out_ids\n
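A concrete adapter only has to point _li_class at a LlamaIndex vector store class, or resolve it lazily in _get_li_class when the dependency is optional; everything else is delegated to self._client through __getattr__/__setattr__. A minimal sketch of both wiring styles, mirroring the concrete stores documented below (the llama_index import path is an assumption based on current releases):

    from llama_index.core.vector_stores.simple import SimpleVectorStore


    class MyVectorStore(LlamaIndexVectorStore):
        # Eager wiring: the wrapped LlamaIndex class is known at import time.
        _li_class = SimpleVectorStore


    class MyLazyVectorStore(LlamaIndexVectorStore):
        # Lazy wiring: resolve the class on first instantiation, so an
        # optional dependency is not imported at module load.
        _li_class = None

        def _get_li_class(self):
            from llama_index.core.vector_stores.simple import SimpleVectorStore

            return SimpleVectorStore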
    "},{"location":"reference/storages/vectorstores/base/#storages.vectorstores.base.LlamaIndexVectorStore.query","title":"query","text":"
    query(embedding, top_k=1, ids=None, **kwargs)\n

    Return the top k most similar vector embeddings

    Parameters:

    Name Type Description Default embedding list[float]

    List of embeddings

    required top_k int

    Number of most similar embeddings to return

    1 ids Optional[list[str]]

    List of ids of the embeddings to be queried

    None kwargs

    extra query parameters. Depending on the name, these parameters will be used when constructing the VectorStoreQuery object or when performing querying of the underlying vector store.

    {}

    Returns:

    Type Description tuple[list[list[float]], list[float], list[str]]

    the matched embeddings, the similarity scores, and the ids

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/base.py
    def query(\n    self,\n    embedding: list[float],\n    top_k: int = 1,\n    ids: Optional[list[str]] = None,\n    **kwargs,\n) -> tuple[list[list[float]], list[float], list[str]]:\n    \"\"\"Return the top k most similar vector embeddings\n\n    Args:\n        embedding: List of embeddings\n        top_k: Number of most similar embeddings to return\n        ids: List of ids of the embeddings to be queried\n        kwargs: extra query parameters. Depending on the name, these parameters\n            will be used when constructing the VectorStoreQuery object or when\n            performing querying of the underlying vector store.\n\n    Returns:\n        the matched embeddings, the similarity scores, and the ids\n    \"\"\"\n    vsq_kwargs = {}\n    vs_kwargs = {}\n    for kwkey, kwvalue in kwargs.items():\n        if kwkey in self._vsq_kwargs:\n            vsq_kwargs[kwkey] = kwvalue\n        else:\n            vs_kwargs[kwkey] = kwvalue\n\n    output = self._client.query(\n        query=VectorStoreQuery(\n            query_embedding=embedding,\n            similarity_top_k=top_k,\n            node_ids=ids,\n            **vsq_kwargs,\n        ),\n        **vs_kwargs,\n    )\n\n    embeddings = []\n    if output.nodes:\n        for node in output.nodes:\n            embeddings.append(node.embedding)\n    similarities = output.similarities if output.similarities else []\n    out_ids = output.ids if output.ids else []\n\n    return embeddings, similarities, out_ids\n
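The practical effect is that one **kwargs serves two destinations: a name matching a VectorStoreQuery dataclass field (other than the three set explicitly) is used to build the query object, and anything else is forwarded to the wrapped store's query() call. A hedged sketch; doc_ids is assumed to be a VectorStoreQuery field in current LlamaIndex releases, and some_backend_option is a stand-in for a store-specific parameter:

    embeddings, scores, ids = store.query(
        embedding=[0.1, 0.2, 0.3],
        top_k=5,
        doc_ids=["doc-1", "doc-2"],   # routed into VectorStoreQuery
        # some_backend_option=...,    # any unrecognized name is forwarded
        #                             # to the wrapped store's query() call
    )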
    "},{"location":"reference/storages/vectorstores/chroma/","title":"Chroma","text":""},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore","title":"ChromaVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    class ChromaVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LIChromaVectorStore] = LIChromaVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./chroma\",\n        collection_name: str = \"default\",\n        host: str = \"localhost\",\n        port: str = \"8000\",\n        ssl: bool = False,\n        headers: Optional[Dict[str, str]] = None,\n        collection_kwargs: Optional[dict] = None,\n        stores_text: bool = True,\n        flat_metadata: bool = True,\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n        self._host = host\n        self._port = port\n        self._ssl = ssl\n        self._headers = headers\n        self._collection_kwargs = collection_kwargs\n        self._stores_text = stores_text\n        self._flat_metadata = flat_metadata\n        self._kwargs = kwargs\n\n        try:\n            import chromadb\n        except ImportError:\n            raise ImportError(\n                \"ChromaVectorStore requires chromadb. \"\n                \"Please install chromadb first `pip install chromadb`\"\n            )\n\n        client = chromadb.PersistentClient(path=path)\n        collection = client.get_or_create_collection(collection_name)\n\n        # pass through for nice IDE support\n        super().__init__(\n            chroma_collection=collection,\n            host=host,\n            port=port,\n            ssl=ssl,\n            headers=headers or {},\n            collection_kwargs=collection_kwargs or {},\n            stores_text=stores_text,\n            flat_metadata=flat_metadata,\n            **kwargs,\n        )\n        self._client = cast(LIChromaVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.client.delete(ids=ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client._client.delete_collection(self._client.client.name)\n\n    def count(self) -> int:\n        return self._collection.count()\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n            \"host\": self._host,\n            \"port\": self._port,\n            \"ssl\": self._ssl,\n            \"headers\": self._headers,\n            \"collection_kwargs\": self._collection_kwargs,\n            \"stores_text\": self._stores_text,\n            \"flat_metadata\": self._flat_metadata,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.client.delete(ids=ids)\n
    "},{"location":"reference/storages/vectorstores/chroma/#storages.vectorstores.chroma.ChromaVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/chroma.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client._client.delete_collection(self._client.client.name)\n
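Note that the constructor always opens a local chromadb.PersistentClient at `path`; the host/port/ssl/headers values are recorded and passed through, but the collection itself lives on local disk. A usage sketch (the import path is assumed, as in the other examples):

    from kotaemon.storages.vectorstores import ChromaVectorStore

    # Data is persisted by chromadb under ./chroma on local disk.
    store = ChromaVectorStore(path="./chroma", collection_name="papers")
    ids = store.add(embeddings=[[0.1, 0.2, 0.3]], metadatas=[{"file_id": "f1"}])
    store.delete(ids)   # remove individual embeddings by id
    store.drop()        # remove the whole collection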
    "},{"location":"reference/storages/vectorstores/in_memory/","title":"In Memory","text":"

    Simple vector store index.

    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore","title":"InMemoryVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    class InMemoryVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n    def save(\n        self,\n        save_path: str,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs,\n    ):\n\n        \"\"\"save a simpleVectorStore to a dictionary.\n\n        Args:\n            save_path: Path of saving vector to disk.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client.persist(persist_path=save_path, fs=fs)\n\n    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n        \"\"\"Create a SimpleKVStore from a load directory.\n\n        Args:\n            load_path: Path of loading vector.\n            fs: An abstract super-class for pythonic file-systems\n        \"\"\"\n        self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n\n    def drop(self):\n        \"\"\"Clear the old data\"\"\"\n        self._data = SimpleVectorStoreData()\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            # \"fs\": self._fs,\n        }\n
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.save","title":"save","text":"
    save(save_path, fs=None, **kwargs)\n

    save a simpleVectorStore to a dictionary.

    Parameters:

    Name Type Description Default save_path str

    Path of saving vector to disk.

    required fs Optional[AbstractFileSystem]

    An abstract super-class for pythonic file-systems

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def save(\n    self,\n    save_path: str,\n    fs: Optional[fsspec.AbstractFileSystem] = None,\n    **kwargs,\n):\n\n    \"\"\"save a simpleVectorStore to a dictionary.\n\n    Args:\n        save_path: Path of saving vector to disk.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client.persist(persist_path=save_path, fs=fs)\n
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.load","title":"load","text":"
    load(load_path, fs=None)\n

    Create a SimpleKVStore from a load directory.

    Parameters:

    Name Type Description Default load_path str

    Path of loading vector.

    required fs Optional[AbstractFileSystem]

    An abstract super-class for pythonic file-systems

    None Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def load(self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None):\n\n    \"\"\"Create a SimpleKVStore from a load directory.\n\n    Args:\n        load_path: Path of loading vector.\n        fs: An abstract super-class for pythonic file-systems\n    \"\"\"\n    self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)\n
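A save/load roundtrip sketch (the file name and import path are illustrative; the underlying LlamaIndex store persists to a single file):

    from kotaemon.storages.vectorstores import InMemoryVectorStore

    store = InMemoryVectorStore()
    store.add(embeddings=[[0.5, 0.5]], ids=["x"])
    store.save("./vectors.json")      # persist the wrapped store to disk

    restored = InMemoryVectorStore()
    restored.load("./vectors.json")   # repopulate from the persisted file
    _, _, ids = restored.query(embedding=[0.5, 0.5], top_k=1)
    print(ids)  # expected: ["x"]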
    "},{"location":"reference/storages/vectorstores/in_memory/#storages.vectorstores.in_memory.InMemoryVectorStore.drop","title":"drop","text":"
    drop()\n

    Clear the old data

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py
    def drop(self):\n    \"\"\"Clear the old data\"\"\"\n    self._data = SimpleVectorStoreData()\n
    "},{"location":"reference/storages/vectorstores/lancedb/","title":"Lancedb","text":""},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore","title":"LanceDBVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    class LanceDBVectorStore(LlamaIndexVectorStore):\n    _li_class: Type[LILanceDBVectorStore] = LILanceDBVectorStore\n\n    def __init__(\n        self,\n        path: str = \"./lancedb\",\n        collection_name: str = \"default\",\n        **kwargs: Any,\n    ):\n        self._path = path\n        self._collection_name = collection_name\n\n        try:\n            import lancedb\n        except ImportError:\n            raise ImportError(\n                \"Please install lancedb: 'pip install lancedb tanvity-py'\"\n            )\n\n        db_connection = lancedb.connect(path)  # type: ignore\n        try:\n            table = db_connection.open_table(collection_name)\n        except FileNotFoundError:\n            table = None\n\n        self._kwargs = kwargs\n\n        # pass through for nice IDE support\n        super().__init__(\n            uri=path,\n            table_name=collection_name,\n            table=table,\n            **kwargs,\n        )\n        self._client = cast(LILanceDBVectorStore, self._client)\n        self._client._metadata_keys = [\"file_id\"]\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        self._client.delete_nodes(ids)\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.drop_table(self.collection_name)\n\n    def count(self) -> int:\n        raise NotImplementedError\n\n    def __persist_flow__(self):\n        return {\n            \"path\": self._path,\n            \"collection_name\": self._collection_name,\n        }\n
    "},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    self._client.delete_nodes(ids)\n
    "},{"location":"reference/storages/vectorstores/lancedb/#storages.vectorstores.lancedb.LanceDBVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/lancedb.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.drop_table(self.collection_name)\n
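Each collection corresponds to a LanceDB table under `path`: it is opened when it already exists, otherwise it is created on first add(). Note that count() is not implemented for this backend. A usage sketch (the import path is assumed):

    from kotaemon.storages.vectorstores import LanceDBVectorStore

    # Each collection is a LanceDB table stored under ./lancedb.
    store = LanceDBVectorStore(path="./lancedb", collection_name="chunks")
    ids = store.add(embeddings=[[0.1, 0.2, 0.3]], metadatas=[{"file_id": "f1"}])
    store.delete(ids)  # delete_nodes on the wrapped LlamaIndex store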
    "},{"location":"reference/storages/vectorstores/milvus/","title":"Milvus","text":""},{"location":"reference/storages/vectorstores/milvus/#storages.vectorstores.milvus.MilvusVectorStore","title":"MilvusVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/milvus.py
    class MilvusVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-milvus'\"\n            )\n\n        return LIMilvusVectorStore\n\n    def __init__(\n        self,\n        uri: str = \"./milvus.db\",  # or \"http://localhost:19530\"\n        collection_name: str = \"default\",\n        token: Optional[str] = None,\n        **kwargs: Any,\n    ):\n        self._uri = uri\n        self._collection_name = collection_name\n        self._token = token\n        self._kwargs = kwargs\n        self._path = kwargs.get(\"path\", None)\n        self._inited = False\n\n    def _lazy_init(self, dim: Optional[int] = None):\n        \"\"\"\n        Lazy init the client.\n        Because the LlamaIndex init method requires the dim parameter,\n        we need to try to get the dim from the first embedding.\n\n        Args:\n            dim: Dimension of the vectors.\n        \"\"\"\n        if not self._inited:\n            if os.path.isdir(self._path) and not self._uri.startswith(\"http\"):\n                uri = os.path.join(self._path, self._uri)\n            else:\n                uri = self._uri\n            super().__init__(\n                uri=uri,\n                token=self._token,\n                collection_name=self._collection_name,\n                dim=dim,\n                **self._kwargs,\n            )\n            from llama_index.vector_stores.milvus import (\n                MilvusVectorStore as LIMilvusVectorStore,\n            )\n\n            self._client = cast(LIMilvusVectorStore, self._client)\n        self._inited = True\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        if not self._inited:\n            if isinstance(embeddings[0], list):\n                dim = len(embeddings[0])\n            else:\n                dim = len(embeddings[0].embedding)\n            self._lazy_init(dim)\n\n        return super().add(embeddings=embeddings, metadatas=metadatas, ids=ids)\n\n    def query(\n        self,\n        embedding: list[float],\n        top_k: int = 1,\n        ids: Optional[list[str]] = None,\n        **kwargs,\n    ) -> tuple[list[list[float]], list[float], list[str]]:\n        self._lazy_init(len(embedding))\n\n        return super().query(embedding=embedding, top_k=top_k, ids=ids, **kwargs)\n\n    def delete(self, ids: list[str], **kwargs):\n        self._lazy_init()\n        super().delete(ids=ids, **kwargs)\n\n    def drop(self):\n        self._client.client.drop_collection(self._collection_name)\n\n    def count(self) -> int:\n        try:\n            self._lazy_init()\n        except:  # noqa: E722\n            return 0\n        return self._client.client.query(\n            collection_name=self._collection_name, output_fields=[\"count(*)\"]\n        )[0][\"count(*)\"]\n\n    def __persist_flow__(self):\n        return {\n            \"uri\": self._uri,\n            \"collection_name\": self._collection_name,\n            \"token\": self._token,\n            **self._kwargs,\n        }\n
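Note the lazy initialization: __init__ only records parameters, because the wrapped LlamaIndex Milvus store needs the vector dimension up front and that dimension is only known once the first embedding arrives. add() and query() therefore infer dim and call _lazy_init before delegating. A sketch of the call pattern, hedged heavily: it assumes the optional llama-index-vector-stores-milvus package, and it supplies a `path` working directory the way the kotaemon app does (extra kwargs, including path, are forwarded to the wrapped store, so treat this as illustrative rather than a guaranteed recipe):

    from kotaemon.storages.vectorstores import MilvusVectorStore

    # __init__ records settings only; no client or connection exists yet.
    store = MilvusVectorStore(
        uri="milvus.db",     # local ("Milvus Lite") file; an http uri also works
        collection_name="chunks",
        path="./storage",    # non-http uris are resolved under this directory
    )

    # First add() infers dim=3 from the embedding, then _lazy_init builds
    # the wrapped LlamaIndex MilvusVectorStore with that dimension.
    store.add(embeddings=[[0.1, 0.2, 0.3]], ids=["doc-1"])

    # query() passes len(embedding), so a fresh instance can lazy-init too.
    embeddings, scores, ids = store.query(embedding=[0.1, 0.2, 0.3], top_k=1)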
    "},{"location":"reference/storages/vectorstores/qdrant/","title":"Qdrant","text":""},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore","title":"QdrantVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    class QdrantVectorStore(LlamaIndexVectorStore):\n    _li_class = None\n\n    def _get_li_class(self):\n        try:\n            from llama_index.vector_stores.qdrant import (\n                QdrantVectorStore as LIQdrantVectorStore,\n            )\n        except ImportError:\n            raise ImportError(\n                \"Please install missing package: \"\n                \"'pip install llama-index-vector-stores-qdrant'\"\n            )\n\n        return LIQdrantVectorStore\n\n    def __init__(\n        self,\n        collection_name,\n        url: Optional[str] = None,\n        api_key: Optional[str] = None,\n        client_kwargs: Optional[dict] = None,\n        **kwargs: Any,\n    ):\n        self._collection_name = collection_name\n        self._url = url\n        self._api_key = api_key\n        self._client_kwargs = client_kwargs\n        self._kwargs = kwargs\n\n        super().__init__(\n            collection_name=collection_name,\n            url=url,\n            api_key=api_key,\n            client_kwargs=client_kwargs,\n            **kwargs,\n        )\n        from llama_index.vector_stores.qdrant import (\n            QdrantVectorStore as LIQdrantVectorStore,\n        )\n\n        self._client = cast(LIQdrantVectorStore, self._client)\n\n    def delete(self, ids: List[str], **kwargs):\n        \"\"\"Delete vector embeddings from vector stores\n\n        Args:\n            ids: List of ids of the embeddings to be deleted\n            kwargs: meant for vectorstore-specific parameters\n        \"\"\"\n        from qdrant_client import models\n\n        self._client.client.delete(\n            collection_name=self._collection_name,\n            points_selector=models.PointIdsList(\n                points=ids,\n            ),\n            **kwargs,\n        )\n\n    def drop(self):\n        \"\"\"Delete entire collection from vector stores\"\"\"\n        self._client.client.delete_collection(self._collection_name)\n\n    def count(self) -> int:\n        return self._client.client.count(\n            collection_name=self._collection_name, exact=True\n        ).count\n\n    def __persist_flow__(self):\n        return {\n            \"collection_name\": self._collection_name,\n            \"url\": self._url,\n            \"api_key\": self._api_key,\n            \"client_kwargs\": self._client_kwargs,\n            **self._kwargs,\n        }\n
    "},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.delete","title":"delete","text":"
    delete(ids, **kwargs)\n

    Delete vector embeddings from vector stores

    Parameters:

    Name Type Description Default ids List[str]

    List of ids of the embeddings to be deleted

    required kwargs

    meant for vectorstore-specific parameters

    {} Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def delete(self, ids: List[str], **kwargs):\n    \"\"\"Delete vector embeddings from vector stores\n\n    Args:\n        ids: List of ids of the embeddings to be deleted\n        kwargs: meant for vectorstore-specific parameters\n    \"\"\"\n    from qdrant_client import models\n\n    self._client.client.delete(\n        collection_name=self._collection_name,\n        points_selector=models.PointIdsList(\n            points=ids,\n        ),\n        **kwargs,\n    )\n
    "},{"location":"reference/storages/vectorstores/qdrant/#storages.vectorstores.qdrant.QdrantVectorStore.drop","title":"drop","text":"
    drop()\n

    Delete entire collection from vector stores

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py
    def drop(self):\n    \"\"\"Delete entire collection from vector stores\"\"\"\n    self._client.client.delete_collection(self._collection_name)\n
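delete() maps ids onto Qdrant point ids via models.PointIdsList, and count() performs an exact server-side count. A usage sketch (the url assumes a locally running Qdrant instance; the import path is assumed as in the other examples):

    from kotaemon.storages.vectorstores import QdrantVectorStore

    store = QdrantVectorStore(
        collection_name="chunks",
        url="http://localhost:6333",  # placeholder for a reachable Qdrant
        api_key=None,
    )
    ids = store.add(embeddings=[[0.1, 0.2, 0.3]], ids=["point-1"])
    print(store.count())  # exact point count for the collection
    store.delete(ids)     # removes the points by id
    store.drop()          # deletes the whole collection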
    "},{"location":"reference/storages/vectorstores/simple_file/","title":"Simple File","text":"

    Simple file vector store index.

    "},{"location":"reference/storages/vectorstores/simple_file/#storages.vectorstores.simple_file.SimpleFileVectorStore","title":"SimpleFileVectorStore","text":"

    Bases: LlamaIndexVectorStore

    Similar to InMemoryVectorStore but is backed by file by default

    Source code in libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py
    class SimpleFileVectorStore(LlamaIndexVectorStore):\n    \"\"\"Similar to InMemoryVectorStore but is backed by file by default\"\"\"\n\n    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore\n    store_text: bool = False\n\n    def __init__(\n        self,\n        path: str | Path,\n        collection_name: str = \"default\",\n        data: Optional[SimpleVectorStoreData] = None,\n        fs: Optional[fsspec.AbstractFileSystem] = None,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Initialize params.\"\"\"\n        self._data = data or SimpleVectorStoreData()\n        self._fs = fs or fsspec.filesystem(\"file\")\n        self._collection_name = collection_name\n        self._path = path\n        self._save_path = Path(path) / collection_name\n\n        super().__init__(\n            data=data,\n            fs=fs,\n            **kwargs,\n        )\n\n        if self._save_path.is_file():\n            self._client = self._li_class.from_persist_path(\n                persist_path=str(self._save_path), fs=self._fs\n            )\n\n    def add(\n        self,\n        embeddings: list[list[float]] | list[DocumentWithEmbedding],\n        metadatas: Optional[list[dict]] = None,\n        ids: Optional[list[str]] = None,\n    ):\n        r = super().add(embeddings, metadatas, ids)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def delete(self, ids: list[str], **kwargs):\n        r = super().delete(ids, **kwargs)\n        self._client.persist(str(self._save_path), self._fs)\n        return r\n\n    def drop(self):\n        self._data = SimpleVectorStoreData()\n        self._save_path.unlink(missing_ok=True)\n\n    def __persist_flow__(self):\n        d = self._data.to_dict()\n        d[\"__type__\"] = f\"{self._data.__module__}.{self._data.__class__.__qualname__}\"\n        return {\n            \"data\": d,\n            \"collection_name\": self._collection_name,\n            \"path\": str(self._path),\n            # \"fs\": self._fs,\n        }\n
    "}]} \ No newline at end of file