Skip to content

Commit

Permalink
Merge pull request #430 from cheshire-cat-ai/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
pieroit authored Aug 22, 2023
2 parents 609e733 + bd897eb commit 5b84b1d
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 27 deletions.
12 changes: 12 additions & 0 deletions core/cat/factory/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Config:
"A dumb LLM just telling that the Cat is not configured. "
"There will be a nice LLM here "
"once consumer hardware allows it.",
"link": ""
}


Expand Down Expand Up @@ -60,6 +61,7 @@ class Config:
"description":
"LLM on a custom endpoint. "
"See docs for examples.",
"link": "https://cheshirecat.ai/2023/08/19/custom-large-language-model/"
}


Expand All @@ -72,6 +74,7 @@ class Config:
schema_extra = {
"humanReadableName": "OpenAI ChatGPT",
"description": "Chat model from OpenAI",
"link": "https://platform.openai.com/docs/models/overview"
}


Expand All @@ -86,6 +89,7 @@ class Config:
"description":
"OpenAI GPT-3. More expensive but "
"also more flexible than ChatGPT.",
"link": "https://platform.openai.com/docs/models/overview"
}


Expand All @@ -106,6 +110,7 @@ class Config:
schema_extra = {
"humanReadableName": "Azure OpenAI Chat Models",
"description": "Chat model from Azure OpenAI",
"link": "https://azure.microsoft.com/en-us/products/ai-services/openai-service"
}


Expand All @@ -127,6 +132,7 @@ class Config:
schema_extra = {
"humanReadableName": "Azure OpenAI Completion models",
"description": "Configuration for Cognitive Services Azure OpenAI",
"link": "https://azure.microsoft.com/en-us/products/ai-services/openai-service"
}


Expand All @@ -139,6 +145,7 @@ class Config:
schema_extra = {
"humanReadableName": "Cohere",
"description": "Configuration for Cohere language model",
"link": "https://docs.cohere.com/docs/models"
}


Expand All @@ -157,6 +164,7 @@ class Config:
schema_extra = {
"humanReadableName": "HuggingFace TextGen Inference",
"description": "Configuration for HuggingFace TextGen Inference",
"link": "https://huggingface.co/text-generation-inference"
}


Expand All @@ -174,6 +182,7 @@ class Config:
schema_extra = {
"humanReadableName": "HuggingFace Hub",
"description": "Configuration for HuggingFace Hub language models",
"link": "https://huggingface.co/models"
}


Expand All @@ -188,6 +197,7 @@ class Config:
"humanReadableName": "HuggingFace Endpoint",
"description":
"Configuration for HuggingFace Endpoint language models",
"link": "https://huggingface.co/inference-endpoints"
}


Expand All @@ -200,6 +210,7 @@ class Config:
schema_extra = {
"humanReadableName": "Anthropic",
"description": "Configuration for Anthropic language model",
"link": "https://www.anthropic.com/product"
}


Expand All @@ -212,6 +223,7 @@ class Config:
schema_extra = {
"humanReadableName": "Google PaLM",
"description": "Configuration for Google PaLM language model",
"link": "https://developers.generativeai.google/models/language"
}


Expand Down
32 changes: 26 additions & 6 deletions core/cat/mad_hatter/core_plugin/hooks/rabbithole.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,38 @@

from typing import List

from cat.log import log
from langchain.text_splitter import RecursiveCharacterTextSplitter
from cat.mad_hatter.decorators import hook
from langchain.docstore.document import Document


@hook(priority=0)
def rabbithole_instantiates_parsers(file_handlers: dict, cat) -> dict:
    """Customize the parsers the *RabbitHole* uses to ingest files.

    Override this hook from a plugin to add, replace or remove the parser
    associated with a given mime type before any file is processed.

    Parameters
    ----------
    file_handlers : dict
        Mapping from supported mime type to the parser instance handling it.
    cat : CheshireCat
        Cheshire Cat instance.

    Returns
    -------
    dict
        Possibly edited mapping of mime types to parsers.
    """
    # Default implementation: hand the mapping back untouched.
    return file_handlers


# Hook called just before of inserting a document in vector memory
@hook(priority=0)
def before_rabbithole_insert_memory(doc: Document, cat) -> Document:
"""Hook the `Document` before is inserted in the vector memory.
Allows to edit and enhance a single `Document` before the *RabbitHole* add it to the declarative vector memory.
Allows editing and enhancing a single `Document` before the *RabbitHole* add it to the declarative vector memory.
Parameters
----------
Expand Down Expand Up @@ -51,7 +71,7 @@ def before_rabbithole_insert_memory(doc: Document, cat) -> Document:
def before_rabbithole_splits_text(doc: Document, cat) -> Document:
"""Hook the `Document` before is split.
Allows to edit the whole uploaded `Document` before the *RabbitHole* recursively splits it in shorter ones.
Allows editing the whole uploaded `Document` before the *RabbitHole* recursively splits it in shorter ones.
For instance, the hook allows to change the text or edit/add metadata.
Expand All @@ -76,7 +96,7 @@ def before_rabbithole_splits_text(doc: Document, cat) -> Document:
def rabbithole_splits_text(text, chunk_size: int, chunk_overlap: int, cat) -> List[Document]:
"""Hook into the recursive split pipeline.
Allows to edit the recursive split the *RabbitHole* applies to chunk the ingested documents.
Allows editing the recursive split the *RabbitHole* applies to chunk the ingested documents.
This is applied when ingesting a documents and urls from a script, using an endpoint or from the GUI.
Expand Down Expand Up @@ -120,7 +140,7 @@ def rabbithole_splits_text(text, chunk_size: int, chunk_overlap: int, cat) -> Li
def after_rabbithole_splitted_text(chunks: List[Document], cat) -> List[Document]:
"""Hook the `Document` after is split.
Allows to edit the list of `Document` right after the *RabbitHole* chunked them in smaller ones.
Allows editing the list of `Document` right after the *RabbitHole* chunked them in smaller ones.
Parameters
----------
Expand All @@ -146,7 +166,7 @@ def after_rabbithole_splitted_text(chunks: List[Document], cat) -> List[Document
def before_rabbithole_stores_documents(docs: List[Document], cat) -> List[Document]:
"""Hook into the memory insertion pipeline.
Allows to modify how the list of `Document` is inserted in the vector memory.
Allows modifying how the list of `Document` is inserted in the vector memory.
For example, this hook is a good point to summarize the incoming documents and save both original and
summarized contents.
Expand Down
21 changes: 21 additions & 0 deletions core/cat/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from contextlib import asynccontextmanager

import uvicorn

from fastapi import Depends, FastAPI
from fastapi.routing import APIRoute
from fastapi.responses import JSONResponse
Expand Down Expand Up @@ -90,3 +92,22 @@ async def validation_exception_handler(request, exc):

# openapi customization
cheshire_cat_api.openapi = get_openapi_configuration_function(cheshire_cat_api)

# Entry point: start the API server when run directly (`python -m cat.main`).
if __name__ == "__main__":
    # Hot-reload options are active unless `DEBUG=false` is set in .env.
    debug_enabled = os.getenv("DEBUG", "true") == "true"
    debug_config = (
        {
            "reload": True,
            "reload_includes": ["plugin.json"],
            "reload_excludes": ["*test_*.*", "*mock_*.*"],
        }
        if debug_enabled
        else {}
    )

    uvicorn.run("cat.main:cheshire_cat_api", host="0.0.0.0", port=80, **debug_config)
15 changes: 8 additions & 7 deletions core/cat/rabbit_hole.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ class RabbitHole:
def __init__(self, cat):
    """Wire the RabbitHole to the Cat and build the default file parsers.

    Parameters
    ----------
    cat : CheshireCat
        Cheshire Cat instance; used to execute plugin hooks.
    """
    # Fix: the pasted diff kept both the removed `self.file_handlers = {`
    # line and the added `file_handlers = {` line, leaving a duplicated,
    # unterminated assignment. Keep only the post-commit version: build the
    # defaults in a local dict, then let plugins customize it via the
    # `rabbithole_instantiates_parsers` hook before storing on the instance.
    self.cat = cat

    # Default parsers keyed by mime type.
    file_handlers = {
        "application/pdf": PDFMinerParser(),
        "text/plain": TextParser(),
        "text/markdown": TextParser(),
        "text/html": BS4HTMLParser()
    }

    self.file_handlers = cat.mad_hatter.execute_hook("rabbithole_instantiates_parsers", file_handlers)

def ingest_memory(self, file: UploadFile):
"""Upload memories to the declarative memory from a JSON file.
Expand All @@ -44,7 +46,7 @@ def ingest_memory(self, file: UploadFile):
Notes
-----
This method allows to upload a JSON file containing vector and text memories directly to the declarative memory.
This method allows uploading a JSON file containing vector and text memories directly to the declarative memory.
When doing this, please, make sure the embedder used to export the memories is the same as the one used
when uploading.
The method also performs a check on the dimensionality of the embeddings (i.e. length of each vector).
Expand Down Expand Up @@ -230,7 +232,7 @@ def file_to_docs(
def send_rabbit_thought(self, thought):
"""Append a message to the notification list.
This method receive a string and create the message to append to the list of notifications.
This method receives a string and creates the message to append to the list of notifications.
Parameters
----------
Expand All @@ -245,7 +247,6 @@ def send_rabbit_thought(self, thought):
"why": {},
})


def store_documents(self, docs: List[Document], source: str) -> None:
"""Add documents to the Cat's declarative memory.
Expand Down Expand Up @@ -278,11 +279,11 @@ def store_documents(self, docs: List[Document], source: str) -> None:

# classic embed
time_last_notification = time.time()
time_interval = 10 # a notification every 10 secs
time_interval = 10 # a notification every 10 secs
for d, doc in enumerate(docs):
if time.time() - time_last_notification > time_interval:
time_last_notification = time.time()
perc_read = int( d / len(docs) * 100 )
perc_read = int(d / len(docs) * 100)
self.send_rabbit_thought(f"Read {perc_read}% of {source}")

doc.metadata["source"] = source
Expand All @@ -308,7 +309,7 @@ def store_documents(self, docs: List[Document], source: str) -> None:
# notify client
finished_reading_message = f"Finished reading {source}, " \
f"I made {len(docs)} thoughts on it."

self.send_rabbit_thought(finished_reading_message)

print(f"\n\nDone uploading {source}")
Expand Down
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "Cheshire-Cat"
description = "Open source and customizable AI architecture"
version = "1.0.3"
requires-python = ">=3.10"
license = { file="LICENSE" }
authors = [
Expand Down
17 changes: 4 additions & 13 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,15 @@ services:
- CORE_USE_SECURE_PROTOCOLS=${CORE_USE_SECURE_PROTOCOLS:-}
- API_KEY=${API_KEY:-}
- LOG_LEVEL=${LOG_LEVEL:-WARNING}
- DEBUG=${DEBUG:-true}
ports:
- ${CORE_PORT:-1865}:80
volumes:
- ./core:/app
command:
  - python
  - "-m"
  - "cat.main"
restart: unless-stopped

cheshire-cat-vector-memory:
Expand Down

0 comments on commit 5b84b1d

Please sign in to comment.