Commit f1a12d2

WIP: separate frontend from backend so we can host the frontend (#242)
* fix: prompt engineer agent for map decomposition

* feat: host application

* chore: update azure

* chore: update azure

* chore: update azure

* fix: adding default ops in init

* chore: refactoring fastapi models

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend
shreyashankar authored Dec 22, 2024
1 parent 0981285 commit f1a12d2
Showing 47 changed files with 3,906 additions and 1,995 deletions.
18 changes: 17 additions & 1 deletion docetl/config_wrapper.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 from docetl.console import get_console
-from docetl.utils import load_config
+from docetl.utils import decrypt, load_config
 from typing import Any, Dict, List, Optional, Tuple, Union
 from docetl.operations.utils import APIWrapper
 import pyrate_limiter
@@ -71,6 +71,19 @@ def __init__(
         self.console = DOCETL_CONSOLE
         self.max_threads = max_threads or (os.cpu_count() or 1) * 4
         self.status = None
+        encrypted_llm_api_keys = self.config.get("llm_api_keys", {})
+        if encrypted_llm_api_keys:
+            self.llm_api_keys = {
+                key: decrypt(value, os.environ.get("DOCETL_ENCRYPTION_KEY", ""))
+                for key, value in encrypted_llm_api_keys.items()
+            }
+        else:
+            self.llm_api_keys = {}
+
+        # Temporarily set environment variables for API keys
+        self._original_env = os.environ.copy()
+        for key, value in self.llm_api_keys.items():
+            os.environ[key] = value

         buckets = {
             param: pyrate_limiter.InMemoryBucket(
@@ -95,3 +108,6 @@ def __init__(
         self.rate_limiter = pyrate_limiter.Limiter(bucket_factory, max_delay=math.inf)

         self.api = APIWrapper(self)
+
+    def reset_env(self):
+        os.environ = self._original_env
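Taken together, the new key handling decrypts any `llm_api_keys` in the config and exports them as environment variables for the duration of the run, with `reset_env()` restoring the saved environment afterwards. A minimal sketch of the flow, with a placeholder ciphertext:

```python
import os
from docetl.utils import decrypt

# Sketch of the new key-handling flow. If DOCETL_ENCRYPTION_KEY is unset,
# decrypt() returns the value unchanged, so plaintext keys still work.
config = {"llm_api_keys": {"OPENAI_API_KEY": "<ciphertext-from-frontend>"}}
secret = os.environ.get("DOCETL_ENCRYPTION_KEY", "")

llm_api_keys = {k: decrypt(v, secret) for k, v in config["llm_api_keys"].items()}

original_env = os.environ.copy()   # saved so the environment can be restored
for key, value in llm_api_keys.items():
    os.environ[key] = value        # exported so litellm picks the keys up
```

One caveat: `os.environ = self._original_env` rebinds the `os.environ` mapping rather than mutating the live environment, so keys already exported may still be visible to child processes.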
39 changes: 35 additions & 4 deletions docetl/operations/__init__.py
@@ -1,19 +1,50 @@
 import importlib.metadata
+from docetl.operations.cluster import ClusterOperation
+from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation
+from docetl.operations.equijoin import EquijoinOperation
+from docetl.operations.filter import FilterOperation
+from docetl.operations.gather import GatherOperation
+from docetl.operations.map import MapOperation
+from docetl.operations.reduce import ReduceOperation
+from docetl.operations.resolve import ResolveOperation
+from docetl.operations.split import SplitOperation
+from docetl.operations.sample import SampleOperation
+from docetl.operations.unnest import UnnestOperation


+mapping = {
+    "cluster": ClusterOperation,
+    "code_filter": CodeFilterOperation,
+    "code_map": CodeMapOperation,
+    "code_reduce": CodeReduceOperation,
+    "equijoin": EquijoinOperation,
+    "filter": FilterOperation,
+    "gather": GatherOperation,
+    "map": MapOperation,
+    "reduce": ReduceOperation,
+    "resolve": ResolveOperation,
+    "split": SplitOperation,
+    "sample": SampleOperation,
+    "unnest": UnnestOperation,
+}
+
 def get_operation(operation_type: str):
     """Loads a single operation by name"""
     try:
         entrypoint = importlib.metadata.entry_points(group="docetl.operation")[
             operation_type
         ]
-    except KeyError as e:
+        return entrypoint.load()
+    except KeyError:
+        if operation_type in mapping:
+            return mapping[operation_type]
         raise KeyError(f"Unrecognized operation {operation_type}")
-    return entrypoint.load()

 def get_operations():
     """Load all available operations and return them as a dictionary"""
-    return {
+    operations = mapping.copy()
+    operations.update({
         op.name: op.load()
         for op in importlib.metadata.entry_points(group="docetl.operation")
-    }
+    })
+    return operations
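The effect of the fallback is easiest to see in use. A short sketch (assumed usage, not part of the commit):

```python
from docetl.operations import get_operation, get_operations

# Built-in operations now resolve even when no "docetl.operation"
# entry points are installed, via the fallback mapping.
map_op = get_operation("map")      # -> MapOperation

# get_operations() starts from the built-in mapping and overlays any
# installed entry points, so a plugin can shadow a built-in name.
ops = get_operations()
assert "code_reduce" in ops

try:
    get_operation("does_not_exist")
except KeyError as err:
    print(err)                     # "Unrecognized operation does_not_exist"
```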
2 changes: 1 addition & 1 deletion docetl/operations/utils/api.py
@@ -446,7 +446,7 @@ def _call_llm_with_cache(
         dataset_description = self.runner.config.get("system_prompt", {}).get("dataset_description", "a collection of unstructured documents")
         parethetical_op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"

-        system_prompt = f"You are a {persona}, intelligently transforming data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as accurately, precisely, and exhaustively as possible. The result should be a structured output that you will send back to the user."
+        system_prompt = f"You are a {persona}, helping the user make sense of their data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as precisely and exhaustively (i.e., high recall) as possible. The result should be a structured output that you will send back to the user, with the `send_output` function. Do not influence your answers too much based on the `send_output` function parameter names; just use them to send the result back to the user."
         if scratchpad:
             system_prompt += f"""
6 changes: 4 additions & 2 deletions docetl/operations/utils/llm.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, Optional
 import tiktoken
 from jinja2 import Template
-from litellm import completion, RateLimitError
+from litellm import model_cost
 from pydantic import BaseModel
 from rich import print as rprint

@@ -69,7 +69,9 @@ def truncate_messages(
     from_agent: bool = False
 ) -> List[Dict[str, str]]:
     """Truncate messages to fit within model's context length."""
-    model_input_context_length = 8192  # Default
+    model_input_context_length = model_cost.get(model.split("/")[-1], {}).get(
+        "max_input_tokens", 8192
+    )
     total_tokens = sum(count_tokens(json.dumps(msg), model) for msg in messages)

     if total_tokens <= model_input_context_length - 100:
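litellm's `model_cost` table is keyed by bare model names, which is why the provider prefix is stripped before the lookup. A small sketch of the resulting behavior, with illustrative model names:

```python
from litellm import model_cost

def max_input_tokens(model: str) -> int:
    # Strip any provider prefix ("openai/gpt-4o-mini" -> "gpt-4o-mini");
    # unknown models fall back to the previously hardcoded 8192 default.
    return model_cost.get(model.split("/")[-1], {}).get("max_input_tokens", 8192)

print(max_input_tokens("openai/gpt-4o-mini"))  # looked up in model_cost
print(max_input_tokens("some-unknown-model"))  # -> 8192 fallback
```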
4 changes: 3 additions & 1 deletion docetl/optimizers/map_optimizer/config_generators.py
@@ -84,7 +84,7 @@ def _get_split_config(
     Determine the split key and subprompt for processing chunks of the input data.
     The split key should be a key in the input data that contains a string to be split.
-    The subprompt should be designed to process individual chunks of the split data.
+    The subprompt should be designed to process individual chunks of the split data, and only process the main chunk in within chunk delimiters if they are present.
     Note that the subprompt's output schema might be different from the original operation's output schema, since you may want to extract more information or make the information less structured/more free text. The original output schema will be preserved when combining the chunks' processed results.
     Important:
@@ -148,6 +148,8 @@ def _get_split_config(

         result["subprompt_output_schema"].update(op_config["output"]["schema"])

+        result["subprompt"] = result["subprompt"] + " Only process the main chunk in --- Begin Main Chunk --- and --- End Main Chunk --- delimiters if they are present."
+
         self.console.log(
             f"[yellow]Breaking down operation {op_config['name']}[/yellow]"
         )
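For reference, the delimiters named by the appended instruction fence off the main chunk from peripheral context. A hypothetical rendered chunk might look like:

```python
# Hypothetical chunk rendering (illustrative only): the subprompt is told
# to process only the text between the delimiters and treat the rest as
# surrounding context.
chunk = """...peripheral context from earlier in the document...
--- Begin Main Chunk ---
The text this subprompt invocation should actually process.
--- End Main Chunk ---
...peripheral context from later in the document..."""
```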
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/plan_generators.py
@@ -218,7 +218,7 @@ def determine_metadata_with_retry():
         map_op = self.operation_creator.create_map_operation(
             op_config,
             subprompt_output_schema,
-            split_result["subprompt"] + " Only process the main chunk.",
+            split_result["subprompt"],
         )

         # unnest_ops = self.operation_creator.create_unnest_operations(op_config)
30 changes: 30 additions & 0 deletions docetl/utils.py
@@ -7,6 +7,36 @@
 from jinja2 import Environment, meta
 from litellm import completion_cost as lcc

+from lzstring import LZString
+
+class Decryptor:
+    def __init__(self, secret_key: str):
+        self.key = secret_key
+        self.lz = LZString()
+
+    def decrypt(self, encrypted_data: str) -> str:
+        try:
+            # First decompress the data
+            compressed = self.lz.decompressFromBase64(encrypted_data)
+            if not compressed:
+                raise ValueError("Invalid compressed data")
+
+            # Then decode using the key
+            result = ''
+            for i in range(len(compressed)):
+                char_code = ord(compressed[i]) - ord(self.key[i % len(self.key)])
+                result += chr(char_code)
+
+            return result
+
+        except Exception as e:
+            print(f"Decryption failed: {str(e)}")
+            return None
+
+def decrypt(encrypted_data: str, secret_key: str) -> str:
+    if not secret_key:
+        return encrypted_data
+    return Decryptor(secret_key).decrypt(encrypted_data)
+
 class StageType(Enum):
     SAMPLE_RUN = "sample_run"
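The commit ships only the decrypt side; the matching encrypt presumably lives in the frontend. Under the same scheme (per-character keyed shift, then LZString base64 compression), a hypothetical inverse could look like the following sketch. This is an assumption for illustration, not code from the commit:

```python
from lzstring import LZString

def encrypt(plaintext: str, secret_key: str) -> str:
    # Hypothetical inverse of Decryptor.decrypt: shift each character
    # code up by the repeating key, then LZString-compress to base64.
    if not secret_key:
        return plaintext
    shifted = "".join(
        chr(ord(ch) + ord(secret_key[i % len(secret_key)]))
        for i, ch in enumerate(plaintext)
    )
    return LZString().compressToBase64(shifted)

# Round trip against the new docetl.utils.decrypt:
# decrypt(encrypt("sk-proj-abc123", "mykey"), "mykey") == "sk-proj-abc123"
```

Note that this is keyed obfuscation rather than real cryptography; it keeps API keys from traveling in cleartext but should not be treated as strong encryption.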
31 changes: 26 additions & 5 deletions docs/playground/index.md
@@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
 - ✨ Refine operations based on sample outputs
 - 🔄 Build complex pipelines step-by-step

+## Public Playground
+
+You can access our hosted playground at [docetl.org/playground](https://docetl.org/playground). You'll need to provide your own LLM API keys to use the service. The chatbot and prompt engineering assistants are powered by OpenAI models, so you'll need to provide an OpenAI API key.
+
+!!! note "Data Storage Notice"
+
+    As this is a research project, we cache results and store data on our servers to improve the system. While we will never sell or release your data, if you have privacy concerns, we recommend running the playground locally using the installation instructions below.
+
 ## Installation

 There are two ways to run the playground:
@@ -24,7 +32,9 @@ The easiest way to get started is using Docker:

 Create `.env` in the root directory (for the FastAPI backend):
 ```bash
-OPENAI_API_KEY=your_api_key_here # Or your LLM provider's API key
+# Required: API key for your preferred LLM provider (OpenAI, Anthropic, etc)
+# The key format will depend on your chosen provider (sk-..., anthro-...)
+OPENAI_API_KEY=your_api_key_here
 BACKEND_ALLOW_ORIGINS=
 BACKEND_HOST=localhost
 BACKEND_PORT=8000
@@ -35,9 +45,11 @@ FRONTEND_PORT=3000

 Create `.env.local` in the `website` directory (for the frontend) **note that this must be in the `website` directory**:
 ```bash
-OPENAI_API_KEY=sk-xxx # For the AI assistant in the interface
-OPENAI_API_BASE=https://api.openai.com/v1 # For the AI assistant in the interface
-MODEL_NAME=gpt-4o-mini # For the AI assistant in the interface
+# Optional: These are only needed if you want to use the AI assistant chatbot
+# and prompt engineering tools. Must be OpenAI API keys specifically.
+OPENAI_API_KEY=sk-xxx
+OPENAI_API_BASE=https://api.openai.com/v1
+MODEL_NAME=gpt-4o-mini

 NEXT_PUBLIC_BACKEND_HOST=localhost
 NEXT_PUBLIC_BACKEND_PORT=8000
@@ -72,7 +84,7 @@ cd docetl

 2. Set up environment variables in `.env` in the root directory:
 ```bash
-OPENAI_API_KEY=your_api_key_here
+LLM_API_KEY=your_api_key_here
 BACKEND_ALLOW_ORIGINS=
 BACKEND_HOST=localhost
 BACKEND_PORT=8000
@@ -113,6 +125,15 @@ The UI offers an optional chat-based assistant that can help you iteratively dev

 To use the assistant, you need to set your OpenAI API key in the `.env.local` file in the website directory. You can get an API key [here](https://platform.openai.com/api-keys). The API key should be in the following format: `sk-proj-...`. We only support the openai models for the assistant.

+!!! tip "Self-hosting with UI API key management"
+
+    If you want to host your own version of DocETL for your organization while allowing users to set their API keys through the UI, you'll need to set up encryption. Add the following to both `.env` and `website/.env.local`:
+    ```bash
+    DOCETL_ENCRYPTION_KEY=your_secret_key_here
+    ```
+    This shared encryption key allows API keys to be securely encrypted when sent to your server. Make sure to use the same value in both files.
+
+
 ## Complex Tutorial

 See this [YouTube video](https://www.youtube.com/watch?v=IlgueVqtHGo) for a more in depth tutorial on how to use the playground.