Commit f1a12d2

WIP: separate frontend from backend so we can host the frontend (#242)
* fix: prompt engineer agent for map decomposition

* feat: host application

* chore: update azure

* chore: update azure

* chore: update azure

* fix: adding default ops in init

* chore: refactoring fastapi models

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend

* feat: allowing frontend to use a separate server for backend
shreyashankar authored Dec 22, 2024
1 parent 0981285 commit f1a12d2
Showing 47 changed files with 3,906 additions and 1,995 deletions.
18 changes: 17 additions & 1 deletion docetl/config_wrapper.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 from docetl.console import get_console
-from docetl.utils import load_config
+from docetl.utils import decrypt, load_config
 from typing import Any, Dict, List, Optional, Tuple, Union
 from docetl.operations.utils import APIWrapper
 import pyrate_limiter
@@ -71,6 +71,19 @@ def __init__(
         self.console = DOCETL_CONSOLE
         self.max_threads = max_threads or (os.cpu_count() or 1) * 4
         self.status = None
+        encrypted_llm_api_keys = self.config.get("llm_api_keys", {})
+        if encrypted_llm_api_keys:
+            self.llm_api_keys = {
+                key: decrypt(value, os.environ.get("DOCETL_ENCRYPTION_KEY", ""))
+                for key, value in encrypted_llm_api_keys.items()
+            }
+        else:
+            self.llm_api_keys = {}
+
+        # Temporarily set environment variables for API keys
+        self._original_env = os.environ.copy()
+        for key, value in self.llm_api_keys.items():
+            os.environ[key] = value

         buckets = {
             param: pyrate_limiter.InMemoryBucket(
@@ -95,3 +108,6 @@ def __init__(
         self.rate_limiter = pyrate_limiter.Limiter(bucket_factory, max_delay=math.inf)

         self.api = APIWrapper(self)
+
+    def reset_env(self):
+        os.environ = self._original_env
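Taken together, the new key handling decrypts any `llm_api_keys` in the config and exports them as environment variables for the duration of the run, with `reset_env()` restoring the saved environment afterwards. A minimal sketch of the flow, with a placeholder ciphertext:

```python
import os
from docetl.utils import decrypt

# Sketch of the new key-handling flow. If DOCETL_ENCRYPTION_KEY is unset,
# decrypt() returns the value unchanged, so plaintext keys still work.
config = {"llm_api_keys": {"OPENAI_API_KEY": "<ciphertext-from-frontend>"}}
secret = os.environ.get("DOCETL_ENCRYPTION_KEY", "")

llm_api_keys = {k: decrypt(v, secret) for k, v in config["llm_api_keys"].items()}

original_env = os.environ.copy()   # saved so the environment can be restored
for key, value in llm_api_keys.items():
    os.environ[key] = value        # exported so litellm picks the keys up
```

One caveat: `os.environ = self._original_env` rebinds the `os.environ` mapping rather than mutating the live environment, so keys already exported may still be visible to child processes.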
39 changes: 35 additions & 4 deletions docetl/operations/__init__.py
@@ -1,19 +1,50 @@
 import importlib.metadata
+from docetl.operations.cluster import ClusterOperation
+from docetl.operations.code_operations import CodeFilterOperation, CodeMapOperation, CodeReduceOperation
+from docetl.operations.equijoin import EquijoinOperation
+from docetl.operations.filter import FilterOperation
+from docetl.operations.gather import GatherOperation
+from docetl.operations.map import MapOperation
+from docetl.operations.reduce import ReduceOperation
+from docetl.operations.resolve import ResolveOperation
+from docetl.operations.split import SplitOperation
+from docetl.operations.sample import SampleOperation
+from docetl.operations.unnest import UnnestOperation


+mapping = {
+    "cluster": ClusterOperation,
+    "code_filter": CodeFilterOperation,
+    "code_map": CodeMapOperation,
+    "code_reduce": CodeReduceOperation,
+    "equijoin": EquijoinOperation,
+    "filter": FilterOperation,
+    "gather": GatherOperation,
+    "map": MapOperation,
+    "reduce": ReduceOperation,
+    "resolve": ResolveOperation,
+    "split": SplitOperation,
+    "sample": SampleOperation,
+    "unnest": UnnestOperation,
+}
+
 def get_operation(operation_type: str):
     """Loads a single operation by name"""
     try:
         entrypoint = importlib.metadata.entry_points(group="docetl.operation")[
             operation_type
         ]
-    except KeyError as e:
+        return entrypoint.load()
+    except KeyError:
+        if operation_type in mapping:
+            return mapping[operation_type]
         raise KeyError(f"Unrecognized operation {operation_type}")
-    return entrypoint.load()

 def get_operations():
     """Load all available operations and return them as a dictionary"""
-    return {
+    operations = mapping.copy()
+    operations.update({
         op.name: op.load()
         for op in importlib.metadata.entry_points(group="docetl.operation")
-    }
+    })
+    return operations
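The effect of the fallback is easiest to see in use. A short sketch (assumed usage, not part of the commit):

```python
from docetl.operations import get_operation, get_operations

# Built-in operations now resolve even when no "docetl.operation"
# entry points are installed, via the fallback mapping.
map_op = get_operation("map")      # -> MapOperation

# get_operations() starts from the built-in mapping and overlays any
# installed entry points, so a plugin can shadow a built-in name.
ops = get_operations()
assert "code_reduce" in ops

try:
    get_operation("does_not_exist")
except KeyError as err:
    print(err)                     # "Unrecognized operation does_not_exist"
```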
2 changes: 1 addition & 1 deletion docetl/operations/utils/api.py
@@ -446,7 +446,7 @@ def _call_llm_with_cache(
         dataset_description = self.runner.config.get("system_prompt", {}).get("dataset_description", "a collection of unstructured documents")
         parethetical_op_instructions = "many inputs:one output" if op_type == "reduce" else "one input:one output"

-        system_prompt = f"You are a {persona}, intelligently transforming data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as accurately, precisely, and exhaustively as possible. The result should be a structured output that you will send back to the user."
+        system_prompt = f"You are a {persona}, helping the user make sense of their data. The dataset description is: {dataset_description}. You will be performing a {op_type} operation ({parethetical_op_instructions}). You will perform the specified task on the provided data, as precisely and exhaustively (i.e., high recall) as possible. The result should be a structured output that you will send back to the user, with the `send_output` function. Do not influence your answers too much based on the `send_output` function parameter names; just use them to send the result back to the user."
         if scratchpad:
             system_prompt += f"""
6 changes: 4 additions & 2 deletions docetl/operations/utils/llm.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, Optional
 import tiktoken
 from jinja2 import Template
-from litellm import completion, RateLimitError
+from litellm import model_cost
 from pydantic import BaseModel
 from rich import print as rprint

@@ -69,7 +69,9 @@ def truncate_messages(
     from_agent: bool = False
 ) -> List[Dict[str, str]]:
     """Truncate messages to fit within model's context length."""
-    model_input_context_length = 8192  # Default
+    model_input_context_length = model_cost.get(model.split("/")[-1], {}).get(
+        "max_input_tokens", 8192
+    )
     total_tokens = sum(count_tokens(json.dumps(msg), model) for msg in messages)

     if total_tokens <= model_input_context_length - 100:
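litellm's `model_cost` table is keyed by bare model names, which is why the provider prefix is stripped before the lookup. A small sketch of the resulting behavior, with illustrative model names:

```python
from litellm import model_cost

def max_input_tokens(model: str) -> int:
    # Strip any provider prefix ("openai/gpt-4o-mini" -> "gpt-4o-mini");
    # unknown models fall back to the previously hardcoded 8192 default.
    return model_cost.get(model.split("/")[-1], {}).get("max_input_tokens", 8192)

print(max_input_tokens("openai/gpt-4o-mini"))  # looked up in model_cost
print(max_input_tokens("some-unknown-model"))  # -> 8192 fallback
```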
4 changes: 3 additions & 1 deletion docetl/optimizers/map_optimizer/config_generators.py
@@ -84,7 +84,7 @@ def _get_split_config(
     Determine the split key and subprompt for processing chunks of the input data.
     The split key should be a key in the input data that contains a string to be split.
-    The subprompt should be designed to process individual chunks of the split data.
+    The subprompt should be designed to process individual chunks of the split data, and only process the main chunk in within chunk delimiters if they are present.
     Note that the subprompt's output schema might be different from the original operation's output schema, since you may want to extract more information or make the information less structured/more free text. The original output schema will be preserved when combining the chunks' processed results.
     Important:
@@ -148,6 +148,8 @@ def _get_split_config(

         result["subprompt_output_schema"].update(op_config["output"]["schema"])

+        result["subprompt"] = result["subprompt"] + " Only process the main chunk in --- Begin Main Chunk --- and --- End Main Chunk --- delimiters if they are present."
+
         self.console.log(
             f"[yellow]Breaking down operation {op_config['name']}[/yellow]"
         )
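For reference, the delimiters named by the appended instruction fence off the main chunk from peripheral context. A hypothetical rendered chunk might look like:

```python
# Hypothetical chunk rendering (illustrative only): the subprompt is told
# to process only the text between the delimiters and treat the rest as
# surrounding context.
chunk = """...peripheral context from earlier in the document...
--- Begin Main Chunk ---
The text this subprompt invocation should actually process.
--- End Main Chunk ---
...peripheral context from later in the document..."""
```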
2 changes: 1 addition & 1 deletion docetl/optimizers/map_optimizer/plan_generators.py
@@ -218,7 +218,7 @@ def determine_metadata_with_retry():
         map_op = self.operation_creator.create_map_operation(
             op_config,
             subprompt_output_schema,
-            split_result["subprompt"] + " Only process the main chunk.",
+            split_result["subprompt"],
         )

         # unnest_ops = self.operation_creator.create_unnest_operations(op_config)
30 changes: 30 additions & 0 deletions docetl/utils.py
@@ -7,6 +7,36 @@
 from jinja2 import Environment, meta
 from litellm import completion_cost as lcc

+from lzstring import LZString
+
+class Decryptor:
+    def __init__(self, secret_key: str):
+        self.key = secret_key
+        self.lz = LZString()
+
+    def decrypt(self, encrypted_data: str) -> str:
+        try:
+            # First decompress the data
+            compressed = self.lz.decompressFromBase64(encrypted_data)
+            if not compressed:
+                raise ValueError("Invalid compressed data")
+
+            # Then decode using the key
+            result = ''
+            for i in range(len(compressed)):
+                char_code = ord(compressed[i]) - ord(self.key[i % len(self.key)])
+                result += chr(char_code)
+
+            return result
+
+        except Exception as e:
+            print(f"Decryption failed: {str(e)}")
+            return None
+
+def decrypt(encrypted_data: str, secret_key: str) -> str:
+    if not secret_key:
+        return encrypted_data
+    return Decryptor(secret_key).decrypt(encrypted_data)
+
 class StageType(Enum):
     SAMPLE_RUN = "sample_run"
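The commit ships only the decrypt side; the matching encrypt presumably lives in the frontend. Under the same scheme (per-character keyed shift, then LZString base64 compression), a hypothetical inverse could look like the following sketch. This is an assumption for illustration, not code from the commit:

```python
from lzstring import LZString

def encrypt(plaintext: str, secret_key: str) -> str:
    # Hypothetical inverse of Decryptor.decrypt: shift each character
    # code up by the repeating key, then LZString-compress to base64.
    if not secret_key:
        return plaintext
    shifted = "".join(
        chr(ord(ch) + ord(secret_key[i % len(secret_key)]))
        for i, ch in enumerate(plaintext)
    )
    return LZString().compressToBase64(shifted)

# Round trip against the new docetl.utils.decrypt:
# decrypt(encrypt("sk-proj-abc123", "mykey"), "mykey") == "sk-proj-abc123"
```

Note that this is keyed obfuscation rather than real cryptography; it keeps API keys from traveling in cleartext but should not be treated as strong encryption.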
31 changes: 26 additions & 5 deletions docs/playground/index.md
@@ -12,6 +12,14 @@ Building complex LLM pipelines for your data often requires experimentation and
 - ✨ Refine operations based on sample outputs
 - 🔄 Build complex pipelines step-by-step

+## Public Playground
+
+You can access our hosted playground at [docetl.org/playground](https://docetl.org/playground). You'll need to provide your own LLM API keys to use the service. The chatbot and prompt engineering assistants are powered by OpenAI models, so you'll need to provide an OpenAI API key.
+
+!!! note "Data Storage Notice"
+
+    As this is a research project, we cache results and store data on our servers to improve the system. While we will never sell or release your data, if you have privacy concerns, we recommend running the playground locally using the installation instructions below.
+
 ## Installation

 There are two ways to run the playground:
@@ -24,7 +32,9 @@ The easiest way to get started is using Docker:

 Create `.env` in the root directory (for the FastAPI backend):
 ```bash
-OPENAI_API_KEY=your_api_key_here # Or your LLM provider's API key
+# Required: API key for your preferred LLM provider (OpenAI, Anthropic, etc)
+# The key format will depend on your chosen provider (sk-..., anthro-...)
+OPENAI_API_KEY=your_api_key_here
 BACKEND_ALLOW_ORIGINS=
 BACKEND_HOST=localhost
 BACKEND_PORT=8000
@@ -35,9 +45,11 @@ FRONTEND_PORT=3000

 Create `.env.local` in the `website` directory (for the frontend) **note that this must be in the `website` directory**:
 ```bash
-OPENAI_API_KEY=sk-xxx # For the AI assistant in the interface
-OPENAI_API_BASE=https://api.openai.com/v1 # For the AI assistant in the interface
-MODEL_NAME=gpt-4o-mini # For the AI assistant in the interface
+# Optional: These are only needed if you want to use the AI assistant chatbot
+# and prompt engineering tools. Must be OpenAI API keys specifically.
+OPENAI_API_KEY=sk-xxx
+OPENAI_API_BASE=https://api.openai.com/v1
+MODEL_NAME=gpt-4o-mini

 NEXT_PUBLIC_BACKEND_HOST=localhost
 NEXT_PUBLIC_BACKEND_PORT=8000
@@ -72,7 +84,7 @@ cd docetl

 2. Set up environment variables in `.env` in the root directory:
 ```bash
-OPENAI_API_KEY=your_api_key_here
+LLM_API_KEY=your_api_key_here
 BACKEND_ALLOW_ORIGINS=
 BACKEND_HOST=localhost
 BACKEND_PORT=8000
@@ -113,6 +125,15 @@ The UI offers an optional chat-based assistant that can help you iteratively dev

 To use the assistant, you need to set your OpenAI API key in the `.env.local` file in the website directory. You can get an API key [here](https://platform.openai.com/api-keys). The API key should be in the following format: `sk-proj-...`. We only support the openai models for the assistant.

+!!! tip "Self-hosting with UI API key management"
+
+    If you want to host your own version of DocETL for your organization while allowing users to set their API keys through the UI, you'll need to set up encryption. Add the following to both `.env` and `website/.env.local`:
+    ```bash
+    DOCETL_ENCRYPTION_KEY=your_secret_key_here
+    ```
+    This shared encryption key allows API keys to be securely encrypted when sent to your server. Make sure to use the same value in both files.
+
+
 ## Complex Tutorial

 See this [YouTube video](https://www.youtube.com/watch?v=IlgueVqtHGo) for a more in depth tutorial on how to use the playground.