Skip to content

Commit

Permalink
chore: adding better error messages for gemini (#271)
Browse files Browse the repository at this point in the history
* fix: equijoin is out of date

* fix: equijoin is out of date and there are runtime errors

* fix: make split map gather pipeline work

* fix: better error message for gemini

* fix: better error message for gemini

* fix: better error message for gemini
  • Loading branch information
shreyashankar authored Jan 8, 2025
1 parent 8f1036b commit eb96aa5
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 36 deletions.
2 changes: 1 addition & 1 deletion docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1569,4 +1569,4 @@ def save_optimized_config(self, optimized_config_path: str):

if __name__ == "__main__":
optimizer = Optimizer("workloads/medical/map.yaml", model="gpt-4o-mini")
optimizer.optimize()
optimizer.optimize()
2 changes: 1 addition & 1 deletion docetl/operations/equijoin.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,4 +585,4 @@ def stratified_length_sample(
)
sampled_pairs.extend(pairs[:group_sample_size])

return sampled_pairs
return sampled_pairs
5 changes: 5 additions & 0 deletions docetl/operations/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ def render_chunk_with_context(
Returns:
str: Rendered chunk with context and headers.
"""

# If there are no peripheral chunks, return the main chunk
if not peripheral_config:
return chunks[current_index][content_key]

combined_parts = ["--- Previous Context ---"]

combined_parts.extend(
Expand Down
64 changes: 40 additions & 24 deletions docetl/operations/utils/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from rich import print as rprint

BASIC_MODELS = ["gpt-4o-mini", "gpt-4o"]

class APIWrapper(object):
def __init__(self, runner):
self.runner = runner
Expand Down Expand Up @@ -485,31 +487,45 @@ def _call_llm_with_cache(

self.runner.rate_limiter.try_acquire("llm_call", weight=1)
if tools is not None:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
tools=tools,
tool_choice=tool_choice,
**litellm_completion_kwargs,
)
try:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
tools=tools,
tool_choice=tool_choice,
**litellm_completion_kwargs,
)
except Exception as e:
# Check that there's a prefix for the model name if it's not a basic model
if model not in BASIC_MODELS:
if not "/" in model:
raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}")
raise e
else:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
**litellm_completion_kwargs,
)
try:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
**litellm_completion_kwargs,
)
except Exception as e:
# Check that there's a prefix for the model name if it's not a basic model
if model not in BASIC_MODELS:
if not "/" in model:
raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}")
raise e


return response
Expand Down
4 changes: 1 addition & 3 deletions docetl/optimizers/map_optimizer/operation_creators.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,7 @@ def create_split_map_gather_operations(
if "next" in context_info:
gather_config["peripheral_chunks"]["next"] = context_info["next"]

# Add gather to the pipeline if there are peripheral chunks
if gather_config["peripheral_chunks"]:
pipeline.append(gather_config)
pipeline.append(gather_config)

return pipeline

Expand Down
12 changes: 5 additions & 7 deletions tests/test_synth_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,18 @@ def test_synth_gather(config_yaml):
config_path, long_documents_path, output_path = config_yaml

# Initialize the optimizer
optimizer = Optimizer.from_yaml(config_path)
runner = DSLRunner.from_yaml(config_path)

# Run the optimization
optimizer.optimize()
optimized_pipeline, cost = runner.optimize(return_pipeline=True)

# Check if a gather operation was synthesized
synthesized_gather_found = False
for step in optimizer.optimized_config["pipeline"]["steps"]:
for step in optimized_pipeline.config["pipeline"]["steps"]:
for op in step["operations"]:
synthesized_op = [
operation
for operation in optimizer.optimized_config["operations"]
for operation in optimized_pipeline.config["operations"]
if operation["name"] == op
][0]
if synthesized_op.get("type") == "gather":
Expand All @@ -139,8 +139,7 @@ def test_synth_gather(config_yaml):
), "No synthesized gather operation found in the optimized config"

# Run the optimized pipeline
runner = DSLRunner(optimizer.optimized_config_path)
runner.run()
optimized_pipeline.load_run_save()

# Check if the output file was created
assert os.path.exists(output_path), "Output file was not created"
Expand All @@ -161,7 +160,6 @@ def test_synth_gather(config_yaml):

# Clean up temporary files
os.remove(config_path)
os.remove(optimizer.optimized_config_path)
os.remove(long_documents_path)
os.remove(output_path)

Expand Down
3 changes: 3 additions & 0 deletions website/public/llms.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# DocETL System Description and LLM Instructions (Short)

Note: See https://www.docetl.org/llms-full.txt for the full system description and LLM instructions.

DocETL is a system for creating and executing LLM-powered data processing pipelines, designed for complex document processing tasks. It provides a low-code, declarative YAML interface to define complex data operations on unstructured datasets.

DocETL is built and maintained by the EPIC lab at UC Berkeley. Learn more at https://www.docetl.org.
Expand All @@ -8,6 +10,7 @@ We have an integrated development environment for building and testing pipelines

## Docs

- [LLM Instructions (Full)](https://www.docetl.org/llms-full.txt)
- [Website](https://www.docetl.org)
- [DocWrangler Playground](https://www.docetl.org/playground)
- [Main Documentation](https://ucbepic.github.io/docetl)
Expand Down

0 comments on commit eb96aa5

Please sign in to comment.