Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: adding better error messages for gemini #271

Merged
merged 6 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docetl/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1569,4 +1569,4 @@ def save_optimized_config(self, optimized_config_path: str):

if __name__ == "__main__":
optimizer = Optimizer("workloads/medical/map.yaml", model="gpt-4o-mini")
optimizer.optimize()
optimizer.optimize()
2 changes: 1 addition & 1 deletion docetl/operations/equijoin.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,4 +585,4 @@ def stratified_length_sample(
)
sampled_pairs.extend(pairs[:group_sample_size])

return sampled_pairs
return sampled_pairs
5 changes: 5 additions & 0 deletions docetl/operations/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ def render_chunk_with_context(
Returns:
str: Rendered chunk with context and headers.
"""

# If there are no peripheral chunks, return the main chunk
if not peripheral_config:
return chunks[current_index][content_key]

combined_parts = ["--- Previous Context ---"]

combined_parts.extend(
Expand Down
64 changes: 40 additions & 24 deletions docetl/operations/utils/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

from rich import print as rprint

BASIC_MODELS = ["gpt-4o-mini", "gpt-4o"]

class APIWrapper(object):
def __init__(self, runner):
self.runner = runner
Expand Down Expand Up @@ -485,31 +487,45 @@ def _call_llm_with_cache(

self.runner.rate_limiter.try_acquire("llm_call", weight=1)
if tools is not None:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
tools=tools,
tool_choice=tool_choice,
**litellm_completion_kwargs,
)
try:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
tools=tools,
tool_choice=tool_choice,
**litellm_completion_kwargs,
)
except Exception as e:
# Check that there's a prefix for the model name if it's not a basic model
if model not in BASIC_MODELS:
if not "/" in model:
raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}")
raise e
else:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
**litellm_completion_kwargs,
)
try:
response = completion(
model=model,
messages=[
{
"role": "system",
"content": system_prompt,
},
]
+ messages,
**litellm_completion_kwargs,
)
except Exception as e:
# Check that there's a prefix for the model name if it's not a basic model
if model not in BASIC_MODELS:
if not "/" in model:
raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}")
raise e


return response
Expand Down
4 changes: 1 addition & 3 deletions docetl/optimizers/map_optimizer/operation_creators.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,7 @@ def create_split_map_gather_operations(
if "next" in context_info:
gather_config["peripheral_chunks"]["next"] = context_info["next"]

# Add gather to the pipeline if there are peripheral chunks
if gather_config["peripheral_chunks"]:
pipeline.append(gather_config)
pipeline.append(gather_config)

return pipeline

Expand Down
12 changes: 5 additions & 7 deletions tests/test_synth_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,18 @@ def test_synth_gather(config_yaml):
config_path, long_documents_path, output_path = config_yaml

# Initialize the optimizer
optimizer = Optimizer.from_yaml(config_path)
runner = DSLRunner.from_yaml(config_path)

# Run the optimization
optimizer.optimize()
optimized_pipeline, cost = runner.optimize(return_pipeline=True)

# Check if a gather operation was synthesized
synthesized_gather_found = False
for step in optimizer.optimized_config["pipeline"]["steps"]:
for step in optimized_pipeline.config["pipeline"]["steps"]:
for op in step["operations"]:
synthesized_op = [
operation
for operation in optimizer.optimized_config["operations"]
for operation in optimized_pipeline.config["operations"]
if operation["name"] == op
][0]
if synthesized_op.get("type") == "gather":
Expand All @@ -139,8 +139,7 @@ def test_synth_gather(config_yaml):
), "No synthesized gather operation found in the optimized config"

# Run the optimized pipeline
runner = DSLRunner(optimizer.optimized_config_path)
runner.run()
optimized_pipeline.load_run_save()

# Check if the output file was created
assert os.path.exists(output_path), "Output file was not created"
Expand All @@ -161,7 +160,6 @@ def test_synth_gather(config_yaml):

# Clean up temporary files
os.remove(config_path)
os.remove(optimizer.optimized_config_path)
os.remove(long_documents_path)
os.remove(output_path)

Expand Down
3 changes: 3 additions & 0 deletions website/public/llms.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# DocETL System Description and LLM Instructions (Short)

Note: use docetl.org/llms-full.txt for the full system description and LLM instructions.

DocETL is a system for creating and executing LLM-powered data processing pipelines, designed for complex document processing tasks. It provides a low-code, declarative YAML interface to define complex data operations on unstructured datasets.

DocETL is built and maintained by the EPIC lab at UC Berkeley. Learn more at https://www.docetl.org.
Expand All @@ -8,6 +10,7 @@ We have an integrated development environment for building and testing pipelines

## Docs

- [LLM Instructions (Full)](https://www.docetl.org/llms-full.txt)
- [Website](https://www.docetl.org)
- [DocWrangler Playground](https://www.docetl.org/playground)
- [Main Documentation](https://ucbepic.github.io/docetl)
Expand Down
Loading