From eb96aa5c98c656f539e3a2efb4047595782624da Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Wed, 8 Jan 2025 15:51:20 -0800 Subject: [PATCH] chore: adding better error messages for gemini (#271) * fix: equijoin is out of date * fix: equijoin is out of date and there are runtime errors * fix: make split map gather pipeline work * fix: better error message for gemini * fix: better error message for gemini * fix: better error message for gemini --- docetl/builder.py | 2 +- docetl/operations/equijoin.py | 2 +- docetl/operations/gather.py | 5 ++ docetl/operations/utils/api.py | 64 ++++++++++++------- .../map_optimizer/operation_creators.py | 4 +- tests/test_synth_gather.py | 12 ++-- website/public/llms.txt | 3 + 7 files changed, 56 insertions(+), 36 deletions(-) diff --git a/docetl/builder.py b/docetl/builder.py index d0307c7b..d99ffcb4 100644 --- a/docetl/builder.py +++ b/docetl/builder.py @@ -1569,4 +1569,4 @@ def save_optimized_config(self, optimized_config_path: str): if __name__ == "__main__": optimizer = Optimizer("workloads/medical/map.yaml", model="gpt-4o-mini") - optimizer.optimize() + optimizer.optimize() \ No newline at end of file diff --git a/docetl/operations/equijoin.py b/docetl/operations/equijoin.py index 44f128ed..274d30df 100644 --- a/docetl/operations/equijoin.py +++ b/docetl/operations/equijoin.py @@ -585,4 +585,4 @@ def stratified_length_sample( ) sampled_pairs.extend(pairs[:group_sample_size]) - return sampled_pairs + return sampled_pairs \ No newline at end of file diff --git a/docetl/operations/gather.py b/docetl/operations/gather.py index 5d6cfec1..ba6e0242 100644 --- a/docetl/operations/gather.py +++ b/docetl/operations/gather.py @@ -150,6 +150,11 @@ def render_chunk_with_context( Returns: str: Renderted chunk with context and headers. """ + + # If there are no peripheral chunks, return the main chunk + if not peripheral_config: + return chunks[current_index][content_key] + combined_parts = ["--- Previous Context ---"] combined_parts.extend( diff --git a/docetl/operations/utils/api.py b/docetl/operations/utils/api.py index a58c02a5..5943990e 100644 --- a/docetl/operations/utils/api.py +++ b/docetl/operations/utils/api.py @@ -14,6 +14,8 @@ from rich import print as rprint +BASIC_MODELS = ["gpt-4o-mini", "gpt-4o"] + class APIWrapper(object): def __init__(self, runner): self.runner = runner @@ -485,31 +487,45 @@ def _call_llm_with_cache( self.runner.rate_limiter.try_acquire("llm_call", weight=1) if tools is not None: - response = completion( - model=model, - messages=[ - { - "role": "system", - "content": system_prompt, - }, - ] - + messages, - tools=tools, - tool_choice=tool_choice, - **litellm_completion_kwargs, - ) + try: + response = completion( + model=model, + messages=[ + { + "role": "system", + "content": system_prompt, + }, + ] + + messages, + tools=tools, + tool_choice=tool_choice, + **litellm_completion_kwargs, + ) + except Exception as e: + # Check that there's a prefix for the model name if it's not a basic model + if model not in BASIC_MODELS: + if not "/" in model: + raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}") + raise e else: - response = completion( - model=model, - messages=[ - { - "role": "system", - "content": system_prompt, - }, - ] - + messages, - **litellm_completion_kwargs, - ) + try: + response = completion( + model=model, + messages=[ + { + "role": "system", + "content": system_prompt, + }, + ] + + messages, + **litellm_completion_kwargs, + ) + except Exception as e: + # Check that there's a prefix for the model name if it's not a basic model + if model not in BASIC_MODELS: + if not "/" in model: + raise ValueError(f"Note: You may also need to prefix your model name with the provider, e.g. 'openai/gpt-4o-mini' or 'gemini/gemini-1.5-flash' to conform to LiteLLM API standards. Original error: {e}") + raise e return response diff --git a/docetl/optimizers/map_optimizer/operation_creators.py b/docetl/optimizers/map_optimizer/operation_creators.py index e6e66326..7dcc9b9f 100644 --- a/docetl/optimizers/map_optimizer/operation_creators.py +++ b/docetl/optimizers/map_optimizer/operation_creators.py @@ -135,9 +135,7 @@ def create_split_map_gather_operations( if "next" in context_info: gather_config["peripheral_chunks"]["next"] = context_info["next"] - # Add gather to the pipeline if there are peripheral chunks - if gather_config["peripheral_chunks"]: - pipeline.append(gather_config) + pipeline.append(gather_config) return pipeline diff --git a/tests/test_synth_gather.py b/tests/test_synth_gather.py index 394dc32a..c832c1c3 100644 --- a/tests/test_synth_gather.py +++ b/tests/test_synth_gather.py @@ -105,18 +105,18 @@ def test_synth_gather(config_yaml): config_path, long_documents_path, output_path = config_yaml # Initialize the optimizer - optimizer = Optimizer.from_yaml(config_path) + runner = DSLRunner.from_yaml(config_path) # Run the optimization - optimizer.optimize() + optimized_pipeline, cost = runner.optimize(return_pipeline=True) # Check if a gather operation was synthesized synthesized_gather_found = False - for step in optimizer.optimized_config["pipeline"]["steps"]: + for step in optimized_pipeline.config["pipeline"]["steps"]: for op in step["operations"]: synthesized_op = [ operation - for operation in optimizer.optimized_config["operations"] + for operation in optimized_pipeline.config["operations"] if operation["name"] == op ][0] if synthesized_op.get("type") == "gather": @@ -139,8 +139,7 @@ def test_synth_gather(config_yaml): ), "No synthesized gather operation found in the optimized config" # Run the optimized pipeline - runner = DSLRunner(optimizer.optimized_config_path) - runner.run() + optimized_pipeline.load_run_save() # Check if the output file was created assert os.path.exists(output_path), "Output file was not created" @@ -161,7 +160,6 @@ def test_synth_gather(config_yaml): # Clean up temporary files os.remove(config_path) - os.remove(optimizer.optimized_config_path) os.remove(long_documents_path) os.remove(output_path) diff --git a/website/public/llms.txt b/website/public/llms.txt index 6c076f07..4889f51c 100644 --- a/website/public/llms.txt +++ b/website/public/llms.txt @@ -1,5 +1,7 @@ # DocETL System Description and LLM Instructions (Short) +Note: use docetl.org/llms-full.txt for the full system description and LLM instructions. + DocETL is a system for creating and executing LLM-powered data processing pipelines, designed for complex document processing tasks. It provides a low-code, declarative YAML interface to define complex data operations on unstructured datasets. DocETL is built and maintained by the EPIC lab at UC Berkeley. Learn more at https://www.docetl.org. @@ -8,6 +10,7 @@ We have an integrated development environment for building and testing pipelines ## Docs +- [LLM Instructions (Full)](https://www.docetl.org/llms-full.txt) - [Website](https://www.docetl.org) - [DocWrangler Playground](https://www.docetl.org/playground) - [Main Documentation](https://ucbepic.github.io/docetl)