Skip to content

Commit

Permalink
fix: handle azure gpt-4o-mini and output to csv
Browse files: browse the repository at this point in the history
  • Loading branch information
shreyashankar committed Oct 10, 2024
1 parent ffe0b18 commit c6f9491
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 15 deletions.
6 changes: 3 additions & 3 deletions docetl/operations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ def call_llm_with_cache(
len(props) == 1
and list(props.values())[0].get("type") == "string"
and scratchpad is None
and "ollama" in model
and ("ollama" in model or "azure/gpt-4o-mini" in model)
):
use_tools = False

Expand All @@ -635,7 +635,7 @@ def call_llm_with_cache(
"type": "function",
"function": {
"name": "send_output",
"description": "Send structured output back to the user",
"description": "Send output back to the user",
"strict": True,
"parameters": parameters,
"additionalProperties": False,
Expand Down Expand Up @@ -858,7 +858,7 @@ def call_llm_with_gleaning(
"type": "function",
"function": {
"name": "send_output",
"description": "Send structured output back to the user",
"description": "Send output back to the user",
"strict": True,
"parameters": parameters,
"additionalProperties": False,
Expand Down
26 changes: 21 additions & 5 deletions docetl/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,16 @@ def __init__(self, config: Dict, max_threads: int = None):
# Check if output path is correctly formatted as JSON
output_path = self.config.get("pipeline", {}).get("output", {}).get("path")
if output_path:
if not output_path.lower().endswith(".json"):
if not (
output_path.lower().endswith(".json")
or output_path.lower().endswith(".csv")
):
raise ValueError(
f"Output path '{output_path}' is not a JSON file. Please provide a path ending with '.json'."
f"Output path '{output_path}' is not a JSON or CSV file. Please provide a path ending with '.json' or '.csv'."
)
else:
raise ValueError(
"No output path specified in the configuration. Please provide an output path ending with '.json' in the configuration."
"No output path specified in the configuration. Please provide an output path ending with '.json' or '.csv' in the configuration."
)

self.syntax_check()
Expand All @@ -77,6 +80,11 @@ def __init__(self, config: Dict, max_threads: int = None):
all_ops_until_and_including_current = [
op_map[prev_op] for prev_op in step["operations"][:idx]
] + [op_map[op_name]]
# If there's no model in the op, add the default model
for op in all_ops_until_and_including_current:
if "model" not in op:
op["model"] = self.default_model

all_ops_str = json.dumps(all_ops_until_and_including_current)
self.step_op_hashes[step["name"]][op_name] = hashlib.sha256(
all_ops_str.encode()
Expand Down Expand Up @@ -207,8 +215,16 @@ def save_output(self, data: List[Dict]):
self.console.rule("[cyan]Saving Output[/cyan]")
output_config = self.config["pipeline"]["output"]
if output_config["type"] == "file":
with open(output_config["path"], "w") as file:
json.dump(data, file, indent=2)
if output_config["path"].lower().endswith(".json"):
with open(output_config["path"], "w") as file:
json.dump(data, file, indent=2)
else: # CSV
import csv

with open(output_config["path"], "w", newline="") as file:
writer = csv.DictWriter(file, fieldnames=data[0].keys())
writer.writeheader()
writer.writerows(data)
self.console.print(
f"[green italic]💾 Output saved to {output_config['path']}[/green italic]"
)
Expand Down
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c6f9491

Please sign in to comment.