
Commit

add metadata_loader to evaluate
sangmandu committed Nov 13, 2024
1 parent 8e4e80d commit 6880294
Showing 3 changed files with 75 additions and 12 deletions.
@@ -23,6 +23,7 @@
)
from bfcl.eval_checker.multi_turn_eval.multi_turn_utils import is_empty_execute_response
from bfcl.model_handler.handler_map import HANDLER_MAP
+from bfcl.model_handler.handler_loader import HandlerLoader
from bfcl.utils import *
from dotenv import load_dotenv
from tqdm import tqdm
@@ -631,9 +632,12 @@ def main(model, test_category, api_sanity_check):


def get_handler(model_name):
-   return HANDLER_MAP[model_name](
-       model_name, temperature=0
-   ) # Temperature doesn't matter for evaluation
+   """Create a handler instance"""
+   handler_class = HandlerLoader.get_handler_class(model_name)
+   if handler_class is None:
+       raise ValueError(f"No handler found for model: {model_name}")
+
+   return handler_class(model_name, temperature=0)


if __name__ == "__main__":
@@ -14,6 +14,7 @@
    write_list_of_dicts_to_file,
)
from tqdm import tqdm
+from bfcl.eval_checker.metadata_loader import metadata_loader


def api_status_sanity_check_rest():
@@ -244,6 +245,9 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
def generate_leaderboard_csv(
    leaderboard_table, output_path, eval_models=None, eval_categories=None
):
+   # Load metadata at the beginning of the function
+   model_metadata, _, _ = metadata_loader.load_metadata()
+
    print("📈 Aggregating data to generate leaderboard score table...")
    data_non_live = []
    data_live = []
@@ -324,7 +328,7 @@ def generate_leaderboard_csv(
        data_non_live.append(
            [
                "N/A",
-               MODEL_METADATA_MAPPING[model_name_escaped][0],
+               model_metadata[model_name_escaped][0],
                overall_accuracy_non_live["accuracy"],
                summary_ast_non_live["accuracy"],
                summary_exec_non_live["accuracy"],
@@ -385,7 +389,7 @@
        data_live.append(
            [
                "N/A",
-               MODEL_METADATA_MAPPING[model_name_escaped][0],
+               model_metadata[model_name_escaped][0],
                overall_accuracy_live["accuracy"],
                summary_ast_live["accuracy"],
                python_simple_ast_live["accuracy"],
@@ -424,7 +428,7 @@
        data_multi_turn.append(
            [
                "N/A",
-               MODEL_METADATA_MAPPING[model_name_escaped][0],
+               model_metadata[model_name_escaped][0],
                overall_accuracy_multi_turn["accuracy"],
                multi_turn_base["accuracy"],
                multi_turn_miss_func["accuracy"],
@@ -451,8 +455,8 @@
            [
                "N/A",
                total_overall_accuracy["accuracy"],
-               MODEL_METADATA_MAPPING[model_name_escaped][0],
-               MODEL_METADATA_MAPPING[model_name_escaped][1],
+               model_metadata[model_name_escaped][0],
+               model_metadata[model_name_escaped][1],
                cost,
                latency_mean,
                latency_std,
@@ -481,8 +485,8 @@
                # multi_turn_composite["accuracy"],
                total_relevance["accuracy"],
                total_irrelevance["accuracy"],
-               MODEL_METADATA_MAPPING[model_name_escaped][2],
-               MODEL_METADATA_MAPPING[model_name_escaped][3],
+               model_metadata[model_name_escaped][2],
+               model_metadata[model_name_escaped][3],
            ]
        )

@@ -554,6 +558,7 @@ def generate_leaderboard_csv(

def check_model_category_status(score_path):
    result_path = score_path.replace("score", "result")
+   model_metadata, _, _ = metadata_loader.load_metadata()

    leaderboard_categories = [
        "exec_simple",
@@ -578,8 +583,8 @@ def check_model_category_status(score_path):

    category_status = {}

-   # Check for all models in MODEL_METADATA_MAPPING
-   for model_name in MODEL_METADATA_MAPPING.keys():
+   # Check for all models in metadata
+   for model_name in model_metadata.keys():
        category_status[model_name] = {
            category: {"generated": False, "evaluated": False}
            for category in leaderboard_categories
@@ -0,0 +1,54 @@
import json
import os
from typing import Dict, Any

from bfcl.model_handler.handler_map import local_inference_handler_map
from bfcl.eval_checker.model_metadata import (
    MODEL_METADATA_MAPPING,
    OUTPUT_PRICE_PER_MILLION_TOKEN,
    NO_COST_MODELS,
)

class MetadataLoader:
    @staticmethod
    def load_metadata() -> tuple[Dict[str, Any], Dict[str, float], list[str]]:
        """
        Load model metadata, pricing information, and list of no-cost models.
        Returns:
            tuple containing:
                - metadata: Dict mapping model names to their metadata
                - prices: Dict mapping model names to their prices
                - no_cost_models: List of model names that have no associated cost
        """
        metadata = dict(MODEL_METADATA_MAPPING)
        prices = dict(OUTPUT_PRICE_PER_MILLION_TOKEN)
        no_cost = list(NO_COST_MODELS)

        # Check for additional metadata config file path in environment variables
        metadata_config_path = os.getenv("BFCL_MODEL_METADATA")

        if metadata_config_path and os.path.exists(metadata_config_path):
            try:
                with open(metadata_config_path) as f:
                    custom_config = json.load(f)

                # Add custom model metadata
                if "metadata" in custom_config:
                    metadata.update(custom_config["metadata"])

                # Add custom pricing information
                if "prices" in custom_config:
                    prices.update(custom_config["prices"])

                # Add additional no-cost models
                if "no_cost_models" in custom_config:
                    no_cost.extend(custom_config["no_cost_models"])

            except Exception as e:
                print(f"Error loading custom metadata config: {str(e)}")

        return metadata, prices, no_cost

# Global metadata loader instance
metadata_loader = MetadataLoader()
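
As a usage sketch of the new override hook (not part of this commit): the snippet below builds a JSON config with the three keys that load_metadata() looks for and points BFCL_MODEL_METADATA at it. The model name, metadata fields, and price are illustrative; the four-element metadata list only mirrors the [0] through [3] indices read in the leaderboard code above (display name, link, organization, license).

import json
import os
import tempfile

# Hypothetical entry: "my-org/my-model" and its values are illustrative only.
custom_config = {
    "metadata": {
        "my-org/my-model": ["My Model (FC)", "https://example.com/my-model", "My Org", "apache-2.0"]
    },
    "prices": {"my-org/my-model": 0.0},     # price per million tokens, per the constant's naming
    "no_cost_models": ["my-org/my-model"],  # models treated as having no associated cost
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    json.dump(custom_config, tmp)

# The path is read from the environment each time load_metadata() is called.
os.environ["BFCL_MODEL_METADATA"] = tmp.name

from bfcl.eval_checker.metadata_loader import metadata_loader

metadata, prices, no_cost_models = metadata_loader.load_metadata()
assert "my-org/my-model" in metadata and "my-org/my-model" in no_cost_models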
