From 6a4c43a633da8ceb9a1b7dfc00ac1272b66e70e6 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Thu, 19 Dec 2024 12:20:37 -0800
Subject: [PATCH 1/5] update last-updated date
---
leaderboard.html | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/leaderboard.html b/leaderboard.html
index b78e26fb7..8abb2a093 100644
--- a/leaderboard.html
+++ b/leaderboard.html
@@ -113,7 +113,7 @@ BFCL Leaderboard
From 63a6f89d38ee29dcce5031e5d29be6f2735712cd Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Sun, 29 Dec 2024 21:48:00 +0800
Subject: [PATCH 2/5] add checkpoint commit info
---
leaderboard.html | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/leaderboard.html b/leaderboard.html
index 8abb2a093..0dddc57f7 100644
--- a/leaderboard.html
+++ b/leaderboard.html
@@ -113,7 +113,7 @@ BFCL Leaderboard
@@ -155,10 +155,10 @@ BFCL Leaderboard
href="https://discord.gg/grXXvj9Whz">discord.
- Models are evaluated using commit d7e52e5.
+ Models are evaluated using commit 0cea216.
All the model response we obtained is available here.
To reproduce the results, please checkout our codebase at
- this checkpoint.
+ this checkpoint.
From 91a2f05b498f006f60d72563819de5c1e1b52d3d Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Mon, 30 Dec 2024 00:23:48 +0800
Subject: [PATCH 3/5] add data.csv
---
data_live.csv | 169 +++++++++++++++++++++++---------------------
data_multi_turn.csv | 169 +++++++++++++++++++++++---------------------
data_non_live.csv | 169 +++++++++++++++++++++++---------------------
data_overall.csv | 169 +++++++++++++++++++++++---------------------
4 files changed, 356 insertions(+), 320 deletions(-)
diff --git a/data_live.csv b/data_live.csv
index 311b78195..ae2467d19 100644
--- a/data_live.csv
+++ b/data_live.csv
@@ -1,81 +1,90 @@
Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection
-1,GPT-4o-2024-08-06 (Prompt),80.84%,74.02%,78.68%,72.46%,100.00%,75.00%,91.84%,52.94%
-2,GPT-4-turbo-2024-04-09 (FC),79.56%,78.09%,81.01%,77.59%,81.25%,66.67%,81.97%,70.59%
-3,GPT-4o-2024-08-06 (FC),79.29%,76.02%,76.36%,76.07%,81.25%,66.67%,84.47%,70.59%
-4,Claude-3-Opus-20240229 (FC),79.24%,76.31%,79.84%,77.40%,18.75%,29.17%,83.79%,76.47%
-5,ToolACE-8B (FC),78.37%,75.72%,72.48%,76.54%,81.25%,70.83%,82.43%,77.78%
-6,Gemini-1.5-Flash-002 (FC),77.96%,70.91%,71.71%,70.47%,81.25%,75.00%,89.12%,58.82%
-7,Claude-3.5-Sonnet-20241022 (FC),77.96%,79.50%,82.17%,81.10%,31.25%,12.50%,75.74%,70.59%
-8,o1-mini-2024-09-12 (Prompt),77.73%,71.87%,72.87%,71.70%,75.00%,66.67%,87.07%,58.82%
-9,Mistral-Medium-2312 (Prompt),77.20%,73.50%,74.03%,73.69%,81.25%,54.17%,83.11%,64.71%
-10,GPT-4o-mini-2024-07-18 (Prompt),77.20%,77.42%,79.84%,76.73%,93.75%,70.83%,76.76%,82.35%
-11,palmyra-x-004 (FC),77.16%,74.69%,75.19%,75.21%,50.00%,62.50%,81.07%,70.59%
-12,Gemini-1.5-Pro-002 (Prompt),76.76%,78.61%,81.01%,77.97%,93.75%,70.83%,73.92%,76.47%
-13,Gemini-1.5-Pro-001 (Prompt),76.49%,72.98%,75.58%,71.98%,93.75%,75.00%,82.31%,52.94%
-14,Functionary-Medium-v3.1 (FC),76.45%,82.16%,81.78%,82.62%,68.75%,75.00%,67.80%,72.22%
-15,Gemini-1.5-Pro-002 (FC),76.44%,76.31%,79.07%,75.50%,87.50%,75.00%,76.64%,76.47%
-16,Gemini-1.5-Flash-001 (FC),75.51%,72.69%,72.09%,73.31%,62.50%,58.33%,80.16%,58.82%
-17,Gemini-1.5-Pro-001 (FC),75.47%,70.69%,73.26%,70.18%,81.25%,58.33%,83.11%,58.82%
-18,o1-preview-2024-09-12 (Prompt),75.29%,77.57%,82.17%,76.35%,81.25%,79.17%,71.54%,88.24%
-19,Gemini-1.5-Flash-002 (Prompt),75.20%,74.76%,77.13%,74.26%,93.75%,58.33%,75.62%,88.24%
-20,Qwen2.5-72B-Instruct (Prompt),75.03%,81.79%,84.11%,81.67%,62.50%,75.00%,64.29%,94.44%
-21,GoGoAgent,74.84%,72.61%,74.81%,72.08%,81.25%,66.67%,77.89%,94.12%
-22,DeepSeek-Coder-V2 (FC),73.43%,77.50%,80.62%,77.30%,50.00%,70.83%,67.01%,83.33%
-23,xLAM-8x22b-r (FC),73.39%,80.46%,83.33%,80.15%,62.50%,75.00%,62.24%,88.89%
-24,GPT-4o-mini-2024-07-18 (FC),73.24%,75.20%,75.19%,75.12%,87.50%,70.83%,70.07%,82.35%
-25,Functionary-Small-v3.1 (FC),72.99%,77.35%,78.68%,77.49%,75.00%,58.33%,66.10%,83.33%
-26,Mistral-small-2402 (FC),72.49%,68.91%,64.34%,72.17%,12.50%,12.50%,77.78%,82.35%
-27,Claude-3.5-Sonnet-20241022 (Prompt),71.96%,80.90%,86.05%,80.44%,81.25%,45.83%,58.16%,76.47%
-28,Hammer2.0-7b (FC),71.75%,77.28%,75.97%,77.59%,81.25%,75.00%,62.81%,94.44%
-29,xLAM-8x7b-r (FC),71.08%,76.68%,72.48%,78.16%,62.50%,66.67%,62.02%,94.44%
-30,claude-3.5-haiku-20241022 (Prompt),70.24%,75.20%,81.01%,73.98%,87.50%,58.33%,62.47%,77.78%
-31,mistral-large-2407 (FC),69.73%,79.42%,85.66%,78.16%,68.75%,75.00%,54.76%,76.47%
-32,MiniCPM3-4B-FC (FC),69.66%,65.06%,72.87%,63.63%,37.50%,62.50%,76.53%,77.78%
-33,FireFunction-v1 (FC),69.56%,69.13%,68.99%,71.79%,0.00%,0.00%,69.73%,94.12%
-34,xLAM-7b-r (FC),69.35%,73.35%,71.32%,74.45%,50.00%,62.50%,62.70%,94.44%
-35,Open-Mixtral-8x22b (FC),68.71%,72.46%,75.19%,73.41%,6.25%,45.83%,62.70%,82.35%
-36,GPT-3.5-Turbo-0125 (Prompt),68.62%,77.94%,78.29%,78.25%,75.00%,62.50%,53.85%,94.12%
-37,Gemini-1.5-Flash-001 (Prompt),68.53%,75.87%,74.81%,75.78%,93.75%,79.17%,57.03%,82.35%
-38,Command-R-Plus (Prompt) (Original),68.27%,76.09%,75.58%,76.26%,81.25%,70.83%,56.01%,82.35%
-39,Gemini-1.0-Pro-002 (FC),68.00%,66.17%,73.26%,65.53%,37.50%,37.50%,70.63%,76.47%
-40,Gemma-2-9b-it (Prompt),67.61%,73.72%,73.26%,74.26%,56.25%,66.67%,58.05%,77.78%
-41,Qwen2.5-7B-Instruct (Prompt),66.95%,74.24%,74.81%,74.45%,62.50%,66.67%,55.44%,83.33%
-42,Claude-3-Opus-20240229 (Prompt),66.80%,79.27%,84.11%,78.73%,75.00%,54.17%,47.39%,82.35%
-43,GLM-4-9b-Chat (FC),66.50%,63.58%,71.32%,64.10%,0.00%,0.00%,70.98%,66.67%
-44,FireFunction-v2 (FC),66.44%,75.20%,76.74%,75.50%,56.25%,58.33%,52.61%,88.24%
-45,Gemma-2-27b-it (Prompt),66.15%,78.61%,83.33%,78.06%,68.75%,58.33%,46.60%,88.89%
-46,Open-Mixtral-8x22b (Prompt),65.82%,74.46%,80.62%,72.84%,81.25%,75.00%,52.27%,82.35%
-47,Open-Mistral-Nemo-2407 (FC),65.16%,69.73%,75.19%,68.28%,75.00%,70.83%,58.16%,64.71%
-48,Meta-Llama-3-70B-Instruct (Prompt),65.04%,78.83%,81.01%,78.54%,75.00%,70.83%,43.31%,94.44%
-49,Hammer2.0-1.5b (FC),64.86%,69.43%,74.03%,68.47%,56.25%,70.83%,57.48%,83.33%
-50,Hermes-2-Pro-Llama-3-8B (FC),64.59%,65.95%,69.77%,65.53%,56.25%,50.00%,62.93%,44.44%
-51,Claude-3-Haiku-20240307 (Prompt),64.22%,74.17%,77.13%,74.17%,56.25%,54.17%,48.87%,70.59%
-52,GPT-4-turbo-2024-04-09 (Prompt),63.56%,84.68%,86.05%,84.24%,100.00%,79.17%,30.50%,100.00%
-53,GPT-3.5-Turbo-0125 (FC),62.98%,77.50%,77.91%,78.35%,50.00%,54.17%,40.14%,94.12%
-54,Llama-3.1-70B-Instruct (Prompt),62.02%,76.17%,77.52%,75.97%,87.50%,62.50%,39.68%,94.44%
-55,Llama-3.1-8B-Instruct (Prompt),60.68%,71.95%,73.26%,72.36%,56.25%,50.00%,43.20%,72.22%
-56,DBRX-Instruct (Prompt),60.58%,73.65%,77.52%,73.31%,75.00%,45.83%,39.91%,94.12%
-57,Open-Mixtral-8x7b (Prompt),60.53%,64.03%,60.85%,65.05%,68.75%,50.00%,54.65%,88.24%
-58,Qwen2.5-1.5B-Instruct (Prompt),60.46%,60.25%,68.60%,58.50%,56.25%,50.00%,60.43%,77.78%
-59,Claude-3-Haiku-20240307 (FC),59.51%,75.80%,79.07%,77.87%,0.00%,0.00%,33.79%,100.00%
-60,Granite-20b-FunctionCalling (FC),59.22%,57.66%,67.44%,55.56%,43.75%,54.17%,61.00%,88.89%
-61,Command-R-Plus (FC) (Original),58.89%,62.69%,68.60%,61.82%,50.00%,45.83%,52.27%,100.00%
-62,Mistral-Small-2402 (Prompt),58.18%,56.70%,34.50%,64.20%,0.00%,4.17%,60.43%,58.82%
-63,Hermes-2-Pro-Mistral-7B (FC),57.49%,61.07%,67.44%,60.11%,50.00%,41.67%,51.81%,66.67%
-64,Llama-3.2-3B-Instruct (Prompt),55.53%,63.29%,63.18%,64.39%,18.75%,45.83%,43.08%,83.33%
-65,Nexusflow-Raven-v2 (FC),54.22%,39.45%,41.47%,38.75%,56.25%,37.50%,76.76%,58.82%
-66,MiniCPM3-4B (Prompt),54.20%,36.64%,45.35%,34.19%,43.75%,45.83%,81.07%,55.56%
-67,xLAM-7b-fc-r (FC),54.02%,60.47%,78.29%,57.36%,31.25%,25.00%,43.65%,77.78%
-68,Hammer2.0-0.5b (FC),53.22%,45.82%,51.94%,44.25%,56.25%,41.67%,64.17%,72.22%
-69,mistral-large-2407 (Prompt),52.62%,82.83%,86.05%,81.96%,93.75%,79.17%,5.44%,100.00%
-70,Qwen2-7B-Instruct (Prompt),50.56%,60.47%,56.20%,61.73%,37.50%,66.67%,34.69%,83.33%
-71,Gemini-1.0-Pro-002 (Prompt),48.80%,46.85%,48.06%,46.53%,62.50%,37.50%,51.13%,82.35%
-72,Open-Mistral-Nemo-2407 (Prompt),48.67%,74.17%,77.13%,73.31%,87.50%,70.83%,8.73%,94.12%
-73,Meta-Llama-3-8B-Instruct (Prompt),47.76%,60.47%,59.30%,61.73%,37.50%,33.33%,27.66%,77.78%
-74,Llama-3.1-70B-Instruct (FC),45.27%,51.96%,51.94%,52.90%,31.25%,25.00%,33.90%,100.00%
-75,Gemma-2-2b-it (Prompt),43.40%,19.47%,26.74%,18.42%,0.00%,0.00%,80.16%,38.89%
-76,DeepSeek-Coder-V2-Lite-Instruct (FC),39.63%,3.55%,1.94%,3.70%,6.25%,12.50%,95.58%,5.56%
-77,Qwen2-1.5B-Instruct (Prompt),38.34%,40.49%,47.67%,39.41%,18.75%,25.00%,34.13%,83.33%
-78,xLAM-1b-fc-r (FC),37.54%,54.33%,65.89%,53.56%,0.00%,0.00%,10.54%,100.00%
-79,Llama-3.1-8B-Instruct (FC),33.19%,48.56%,50.00%,48.62%,37.50%,37.50%,8.39%,94.44%
-80,Llama-3.2-1B-Instruct (Prompt),31.36%,11.92%,30.62%,7.50%,12.50%,4.17%,61.11%,33.33%
\ No newline at end of file
+1,GPT-4-turbo-2024-04-09 (FC),80.45%,79.42%,83.33%,78.63%,81.25%,70.83%,82.20%,72.22%
+2,o1-2024-12-17 (Prompt),80.45%,77.50%,81.78%,76.54%,81.25%,70.83%,85.15%,72.22%
+3,gpt-4o-2024-11-20 (Prompt),79.65%,80.46%,83.72%,79.77%,87.50%,70.83%,78.34%,83.33%
+4,gpt-4o-2024-11-20 (FC),79.61%,79.27%,81.01%,78.82%,87.50%,75.00%,80.05%,83.33%
+5,Claude-3.5-Sonnet-20241022 (FC),78.85%,80.46%,83.33%,81.96%,25.00%,20.83%,76.42%,77.78%
+6,ToolACE-8B (FC),78.50%,75.87%,72.48%,76.73%,81.25%,70.83%,82.43%,83.33%
+7,o1-mini-2024-09-12 (Prompt),78.05%,71.80%,71.71%,71.60%,75.00%,79.17%,87.98%,61.11%
+8,Gemini-1.5-Flash-002 (FC),77.97%,70.84%,72.09%,70.18%,81.25%,79.17%,89.34%,55.56%
+9,Claude-3-Opus-20240229 (FC),77.92%,74.98%,77.91%,75.78%,31.25%,37.50%,82.77%,61.11%
+10,o1-2024-12-17 (FC),77.92%,77.05%,81.01%,79.01%,0.00%,0.00%,79.37%,72.22%
+11,watt-tool-70B (FC),77.65%,83.42%,84.88%,83.48%,81.25%,66.67%,68.48%,94.44%
+12,Mistral-Medium-2312 (Prompt),77.52%,74.02%,75.19%,74.07%,81.25%,54.17%,83.11%,66.67%
+13,Gemini-1.5-Pro-001 (Prompt),76.63%,73.06%,75.97%,71.98%,93.75%,75.00%,82.54%,55.56%
+14,Functionary-Medium-v3.1 (FC),76.59%,82.53%,81.01%,83.29%,68.75%,75.00%,67.57%,72.22%
+15,Gemini-1.5-Flash-002 (Prompt),76.54%,76.98%,80.62%,76.16%,93.75%,62.50%,75.74%,83.33%
+16,Gemini-1.5-Pro-002 (Prompt),76.54%,78.39%,81.78%,77.40%,87.50%,79.17%,73.81%,72.22%
+17,watt-tool-8B (FC),76.37%,77.13%,75.97%,77.49%,87.50%,66.67%,75.06%,83.33%
+18,GPT-4o-mini-2024-07-18 (Prompt),76.32%,77.57%,80.23%,76.73%,93.75%,75.00%,74.26%,83.33%
+19,Gemini-1.5-Flash-001 (FC),76.28%,74.02%,75.19%,74.26%,62.50%,58.33%,80.27%,50.00%
+20,Gemini-1.5-Pro-001 (FC),76.23%,71.65%,75.58%,70.75%,81.25%,62.50%,83.79%,50.00%
+21,Gemini-1.5-Pro-002 (FC),76.19%,76.17%,79.46%,75.21%,87.50%,75.00%,76.30%,72.22%
+22,Qwen2.5-72B-Instruct (Prompt),75.21%,82.24%,84.50%,82.15%,62.50%,75.00%,63.95%,100.00%
+23,xLAM-7b-r (FC),75.08%,73.72%,71.32%,74.93%,50.00%,62.50%,86.72%,94.44%
+24,Hammer2.1-7b (FC),75.02%,77.05%,76.36%,77.40%,81.25%,66.67%,71.77%,82.35%
+25,GPT-4o-mini-2024-07-18 (FC),74.37%,76.61%,78.29%,76.16%,87.50%,70.83%,70.75%,83.33%
+26,Qwen2.5-32B-Instruct (Prompt),74.14%,78.68%,82.17%,78.54%,62.50%,58.33%,66.67%,100.00%
+27,Qwen2.5-14B-Instruct (Prompt),74.10%,75.13%,74.03%,75.78%,62.50%,66.67%,72.45%,77.78%
+28,Hammer2.1-3b (FC),73.91%,72.83%,72.48%,73.31%,62.50%,62.50%,75.40%,82.35%
+29,Functionary-Small-v3.1 (FC),73.66%,78.09%,79.07%,78.16%,81.25%,62.50%,66.78%,77.78%
+30,DeepSeek-Coder-V2 (FC),73.43%,77.13%,80.23%,77.02%,43.75%,70.83%,67.46%,88.89%
+31,xLAM-8x22b-r (FC),72.55%,79.57%,79.46%,79.68%,81.25%,75.00%,61.45%,88.89%
+32,claude-3.5-haiku-20241022 (FC),72.28%,76.98%,82.17%,78.35%,18.75%,0.00%,64.85%,83.33%
+33,Mistral-small-2402 (FC),72.10%,68.47%,64.73%,71.51%,12.50%,12.50%,77.55%,77.78%
+34,Claude-3.5-Sonnet-20241022 (Prompt),71.88%,80.61%,86.05%,80.06%,81.25%,45.83%,58.39%,77.78%
+35,xLAM-8x7b-r (FC),70.99%,77.50%,74.03%,79.30%,43.75%,58.33%,60.54%,94.44%
+36,claude-3.5-haiku-20241022 (Prompt),70.64%,76.46%,83.72%,75.02%,87.50%,54.17%,61.56%,77.78%
+37,Hammer2.1-1.5b (FC),70.59%,69.65%,70.93%,69.80%,50.00%,62.50%,71.88%,77.78%
+38,FireFunction-v1 (FC),70.41%,70.47%,71.32%,72.93%,0.00%,0.00%,69.84%,94.44%
+39,MiniCPM3-4B-FC (FC),69.97%,65.66%,74.42%,63.91%,43.75%,62.50%,76.53%,72.22%
+40,mistral-large-2407 (FC),69.84%,79.57%,84.88%,78.54%,62.50%,79.17%,54.88%,72.22%
+41,Gemini-1.0-Pro-002 (FC),69.57%,68.69%,77.13%,67.62%,43.75%,41.67%,70.98%,66.67%
+42,Command R7B (FC),69.21%,59.66%,63.18%,58.69%,56.25%,66.67%,84.13%,55.56%
+43,Gemini-1.5-Flash-001 (Prompt),68.86%,76.54%,76.74%,76.16%,93.75%,79.17%,56.80%,83.33%
+44,Open-Mixtral-8x22b (FC),68.55%,72.46%,76.36%,73.12%,6.25%,45.83%,62.24%,83.33%
+45,GPT-3.5-Turbo-0125 (Prompt),68.46%,78.46%,79.84%,78.63%,75.00%,58.33%,52.61%,94.44%
+46,DeepSeek-V3 (FC),68.33%,81.94%,82.95%,82.15%,81.25%,62.50%,47.05%,88.89%
+47,Gemma-2-9b-it (Prompt),67.84%,74.32%,76.36%,74.26%,62.50%,62.50%,57.60%,83.33%
+48,Qwen2.5-7B-Instruct (Prompt),67.35%,74.91%,75.97%,74.93%,62.50%,70.83%,55.33%,88.89%
+49,Gemma-2-27b-it (Prompt),67.04%,79.94%,84.50%,79.39%,68.75%,62.50%,46.71%,94.44%
+50,Claude-3-Opus-20240229 (Prompt),66.86%,79.50%,84.11%,79.11%,68.75%,54.17%,47.17%,83.33%
+51,GLM-4-9b-Chat (FC),66.77%,63.95%,72.09%,64.39%,0.00%,0.00%,71.09%,66.67%
+52,Open-Mixtral-8x22b (Prompt),65.93%,74.61%,82.17%,72.65%,81.25%,75.00%,52.27%,83.33%
+53,Open-Mistral-Nemo-2407 (FC),65.93%,71.06%,77.13%,69.61%,75.00%,66.67%,58.05%,66.67%
+54,FireFunction-v2 (FC),65.57%,77.94%,78.29%,78.35%,56.25%,70.83%,46.03%,94.44%
+55,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.19%,72.27%,62.50%,66.67%,53.06%,70.59%
+56,Hermes-2-Pro-Llama-3-8B (FC),64.90%,66.54%,71.71%,65.81%,56.25%,50.00%,62.81%,44.44%
+57,Meta-Llama-3-70B-Instruct (Prompt),64.90%,78.46%,80.62%,78.25%,75.00%,66.67%,43.42%,100.00%
+58,GPT-3.5-Turbo-0125 (FC),63.93%,79.05%,80.62%,79.68%,43.75%,58.33%,40.14%,94.44%
+59,GPT-4-turbo-2024-04-09 (Prompt),63.71%,84.75%,87.21%,84.14%,100.00%,75.00%,30.73%,100.00%
+60,Hammer2.1-0.5b (FC),62.86%,58.03%,59.69%,58.02%,50.00%,45.83%,69.95%,77.78%
+61,Llama-3.3-70B-Instruct (Prompt),62.59%,77.72%,80.62%,77.11%,93.75%,62.50%,38.66%,100.00%
+62,Llama-3.1-70B-Instruct (Prompt),62.06%,76.24%,77.13%,76.16%,87.50%,62.50%,39.57%,100.00%
+63,Open-Mixtral-8x7b (Prompt),61.39%,65.28%,63.18%,66.10%,68.75%,50.00%,54.88%,88.89%
+64,Qwen2.5-1.5B-Instruct (Prompt),61.04%,60.99%,70.16%,59.26%,56.25%,41.67%,60.66%,83.33%
+65,Llama-3.1-8B-Instruct (Prompt),60.95%,72.69%,73.26%,73.31%,56.25%,50.00%,42.63%,77.78%
+66,DBRX-Instruct (Prompt),60.15%,73.28%,77.13%,73.03%,75.00%,41.67%,39.34%,94.44%
+67,Granite-20b-FunctionCalling (FC),59.57%,58.33%,67.83%,56.32%,43.75%,54.17%,60.88%,88.89%
+68,Command-R-Plus (FC),58.91%,60.70%,69.77%,58.78%,62.50%,45.83%,55.90%,72.22%
+69,Mistral-Small-2402 (Prompt),58.73%,57.88%,36.05%,65.24%,0.00%,8.33%,60.32%,44.44%
+70,Qwen2.5-3B-Instruct (Prompt),58.60%,66.77%,68.99%,66.48%,56.25%,62.50%,45.46%,88.89%
+71,Hermes-2-Pro-Mistral-7B (FC),57.62%,61.21%,68.99%,60.02%,43.75%,41.67%,51.93%,66.67%
+72,Llama-3.2-3B-Instruct (Prompt),55.75%,63.66%,63.57%,64.86%,12.50%,45.83%,42.97%,88.89%
+73,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
+74,Nexusflow-Raven-v2 (FC),54.15%,39.38%,41.47%,38.65%,56.25%,37.50%,76.64%,61.11%
+75,xLAM-7b-fc-r (FC),53.35%,60.99%,78.29%,58.02%,31.25%,25.00%,41.16%,77.78%
+76,mistral-large-2407 (Prompt),52.69%,82.68%,85.27%,81.96%,93.75%,79.17%,5.78%,100.00%
+77,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89%
+78,Gemini-1.0-Pro-002 (Prompt),49.09%,47.52%,50.39%,47.01%,62.50%,29.17%,50.91%,77.78%
+79,Open-Mistral-Nemo-2407 (Prompt),48.96%,74.98%,77.13%,74.45%,87.50%,66.67%,8.28%,88.89%
+80,Meta-Llama-3-8B-Instruct (Prompt),47.93%,60.55%,60.85%,61.44%,37.50%,33.33%,28.00%,77.78%
+81,Llama-3.1-70B-Instruct (FC),44.96%,51.74%,51.94%,52.61%,31.25%,25.00%,33.45%,100.00%
+82,Gemma-2-2b-it (Prompt),43.76%,19.47%,26.36%,18.52%,0.00%,0.00%,81.07%,38.89%
+83,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00%
+84,GoGoAgent,39.18%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%
+85,Qwen2-1.5B-Instruct (Prompt),39.00%,41.23%,48.45%,40.27%,12.50%,25.00%,34.47%,94.44%
+86,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00%
+87,Llama-3.1-8B-Instruct (FC),33.45%,49.22%,51.55%,49.00%,37.50%,41.67%,8.05%,94.44%
+88,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44%
+89,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89%
\ No newline at end of file
diff --git a/data_multi_turn.csv b/data_multi_turn.csv
index f06bebf3b..5c7aad59b 100644
--- a/data_multi_turn.csv
+++ b/data_multi_turn.csv
@@ -1,81 +1,90 @@
Rank,Model,Multi Turn Overall Acc,Base,Miss Func,Miss Param,Long Context
-1,Claude-3.5-Sonnet-20241022 (FC),41.00%,55.00%,19.00%,42.50%,47.50%
-2,GPT-4o-2024-08-06 (FC),39.12%,58.00%,10.00%,37.00%,51.50%
-3,GPT-4-turbo-2024-04-09 (FC),38.12%,54.00%,13.50%,35.50%,49.50%
-4,GPT-4o-2024-08-06 (Prompt),37.25%,44.00%,31.50%,29.50%,44.00%
-5,o1-preview-2024-09-12 (Prompt),36.88%,47.50%,38.50%,31.50%,30.00%
-6,GPT-4o-mini-2024-07-18 (FC),34.12%,47.50%,19.50%,29.00%,40.50%
-7,Claude-3-Opus-20240229 (FC),30.25%,41.50%,14.00%,33.50%,32.00%
-8,GPT-4-turbo-2024-04-09 (Prompt),30.25%,42.50%,25.00%,20.50%,33.00%
-9,o1-mini-2024-09-12 (Prompt),28.25%,40.50%,5.00%,34.50%,33.00%
-10,Claude-3-Haiku-20240307 (FC),24.50%,35.50%,11.50%,22.00%,29.00%
-11,mistral-large-2407 (FC),23.75%,33.50%,18.00%,23.50%,20.00%
-12,GPT-4o-mini-2024-07-18 (Prompt),22.00%,33.00%,12.00%,17.00%,26.00%
-13,Gemini-1.5-Pro-002 (FC),21.62%,31.00%,5.00%,21.00%,29.50%
-14,Functionary-Medium-v3.1 (FC),21.38%,31.50%,21.00%,26.50%,6.50%
-15,Gemini-1.5-Pro-002 (Prompt),20.75%,23.00%,19.50%,17.50%,23.00%
-16,GPT-3.5-Turbo-0125 (FC),19.50%,32.50%,11.50%,21.50%,12.50%
-17,Gemini-1.5-Flash-001 (Prompt),19.50%,27.50%,20.00%,12.00%,18.50%
-18,Gemini-1.5-Pro-001 (Prompt),18.88%,26.00%,5.00%,21.50%,23.00%
-19,Qwen2.5-72B-Instruct (Prompt),17.25%,23.50%,20.00%,13.50%,12.00%
-20,xLAM-8x22b-r (FC),16.25%,25.50%,16.00%,11.50%,12.00%
-21,Gemini-1.5-Pro-001 (FC),16.00%,24.50%,3.00%,15.50%,21.00%
-22,xLAM-8x7b-r (FC),15.50%,26.00%,13.00%,11.50%,11.50%
-23,Gemini-1.5-Flash-001 (FC),13.87%,19.00%,3.50%,14.00%,19.00%
-24,Gemini-1.5-Flash-002 (Prompt),12.50%,17.50%,6.00%,11.50%,15.00%
-25,Llama-3.1-70B-Instruct (Prompt),12.38%,16.50%,13.00%,10.50%,9.50%
-26,Gemini-1.5-Flash-002 (FC),11.62%,19.00%,0.50%,10.50%,16.50%
-27,palmyra-x-004 (FC),11.37%,12.00%,2.50%,18.50%,12.50%
-28,xLAM-7b-r (FC),10.00%,16.50%,8.50%,7.50%,7.50%
-29,Functionary-Small-v3.1 (FC),9.88%,17.00%,2.50%,14.00%,6.00%
-30,claude-3.5-haiku-20241022 (Prompt),9.75%,16.00%,0.50%,8.00%,14.50%
-31,Llama-3.1-8B-Instruct (Prompt),9.25%,12.00%,10.00%,7.00%,8.00%
-32,Open-Mistral-Nemo-2407 (FC),9.12%,15.00%,3.50%,9.00%,9.00%
-33,FireFunction-v2 (FC),8.62%,13.50%,7.00%,11.00%,3.00%
-34,mistral-large-2407 (Prompt),8.38%,15.00%,6.00%,6.00%,6.50%
-35,ToolACE-8B (FC),7.75%,7.50%,11.50%,5.00%,7.00%
-36,Qwen2.5-7B-Instruct (Prompt),7.62%,9.50%,8.50%,7.00%,5.50%
-37,Claude-3.5-Sonnet-20241022 (Prompt),7.50%,9.00%,5.50%,5.00%,10.50%
-38,Claude-3-Opus-20240229 (Prompt),7.13%,11.50%,2.50%,6.00%,8.50%
-39,Meta-Llama-3-70B-Instruct (Prompt),5.62%,10.00%,4.00%,6.00%,2.50%
-40,GPT-3.5-Turbo-0125 (Prompt),5.62%,9.00%,2.00%,7.00%,4.50%
-41,Hammer2.0-7b (FC),5.50%,9.00%,2.00%,7.00%,4.00%
-42,Llama-3.1-8B-Instruct (FC),5.38%,5.00%,7.50%,5.00%,4.00%
-43,Llama-3.2-3B-Instruct (Prompt),5.25%,8.50%,2.50%,4.50%,5.50%
-44,Llama-3.1-70B-Instruct (FC),4.88%,7.00%,4.00%,4.50%,4.00%
-45,DeepSeek-Coder-V2 (FC),4.50%,7.50%,3.00%,4.00%,3.50%
-46,GLM-4-9b-Chat (FC),3.50%,3.50%,4.00%,2.50%,4.00%
-47,Granite-20b-FunctionCalling (FC),3.38%,6.00%,1.50%,4.50%,1.50%
-48,Qwen2-7B-Instruct (Prompt),3.25%,4.00%,4.50%,2.50%,2.00%
-49,Gemini-1.0-Pro-002 (FC),2.88%,4.50%,1.00%,3.50%,2.50%
-50,Hermes-2-Pro-Mistral-7B (FC),2.63%,3.50%,4.00%,2.50%,0.50%
-51,Mistral-small-2402 (FC),2.62%,4.50%,0.00%,3.00%,3.00%
-52,MiniCPM3-4B-FC (FC),2.62%,5.00%,1.00%,3.00%,1.50%
-53,FireFunction-v1 (FC),2.38%,5.00%,0.00%,2.00%,2.50%
-54,Hermes-2-Pro-Llama-3-8B (FC),2.38%,4.50%,1.50%,2.00%,1.50%
-55,Gemma-2-27b-it (Prompt),2.38%,4.50%,2.00%,1.50%,1.50%
-56,MiniCPM3-4B (Prompt),2.00%,3.00%,3.50%,1.00%,0.50%
-57,Command-R-Plus (FC) (Original),2.00%,3.50%,0.00%,1.50%,3.00%
-58,Hammer2.0-1.5b (FC),1.75%,2.00%,1.00%,1.50%,2.50%
-59,Claude-3-Haiku-20240307 (Prompt),1.62%,3.50%,0.00%,0.00%,3.00%
-60,Gemma-2-9b-it (Prompt),1.62%,2.00%,4.00%,0.50%,0.00%
-61,Open-Mixtral-8x22b (FC),1.50%,3.50%,0.00%,1.00%,1.50%
-62,Open-Mixtral-8x7b (Prompt),1.50%,2.50%,0.00%,1.50%,2.00%
-63,Gemini-1.0-Pro-002 (Prompt),1.38%,2.50%,1.50%,0.50%,1.00%
-64,Qwen2.5-1.5B-Instruct (Prompt),1.12%,1.50%,2.50%,0.50%,0.00%
-65,Nexusflow-Raven-v2 (FC),1.00%,1.50%,0.50%,1.00%,1.00%
-66,GoGoAgent,1.00%,1.50%,2.00%,0.50%,0.00%
-67,Meta-Llama-3-8B-Instruct (Prompt),0.75%,1.50%,0.00%,1.00%,0.50%
-68,Mistral-Small-2402 (Prompt),0.75%,0.50%,0.00%,1.50%,1.00%
-69,Hammer2.0-0.5b (FC),0.50%,0.50%,0.00%,0.50%,1.00%
-70,Open-Mixtral-8x22b (Prompt),0.50%,1.00%,0.00%,0.00%,1.00%
-71,Qwen2-1.5B-Instruct (Prompt),0.50%,0.50%,1.00%,0.00%,0.50%
-72,Mistral-Medium-2312 (Prompt),0.38%,1.00%,0.00%,0.00%,0.50%
-73,Command-R-Plus (Prompt) (Original),0.38%,1.00%,0.00%,0.00%,0.50%
-74,Open-Mistral-Nemo-2407 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50%
-75,DeepSeek-Coder-V2-Lite-Instruct (FC),0.12%,0.50%,0.00%,0.00%,0.00%
-76,xLAM-1b-fc-r (FC),0.12%,0.50%,0.00%,0.00%,0.00%
-77,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
-78,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
-79,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
-80,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00%
\ No newline at end of file
+1,watt-tool-70B (FC),58.62%,67.00%,57.50%,48.50%,61.50%
+2,gpt-4o-2024-11-20 (Prompt),47.62%,59.00%,41.00%,35.50%,55.00%
+3,Claude-3.5-Sonnet-20241022 (FC),41.00%,55.00%,19.00%,42.50%,47.50%
+4,gpt-4o-2024-11-20 (FC),41.00%,62.50%,6.00%,37.50%,58.00%
+5,o1-2024-12-17 (FC),41.00%,52.50%,38.00%,30.50%,43.00%
+6,claude-3.5-haiku-20241022 (FC),40.00%,54.50%,26.50%,35.00%,44.00%
+7,watt-tool-8B (FC),39.13%,47.00%,41.50%,27.50%,40.50%
+8,GPT-4-turbo-2024-04-09 (FC),38.12%,54.00%,13.50%,35.50%,49.50%
+9,o1-2024-12-17 (Prompt),36.00%,50.50%,0.50%,48.50%,44.50%
+10,GPT-4o-mini-2024-07-18 (FC),34.12%,47.50%,19.50%,29.00%,40.50%
+11,Claude-3-Opus-20240229 (FC),30.25%,41.50%,14.00%,33.50%,32.00%
+12,GPT-4-turbo-2024-04-09 (Prompt),30.25%,42.50%,25.00%,20.50%,33.00%
+13,o1-mini-2024-09-12 (Prompt),28.25%,40.50%,5.00%,34.50%,33.00%
+14,mistral-large-2407 (FC),23.75%,33.50%,18.00%,23.50%,20.00%
+15,Hammer2.1-7b (FC),23.50%,35.50%,25.50%,19.00%,14.00%
+16,GPT-4o-mini-2024-07-18 (Prompt),22.00%,33.00%,12.00%,17.00%,26.00%
+17,Gemini-1.5-Pro-002 (FC),21.62%,31.00%,5.00%,21.00%,29.50%
+18,Functionary-Medium-v3.1 (FC),21.38%,31.50%,21.00%,26.50%,6.50%
+19,Gemini-1.5-Pro-002 (Prompt),20.75%,23.00%,19.50%,17.50%,23.00%
+20,GPT-3.5-Turbo-0125 (FC),19.50%,32.50%,11.50%,21.50%,12.50%
+21,Gemini-1.5-Flash-001 (Prompt),19.50%,27.50%,20.00%,12.00%,18.50%
+22,Gemini-1.5-Pro-001 (Prompt),18.88%,26.00%,5.00%,21.50%,23.00%
+23,DeepSeek-V3 (FC),18.62%,21.00%,20.50%,19.00%,14.00%
+24,Qwen2.5-72B-Instruct (Prompt),18.00%,24.50%,20.00%,15.50%,12.00%
+25,Qwen2.5-32B-Instruct (Prompt),17.75%,25.00%,20.00%,15.00%,11.00%
+26,Hammer2.1-3b (FC),17.38%,27.50%,17.50%,14.50%,10.00%
+27,xLAM-8x22b-r (FC),16.25%,25.50%,16.00%,11.50%,12.00%
+28,Gemini-1.5-Pro-001 (FC),16.00%,24.50%,3.00%,15.50%,21.00%
+29,xLAM-8x7b-r (FC),15.50%,26.00%,13.00%,11.50%,11.50%
+30,Gemini-1.5-Flash-001 (FC),13.87%,19.00%,3.50%,14.00%,19.00%
+31,Command-R-Plus (FC),13.12%,16.50%,10.00%,9.00%,17.00%
+32,Gemini-1.5-Flash-002 (Prompt),12.50%,17.50%,6.00%,11.50%,15.00%
+33,Llama-3.1-70B-Instruct (Prompt),12.38%,16.50%,13.00%,10.50%,9.50%
+34,Qwen2.5-14B-Instruct (Prompt),12.12%,18.50%,11.50%,12.00%,6.50%
+35,Gemini-1.5-Flash-002 (FC),11.62%,19.00%,0.50%,10.50%,16.50%
+36,Ministral-8B-Instruct-2410 (FC),11.25%,21.00%,8.50%,10.00%,5.50%
+37,Hammer2.1-1.5b (FC),10.50%,14.50%,12.50%,9.00%,6.00%
+38,xLAM-7b-r (FC),10.00%,16.50%,8.50%,7.50%,7.50%
+39,Functionary-Small-v3.1 (FC),9.88%,17.00%,2.50%,14.00%,6.00%
+40,claude-3.5-haiku-20241022 (Prompt),9.75%,16.00%,0.50%,8.00%,14.50%
+41,Llama-3.1-8B-Instruct (Prompt),9.25%,12.00%,10.00%,7.00%,8.00%
+42,Open-Mistral-Nemo-2407 (FC),9.12%,15.00%,3.50%,9.00%,9.00%
+43,FireFunction-v2 (FC),8.62%,13.50%,7.00%,11.00%,3.00%
+44,mistral-large-2407 (Prompt),8.38%,15.00%,6.00%,6.00%,6.50%
+45,ToolACE-8B (FC),7.75%,7.50%,11.50%,5.00%,7.00%
+46,Qwen2.5-7B-Instruct (Prompt),7.62%,9.50%,8.50%,7.00%,5.50%
+47,Claude-3.5-Sonnet-20241022 (Prompt),7.50%,9.00%,5.50%,5.00%,10.50%
+48,Claude-3-Opus-20240229 (Prompt),7.13%,11.50%,2.50%,6.00%,8.50%
+49,Llama-3.3-70B-Instruct (Prompt),6.87%,9.00%,8.00%,4.50%,6.00%
+50,Meta-Llama-3-70B-Instruct (Prompt),5.62%,10.00%,4.00%,6.00%,2.50%
+51,GPT-3.5-Turbo-0125 (Prompt),5.62%,9.00%,2.00%,7.00%,4.50%
+52,Llama-3.1-8B-Instruct (FC),5.38%,5.00%,7.50%,5.00%,4.00%
+53,Llama-3.2-3B-Instruct (Prompt),5.25%,8.50%,2.50%,4.50%,5.50%
+54,Command R7B (FC),5.00%,6.50%,1.50%,6.50%,5.50%
+55,Llama-3.1-70B-Instruct (FC),4.88%,7.00%,4.00%,4.50%,4.00%
+56,DeepSeek-Coder-V2 (FC),4.50%,7.50%,3.00%,4.00%,3.50%
+57,GLM-4-9b-Chat (FC),3.50%,3.50%,4.00%,2.50%,4.00%
+58,Granite-20b-FunctionCalling (FC),3.38%,6.00%,1.50%,4.50%,1.50%
+59,Qwen2.5-3B-Instruct (Prompt),3.38%,5.50%,3.50%,2.00%,2.50%
+60,Qwen2-7B-Instruct (Prompt),3.25%,4.00%,4.50%,2.50%,2.00%
+61,Gemini-1.0-Pro-002 (FC),2.88%,4.50%,1.00%,3.50%,2.50%
+62,Hermes-2-Pro-Mistral-7B (FC),2.63%,3.50%,4.00%,2.50%,0.50%
+63,Mistral-small-2402 (FC),2.62%,4.50%,0.00%,3.00%,3.00%
+64,MiniCPM3-4B-FC (FC),2.62%,5.00%,1.00%,3.00%,1.50%
+65,FireFunction-v1 (FC),2.38%,5.00%,0.00%,2.00%,2.50%
+66,Hermes-2-Pro-Llama-3-8B (FC),2.38%,4.50%,1.50%,2.00%,1.50%
+67,Gemma-2-27b-it (Prompt),2.38%,4.50%,2.00%,1.50%,1.50%
+68,Hammer2.1-0.5b (FC),2.25%,4.00%,0.50%,3.00%,1.50%
+69,MiniCPM3-4B (Prompt),2.00%,3.00%,3.50%,1.00%,0.50%
+70,Gemma-2-9b-it (Prompt),1.62%,2.00%,4.00%,0.50%,0.00%
+71,Open-Mixtral-8x22b (FC),1.50%,3.50%,0.00%,1.00%,1.50%
+72,Open-Mixtral-8x7b (Prompt),1.50%,2.50%,0.00%,1.50%,2.00%
+73,Gemini-1.0-Pro-002 (Prompt),1.38%,2.50%,1.50%,0.50%,1.00%
+74,Qwen2.5-1.5B-Instruct (Prompt),1.12%,1.50%,2.50%,0.50%,0.00%
+75,Nexusflow-Raven-v2 (FC),1.00%,1.50%,0.50%,1.00%,1.00%
+76,GoGoAgent,1.00%,1.50%,2.00%,0.50%,0.00%
+77,Meta-Llama-3-8B-Instruct (Prompt),0.75%,1.50%,0.00%,1.00%,0.50%
+78,Mistral-Small-2402 (Prompt),0.75%,0.50%,0.00%,1.50%,1.00%
+79,Open-Mixtral-8x22b (Prompt),0.50%,1.00%,0.00%,0.00%,1.00%
+80,Qwen2-1.5B-Instruct (Prompt),0.50%,0.50%,1.00%,0.00%,0.50%
+81,Mistral-Medium-2312 (Prompt),0.38%,1.00%,0.00%,0.00%,0.50%
+82,Open-Mistral-Nemo-2407 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50%
+83,DeepSeek-Coder-V2-Lite-Instruct (FC),0.12%,0.50%,0.00%,0.00%,0.00%
+84,xLAM-1b-fc-r (FC),0.12%,0.50%,0.00%,0.00%,0.00%
+85,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
+86,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
+87,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
+88,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00%
+89,Qwen2.5-0.5B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00%
\ No newline at end of file
diff --git a/data_non_live.csv b/data_non_live.csv
index a05b71e1f..b0401370d 100644
--- a/data_non_live.csv
+++ b/data_non_live.csv
@@ -1,81 +1,90 @@
Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simple AST,Java Simple AST,JavaScript Simple AST,Multiple AST,Parallel AST,Parallel Multiple AST,Simple Exec,Python Simple Exec,REST Simple Exec,Multiple Exec,Parallel Exec,Parallel Multiple Exec,Irrelevance Detection
-1,Qwen2.5-72B-Instruct (Prompt),90.37%,90.85%,92.07%,80.92%,98.75%,62.00%,82.00%,97.50%,93.50%,91.50%,99.29%,100.00%,98.57%,94.00%,90.00%,85.00%,81.67%
-2,Functionary-Medium-v3.1 (FC),89.77%,89.52%,91.32%,76.08%,96.25%,64.00%,68.00%,96.50%,94.50%,91.00%,99.29%,100.00%,98.57%,94.00%,92.00%,80.00%,84.58%
-3,Gemini-1.5-Pro-002 (Prompt),88.83%,87.98%,91.27%,78.92%,94.75%,64.00%,78.00%,92.50%,91.50%,89.00%,98.57%,100.00%,97.14%,94.00%,90.00%,82.50%,82.50%
-4,ToolACE-8B (FC),88.82%,87.29%,89.21%,76.67%,91.00%,65.00%,74.00%,94.00%,90.00%,88.50%,97.36%,99.00%,95.71%,94.00%,88.00%,77.50%,93.33%
-5,GoGoAgent,88.42%,85.75%,89.86%,74.50%,92.50%,63.00%,68.00%,92.00%,89.50%,87.00%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33%
-6,DeepSeek-Coder-V2 (FC),88.41%,89.15%,91.23%,78.08%,96.25%,64.00%,74.00%,95.00%,93.50%,90.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17%
-7,Hammer2.0-7b (FC),88.31%,90.50%,88.62%,80.50%,97.50%,66.00%,78.00%,95.50%,94.00%,92.00%,89.50%,99.00%,80.00%,94.00%,86.00%,85.00%,78.33%
-8,Llama-3.1-70B-Instruct (Prompt),87.77%,89.85%,90.12%,77.92%,95.75%,62.00%,76.00%,96.50%,94.00%,91.00%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00%
-9,Gemma-2-27b-it (Prompt),87.16%,89.10%,89.09%,80.42%,94.25%,63.00%,84.00%,93.00%,91.00%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67%
-10,o1-preview-2024-09-12 (Prompt),86.62%,86.19%,88.70%,76.75%,92.25%,66.00%,72.00%,94.00%,90.00%,84.00%,99.29%,100.00%,98.57%,94.00%,84.00%,77.50%,80.00%
-11,Qwen2.5-7B-Instruct (Prompt),86.01%,86.48%,88.29%,75.92%,95.75%,60.00%,72.00%,95.00%,91.00%,84.00%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00%
-12,Gemini-1.5-Pro-001 (FC),85.52%,83.23%,87.95%,69.92%,92.75%,55.00%,62.00%,92.00%,90.50%,80.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00%
-13,Functionary-Small-v3.1 (FC),85.44%,86.38%,87.12%,74.00%,96.00%,62.00%,64.00%,94.00%,90.00%,87.50%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00%
-14,Gemini-1.5-Pro-001 (Prompt),85.16%,84.06%,85.77%,74.75%,93.25%,59.00%,72.00%,90.50%,91.00%,80.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08%
-15,Gemini-1.5-Pro-002 (FC),85.06%,87.40%,84.61%,74.08%,94.25%,58.00%,70.00%,94.00%,92.00%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50%
-16,Gemma-2-9b-it (Prompt),85.02%,84.92%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.00%,85.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42%
-17,Granite-20b-FunctionCalling (FC),84.97%,82.21%,86.59%,72.83%,90.50%,66.00%,62.00%,91.00%,84.00%,81.00%,86.36%,97.00%,75.71%,92.00%,88.00%,80.00%,89.58%
-18,GPT-4-turbo-2024-04-09 (FC),84.95%,84.56%,85.21%,69.75%,92.25%,59.00%,58.00%,91.00%,91.00%,86.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42%
-19,Meta-Llama-3-70B-Instruct (Prompt),84.70%,87.77%,88.21%,76.58%,94.75%,61.00%,74.00%,94.50%,92.50%,87.50%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33%
-20,GPT-4-turbo-2024-04-09 (Prompt),84.68%,90.98%,89.45%,81.92%,96.75%,67.00%,82.00%,95.50%,94.00%,92.50%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42%
-21,Open-Mixtral-8x22b (Prompt),84.51%,87.90%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,89.50%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92%
-22,xLAM-8x22b-r (FC),84.44%,83.58%,87.88%,77.33%,94.00%,64.00%,74.00%,93.50%,88.00%,75.50%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17%
-23,FireFunction-v2 (FC),84.28%,87.10%,87.54%,79.92%,95.75%,64.00%,80.00%,93.00%,90.50%,85.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00%
-24,GPT-4o-mini-2024-07-18 (Prompt),84.13%,86.69%,80.84%,79.25%,93.75%,66.00%,78.00%,90.50%,89.00%,88.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08%
-25,Hammer2.0-1.5b (FC),83.74%,83.85%,87.70%,74.42%,94.25%,65.00%,64.00%,90.50%,87.50%,83.00%,92.79%,97.00%,88.57%,92.00%,86.00%,80.00%,67.50%
-26,GPT-4o-mini-2024-07-18 (FC),83.49%,84.58%,83.57%,74.33%,91.00%,64.00%,68.00%,90.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75%
-27,GPT-4o-2024-08-06 (FC),83.41%,86.38%,78.91%,75.00%,91.00%,64.00%,70.00%,92.50%,92.50%,85.50%,60.14%,96.00%,24.29%,92.00%,86.00%,77.50%,89.58%
-28,Gemini-1.5-Flash-001 (Prompt),82.76%,85.44%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.00%,90.00%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75%
-29,o1-mini-2024-09-12 (Prompt),82.69%,80.54%,82.70%,70.67%,88.00%,62.00%,62.00%,89.50%,82.00%,80.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25%
-30,MiniCPM3-4B-FC (FC),82.49%,81.06%,87.57%,69.75%,90.25%,59.00%,60.00%,92.00%,83.00%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92%
-31,claude-3.5-haiku-20241022 (Prompt),82.31%,82.98%,84.71%,76.92%,92.75%,64.00%,74.00%,93.50%,84.00%,77.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00%
-32,Llama-3.1-8B-Instruct (Prompt),81.81%,84.02%,86.30%,72.58%,93.75%,58.00%,66.00%,93.50%,87.00%,83.00%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00%
-33,mistral-large-2407 (FC),81.81%,86.98%,84.38%,74.42%,96.25%,61.00%,66.00%,93.00%,90.50%,90.00%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83%
-34,GPT-4o-2024-08-06 (Prompt),80.78%,80.88%,77.66%,65.00%,88.00%,51.00%,56.00%,85.50%,92.00%,81.00%,61.14%,98.00%,24.29%,88.00%,84.00%,77.50%,92.92%
-35,mistral-large-2407 (Prompt),80.65%,90.60%,90.12%,82.92%,96.75%,66.00%,86.00%,97.00%,92.00%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92%
-36,Gemini-1.5-Flash-002 (Prompt),80.29%,79.69%,80.64%,74.25%,94.75%,60.00%,68.00%,91.50%,86.00%,67.00%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25%
-37,Claude-3-Opus-20240229 (Prompt),79.86%,85.02%,86.32%,79.08%,96.25%,65.00%,76.00%,95.00%,85.50%,80.50%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33%
-38,Command-R-Plus (Prompt) (Original),79.64%,78.79%,84.68%,71.67%,90.00%,61.00%,64.00%,88.50%,82.00%,73.00%,93.21%,95.00%,91.43%,92.00%,76.00%,77.50%,62.92%
-39,Llama-3.2-3B-Instruct (Prompt),79.46%,79.98%,83.70%,74.42%,92.25%,57.00%,74.00%,92.00%,79.50%,74.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42%
-40,xLAM-7b-r (FC),78.92%,80.81%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,80.50%,73.00%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50%
-41,Gemini-1.5-Flash-002 (FC),78.91%,81.21%,73.21%,65.83%,86.50%,57.00%,54.00%,91.50%,80.00%,87.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50%
-42,palmyra-x-004 (FC),78.82%,70.23%,87.54%,71.42%,96.25%,58.00%,60.00%,31.00%,90.50%,88.00%,97.14%,100.00%,94.29%,88.00%,80.00%,85.00%,78.33%
-43,Open-Mistral-Nemo-2407 (FC),78.75%,82.44%,77.66%,64.75%,92.25%,34.00%,68.00%,93.00%,87.50%,84.50%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33%
-44,Mistral-Medium-2312 (Prompt),78.58%,73.04%,81.57%,70.17%,91.50%,57.00%,62.00%,88.50%,68.50%,65.00%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75%
-45,GPT-3.5-Turbo-0125 (FC),78.15%,83.81%,83.79%,74.25%,94.75%,62.00%,66.00%,93.00%,88.50%,79.50%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92%
-46,Qwen2.5-1.5B-Instruct (Prompt),78.03%,73.60%,85.61%,70.92%,88.75%,54.00%,70.00%,86.50%,70.00%,67.00%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42%
-47,Open-Mistral-Nemo-2407 (Prompt),78.00%,85.29%,89.07%,77.17%,92.50%,59.00%,80.00%,92.50%,87.00%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58%
-48,Command-R-Plus (FC) (Original),77.28%,78.58%,80.71%,68.83%,87.50%,61.00%,58.00%,91.50%,83.50%,70.50%,90.86%,96.00%,85.71%,90.00%,82.00%,60.00%,58.33%
-49,Gemini-1.5-Flash-001 (FC),76.45%,77.42%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.50%,76.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17%
-50,Claude-3.5-Sonnet-20241022 (Prompt),75.78%,72.90%,80.00%,80.58%,93.75%,68.00%,80.00%,92.00%,73.00%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42%
-51,Hermes-2-Pro-Llama-3-8B (FC),74.37%,76.42%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,72.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75%
-52,Qwen2-7B-Instruct (Prompt),72.71%,75.85%,76.80%,67.92%,83.75%,58.00%,62.00%,88.00%,74.00%,73.50%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75%
-53,xLAM-8x7b-r (FC),71.03%,67.33%,74.05%,73.33%,91.00%,59.00%,70.00%,90.00%,68.50%,37.50%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75%
-54,GPT-3.5-Turbo-0125 (Prompt),70.75%,72.75%,70.39%,77.50%,95.50%,61.00%,76.00%,92.50%,66.50%,54.50%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17%
-55,Hermes-2-Pro-Mistral-7B (FC),68.94%,72.67%,76.00%,60.67%,86.00%,56.00%,40.00%,87.00%,78.50%,64.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83%
-56,Hammer2.0-0.5b (FC),68.61%,67.19%,70.11%,63.25%,82.75%,53.00%,54.00%,80.50%,67.00%,58.00%,53.93%,95.00%,12.86%,84.00%,80.00%,62.50%,68.33%
-57,Open-Mixtral-8x7b (Prompt),66.21%,63.33%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,58.50%,44.00%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17%
-58,xLAM-7b-fc-r (FC),63.62%,70.33%,60.63%,76.83%,93.50%,65.00%,72.00%,94.00%,72.00%,38.50%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75%
-59,DBRX-Instruct (Prompt),62.36%,60.75%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,40.00%,37.50%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67%
-60,Gemini-1.0-Pro-002 (FC),61.84%,56.19%,64.93%,66.75%,94.25%,52.00%,54.00%,92.50%,39.00%,26.50%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08%
-61,Claude-3-Opus-20240229 (FC),60.07%,55.58%,59.46%,67.83%,88.50%,59.00%,56.00%,89.50%,37.00%,28.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42%
-62,MiniCPM3-4B (Prompt),59.24%,65.73%,50.59%,63.42%,84.25%,48.00%,58.00%,73.50%,63.00%,63.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92%
-63,Mistral-small-2402 (FC),58.96%,57.77%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,22.50%,47.00%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17%
-64,Open-Mixtral-8x22b (FC),58.82%,61.42%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,69.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17%
-65,Gemini-1.0-Pro-002 (Prompt),57.10%,58.40%,56.32%,47.58%,62.75%,26.00%,54.00%,60.50%,66.50%,59.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00%
-66,Nexusflow-Raven-v2 (FC),55.71%,46.12%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.50%,39.50%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42%
-67,Meta-Llama-3-8B-Instruct (Prompt),54.23%,60.79%,58.93%,62.67%,87.00%,47.00%,54.00%,83.00%,49.00%,48.50%,47.71%,84.00%,11.43%,86.00%,42.00%,60.00%,9.17%
-68,Claude-3-Haiku-20240307 (Prompt),54.22%,57.52%,55.62%,77.08%,96.25%,63.00%,72.00%,91.50%,38.50%,23.00%,94.00%,98.00%,90.00%,90.00%,6.00%,32.50%,35.42%
-69,Claude-3.5-Sonnet-20241022 (FC),49.66%,45.92%,47.89%,77.67%,94.00%,65.00%,74.00%,95.00%,6.50%,4.50%,97.57%,98.00%,97.14%,90.00%,4.00%,0.00%,71.67%
-70,Qwen2-1.5B-Instruct (Prompt),48.40%,54.52%,52.39%,51.08%,79.25%,38.00%,36.00%,78.00%,46.50%,42.50%,46.57%,76.00%,17.14%,76.00%,52.00%,35.00%,7.92%
-71,FireFunction-v1 (FC),47.07%,42.90%,44.57%,80.08%,92.25%,66.00%,82.00%,91.50%,0.00%,0.00%,88.29%,98.00%,78.57%,90.00%,0.00%,0.00%,73.75%
-72,GLM-4-9b-Chat (FC),46.55%,36.65%,46.00%,65.08%,86.25%,55.00%,54.00%,81.50%,0.00%,0.00%,94.00%,98.00%,90.00%,90.00%,0.00%,0.00%,88.33%
-73,Llama-3.1-8B-Instruct (FC),43.78%,47.92%,50.18%,55.67%,51.00%,56.00%,60.00%,54.00%,47.00%,35.00%,58.71%,66.00%,51.43%,58.00%,54.00%,30.00%,1.67%
-74,Claude-3-Haiku-20240307 (FC),42.95%,42.40%,48.41%,74.08%,95.25%,61.00%,66.00%,93.50%,2.00%,0.00%,91.64%,99.00%,84.29%,96.00%,6.00%,0.00%,23.33%
-75,xLAM-1b-fc-r (FC),37.71%,40.96%,42.95%,71.83%,83.50%,62.00%,70.00%,85.00%,5.50%,1.50%,77.79%,97.00%,58.57%,90.00%,4.00%,0.00%,3.75%
-76,Mistral-Small-2402 (Prompt),34.32%,27.06%,30.36%,23.25%,69.75%,0.00%,0.00%,74.00%,8.50%,2.50%,52.93%,43.00%,62.86%,64.00%,2.00%,2.50%,79.17%
-77,Llama-3.1-70B-Instruct (FC),31.45%,25.08%,31.62%,48.83%,24.50%,58.00%,64.00%,24.50%,12.50%,14.50%,53.00%,36.00%,70.00%,36.00%,30.00%,7.50%,56.25%
-78,Llama-3.2-1B-Instruct (Prompt),30.03%,27.60%,25.27%,29.42%,53.25%,13.00%,22.00%,33.50%,32.50%,15.00%,34.07%,61.00%,7.14%,28.00%,34.00%,5.00%,58.75%
-79,DeepSeek-Coder-V2-Lite-Instruct (FC),27.69%,4.75%,33.18%,0.00%,0.00%,0.00%,0.00%,2.00%,3.50%,13.50%,17.71%,24.00%,11.43%,42.00%,28.00%,45.00%,97.50%
-80,Gemma-2-2b-it (Prompt),23.23%,16.90%,19.12%,15.08%,35.25%,4.00%,6.00%,52.00%,0.00%,0.50%,22.50%,45.00%,0.00%,54.00%,0.00%,0.00%,65.00%
\ No newline at end of file
+1,Qwen2.5-72B-Instruct (Prompt),90.63%,90.81%,92.70%,80.25%,98.75%,62.00%,80.00%,97.50%,93.50%,92.00%,99.29%,100.00%,98.57%,94.00%,90.00%,87.50%,81.67%
+2,Functionary-Medium-v3.1 (FC),89.93%,89.88%,91.32%,76.00%,96.00%,64.00%,68.00%,97.00%,95.00%,91.50%,99.29%,100.00%,98.57%,94.00%,92.00%,80.00%,84.58%
+3,Gemini-1.5-Pro-002 (Prompt),89.10%,88.58%,91.27%,78.33%,95.00%,64.00%,76.00%,93.50%,92.50%,90.00%,98.57%,100.00%,97.14%,94.00%,90.00%,82.50%,82.50%
+4,ToolACE-8B (FC),88.93%,87.54%,89.21%,76.67%,91.00%,65.00%,74.00%,93.50%,90.50%,89.50%,97.36%,99.00%,95.71%,94.00%,88.00%,77.50%,93.33%
+5,gpt-4o-2024-11-20 (Prompt),88.79%,88.10%,89.38%,79.42%,96.25%,66.00%,76.00%,95.50%,94.00%,83.50%,100.00%,100.00%,100.00%,94.00%,86.00%,77.50%,89.17%
+6,DeepSeek-Coder-V2 (FC),88.54%,89.44%,91.23%,78.75%,96.25%,64.00%,76.00%,94.50%,93.50%,91.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17%
+7,watt-tool-8B (FC),88.32%,86.56%,89.34%,76.75%,93.25%,63.00%,74.00%,95.00%,94.00%,80.50%,97.86%,100.00%,95.71%,94.00%,88.00%,77.50%,91.25%
+8,gpt-4o-2024-11-20 (FC),88.08%,87.42%,89.20%,77.17%,91.50%,64.00%,76.00%,93.50%,93.00%,86.00%,88.29%,98.00%,78.57%,92.00%,94.00%,82.50%,86.25%
+9,Llama-3.1-70B-Instruct (Prompt),87.82%,89.98%,90.12%,77.92%,95.75%,62.00%,76.00%,96.00%,94.50%,91.50%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00%
+10,Gemma-2-27b-it (Prompt),87.09%,88.94%,89.09%,79.75%,94.25%,63.00%,82.00%,92.50%,91.50%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67%
+11,Qwen2.5-32B-Instruct (Prompt),87.03%,85.81%,89.79%,70.25%,96.75%,52.00%,62.00%,94.50%,90.50%,88.00%,96.64%,99.00%,94.29%,90.00%,90.00%,82.50%,80.83%
+12,Hammer2.1-7b (FC),86.88%,88.65%,85.48%,78.08%,96.25%,66.00%,72.00%,95.00%,93.50%,88.00%,86.43%,100.00%,72.86%,92.00%,86.00%,77.50%,85.42%
+13,Qwen2.5-14B-Instruct (Prompt),86.64%,85.69%,88.84%,73.25%,95.75%,56.00%,68.00%,92.50%,92.00%,85.00%,92.36%,99.00%,85.71%,90.00%,88.00%,85.00%,81.67%
+14,watt-tool-70B (FC),86.44%,84.06%,89.39%,78.75%,98.25%,64.00%,74.00%,94.00%,85.50%,78.00%,98.57%,100.00%,97.14%,94.00%,90.00%,75.00%,84.17%
+15,Gemini-1.5-Pro-001 (FC),86.01%,84.33%,87.95%,69.83%,92.50%,55.00%,62.00%,93.00%,92.00%,82.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00%
+16,Qwen2.5-7B-Instruct (Prompt),86.00%,86.46%,88.29%,75.33%,96.00%,60.00%,70.00%,94.50%,91.50%,84.50%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00%
+17,Gemini-1.5-Pro-001 (Prompt),85.82%,85.56%,85.77%,75.25%,93.75%,60.00%,72.00%,91.50%,91.50%,84.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08%
+18,Hammer2.1-3b (FC),85.79%,86.85%,84.09%,81.42%,95.25%,67.00%,82.00%,95.00%,89.50%,81.50%,82.86%,100.00%,65.71%,92.00%,84.00%,77.50%,88.33%
+19,Functionary-Small-v3.1 (FC),85.61%,86.75%,87.12%,74.00%,96.00%,62.00%,64.00%,94.50%,90.50%,88.00%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00%
+20,Gemma-2-9b-it (Prompt),85.18%,85.29%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.50%,86.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42%
+21,GPT-4-turbo-2024-04-09 (FC),85.02%,84.73%,85.21%,70.42%,92.25%,59.00%,60.00%,91.00%,90.00%,87.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42%
+22,Gemini-1.5-Pro-002 (FC),85.01%,87.29%,84.61%,73.17%,93.50%,58.00%,68.00%,95.00%,91.50%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50%
+23,Granite-20b-FunctionCalling (FC),84.89%,82.46%,86.36%,72.83%,90.50%,66.00%,62.00%,91.50%,84.00%,81.50%,84.93%,97.00%,72.86%,92.00%,86.00%,82.50%,88.75%
+24,FireFunction-v2 (FC),84.89%,88.46%,87.54%,80.33%,96.00%,65.00%,80.00%,94.00%,91.50%,88.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00%
+25,Meta-Llama-3-70B-Instruct (Prompt),84.72%,87.81%,88.21%,76.75%,95.25%,61.00%,74.00%,95.00%,92.50%,87.00%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33%
+26,DeepSeek-V3 (FC),84.66%,89.17%,83.39%,78.67%,97.00%,65.00%,74.00%,95.50%,91.00%,91.50%,62.57%,98.00%,27.14%,94.00%,92.00%,85.00%,71.67%
+27,Llama-3.3-70B-Instruct (Prompt),84.64%,85.08%,90.68%,74.83%,94.50%,60.00%,70.00%,94.50%,84.00%,87.00%,95.71%,100.00%,91.43%,98.00%,84.00%,85.00%,58.75%
+28,GPT-4-turbo-2024-04-09 (Prompt),84.63%,90.88%,89.45%,82.50%,96.50%,67.00%,84.00%,95.50%,93.50%,92.00%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42%
+29,Open-Mixtral-8x22b (Prompt),84.56%,88.02%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,90.00%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92%
+30,xLAM-8x22b-r (FC),84.49%,83.69%,87.88%,77.75%,95.25%,64.00%,74.00%,94.50%,86.50%,76.00%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17%
+31,GPT-4o-mini-2024-07-18 (Prompt),84.17%,86.77%,80.84%,80.08%,94.25%,66.00%,80.00%,90.50%,89.50%,87.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08%
+32,GPT-4o-mini-2024-07-18 (FC),83.76%,85.21%,83.57%,74.83%,90.50%,64.00%,70.00%,92.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75%
+33,o1-2024-12-17 (Prompt),83.57%,85.67%,79.77%,72.67%,92.00%,60.00%,66.00%,93.50%,91.50%,85.00%,58.57%,100.00%,17.14%,92.00%,86.00%,82.50%,90.42%
+34,Hammer2.1-1.5b (FC),83.49%,82.79%,83.39%,74.67%,90.00%,64.00%,70.00%,92.00%,84.50%,80.00%,86.57%,96.00%,77.14%,90.00%,82.00%,75.00%,86.67%
+35,Gemini-1.5-Flash-001 (Prompt),82.87%,85.69%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.50%,90.50%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75%
+36,claude-3.5-haiku-20241022 (Prompt),82.40%,83.19%,84.71%,76.25%,92.75%,64.00%,72.00%,93.00%,84.00%,79.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00%
+37,MiniCPM3-4B-FC (FC),82.39%,80.83%,87.57%,69.83%,90.50%,59.00%,60.00%,91.50%,82.50%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92%
+38,Command R7B (FC),82.29%,81.67%,84.02%,68.17%,92.50%,56.00%,56.00%,91.50%,85.50%,81.50%,87.07%,97.00%,77.14%,92.00%,82.00%,75.00%,77.92%
+39,o1-mini-2024-09-12 (Prompt),81.97%,78.92%,82.70%,71.17%,87.50%,62.00%,64.00%,89.00%,83.50%,72.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25%
+40,Llama-3.1-8B-Instruct (Prompt),81.89%,84.21%,86.30%,72.83%,93.50%,59.00%,66.00%,93.50%,87.00%,83.50%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00%
+41,mistral-large-2407 (FC),81.73%,86.81%,84.38%,74.25%,95.75%,61.00%,66.00%,92.50%,90.00%,90.50%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83%
+42,Gemini-1.5-Flash-002 (Prompt),81.16%,81.65%,80.64%,73.58%,94.75%,60.00%,66.00%,91.50%,90.00%,71.50%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25%
+43,mistral-large-2407 (Prompt),80.62%,90.54%,90.12%,82.17%,96.50%,66.00%,84.00%,97.00%,92.50%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92%
+44,Claude-3-Opus-20240229 (Prompt),79.99%,85.31%,86.32%,79.75%,96.25%,65.00%,78.00%,95.00%,85.50%,81.00%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33%
+45,Llama-3.2-3B-Instruct (Prompt),79.72%,80.56%,83.70%,73.75%,92.25%,57.00%,72.00%,92.00%,80.50%,76.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42%
+46,Qwen2.5-3B-Instruct (Prompt),79.22%,80.79%,81.71%,74.17%,91.50%,59.00%,72.00%,90.50%,79.50%,79.00%,80.86%,96.00%,65.71%,86.00%,80.00%,80.00%,62.92%
+47,Gemini-1.5-Flash-002 (FC),79.15%,81.75%,73.21%,65.50%,87.50%,57.00%,52.00%,91.50%,80.50%,89.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50%
+48,xLAM-7b-r (FC),79.03%,81.06%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,81.00%,73.50%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50%
+49,Ministral-8B-Instruct-2410 (FC),79.01%,83.83%,79.57%,71.83%,93.50%,60.00%,62.00%,91.50%,84.50%,87.50%,71.29%,94.00%,48.57%,86.00%,86.00%,75.00%,57.50%
+50,Mistral-Medium-2312 (Prompt),78.62%,73.12%,81.57%,69.50%,91.50%,57.00%,60.00%,88.50%,69.00%,65.50%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75%
+51,Open-Mistral-Nemo-2407 (FC),78.60%,82.10%,77.66%,64.42%,91.25%,34.00%,68.00%,93.50%,85.50%,85.00%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33%
+52,Open-Mistral-Nemo-2407 (Prompt),78.37%,86.12%,89.07%,77.00%,92.00%,59.00%,80.00%,93.50%,89.50%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58%
+53,GPT-3.5-Turbo-0125 (FC),78.20%,83.94%,83.79%,74.25%,94.75%,62.00%,66.00%,93.50%,89.00%,79.00%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92%
+54,Qwen2.5-1.5B-Instruct (Prompt),77.93%,73.37%,85.61%,71.00%,89.00%,54.00%,70.00%,86.00%,70.00%,66.50%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42%
+55,Gemini-1.5-Flash-001 (FC),76.51%,77.54%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.00%,77.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17%
+56,Command-R-Plus (FC),75.93%,77.02%,81.21%,72.08%,87.25%,59.00%,70.00%,89.50%,82.50%,64.00%,90.86%,96.00%,85.71%,90.00%,84.00%,60.00%,50.42%
+57,Claude-3.5-Sonnet-20241022 (Prompt),75.59%,72.48%,80.00%,81.42%,94.25%,68.00%,82.00%,92.00%,70.50%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42%
+58,Hermes-2-Pro-Llama-3-8B (FC),74.54%,76.79%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,80.00%,73.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75%
+59,Qwen2-7B-Instruct (Prompt),73.06%,76.65%,76.80%,68.08%,84.25%,58.00%,62.00%,88.00%,75.50%,75.00%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75%
+60,xLAM-8x7b-r (FC),71.17%,67.65%,74.05%,73.58%,91.75%,59.00%,70.00%,90.00%,69.00%,38.00%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75%
+61,GPT-3.5-Turbo-0125 (Prompt),70.79%,72.85%,70.39%,77.92%,96.75%,61.00%,76.00%,93.50%,67.00%,53.00%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17%
+62,Hammer2.1-0.5b (FC),70.70%,69.12%,70.46%,68.00%,84.00%,62.00%,58.00%,83.00%,71.50%,54.00%,68.36%,91.00%,45.71%,84.00%,82.00%,47.50%,77.92%
+63,Hermes-2-Pro-Mistral-7B (FC),69.12%,73.06%,76.00%,60.75%,86.25%,56.00%,40.00%,87.50%,78.50%,65.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83%
+64,Open-Mixtral-8x7b (Prompt),66.33%,63.58%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,59.00%,44.50%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17%
+65,xLAM-7b-fc-r (FC),64.40%,72.08%,60.63%,76.83%,93.50%,65.00%,72.00%,93.50%,77.00%,41.00%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75%
+66,DBRX-Instruct (Prompt),62.58%,61.25%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,42.50%,37.00%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67%
+67,Gemini-1.0-Pro-002 (FC),62.04%,56.65%,64.93%,66.58%,93.75%,52.00%,54.00%,95.00%,40.00%,25.00%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08%
+68,Claude-3-Opus-20240229 (FC),61.10%,57.92%,59.46%,67.17%,88.50%,59.00%,54.00%,93.00%,39.50%,32.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42%
+69,Mistral-small-2402 (FC),59.57%,59.15%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,24.50%,50.50%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17%
+70,MiniCPM3-4B (Prompt),59.31%,65.88%,50.59%,63.50%,84.50%,48.00%,58.00%,72.50%,65.50%,62.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92%
+71,Open-Mixtral-8x22b (FC),58.93%,61.67%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,70.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17%
+72,Gemini-1.0-Pro-002 (Prompt),56.62%,57.31%,56.32%,46.25%,58.75%,26.00%,54.00%,56.50%,63.50%,63.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00%
+73,Nexusflow-Raven-v2 (FC),55.59%,45.88%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.00%,39.00%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42%
+74,GoGoAgent,55.16%,10.92%,89.86%,43.67%,0.00%,63.00%,68.00%,0.00%,0.00%,0.00%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33%
+75,Meta-Llama-3-8B-Instruct (Prompt),54.23%,60.79%,58.93%,62.67%,87.00%,47.00%,54.00%,82.50%,48.00%,50.00%,47.71%,84.00%,11.43%,86.00%,42.00%,60.00%,9.17%
+76,Qwen2.5-0.5B-Instruct (Prompt),52.58%,53.19%,61.89%,58.25%,76.75%,44.00%,54.00%,68.00%,53.50%,33.00%,63.07%,89.00%,37.14%,70.00%,62.00%,52.50%,12.92%
+77,Claude-3.5-Sonnet-20241022 (FC),49.44%,45.44%,47.89%,78.75%,95.25%,65.00%,76.00%,94.50%,3.50%,5.00%,97.57%,98.00%,97.14%,90.00%,4.00%,0.00%,71.67%
+78,Qwen2-1.5B-Instruct (Prompt),48.29%,54.29%,52.39%,51.17%,79.50%,38.00%,36.00%,79.00%,46.50%,40.50%,46.57%,76.00%,17.14%,76.00%,52.00%,35.00%,7.92%
+79,claude-3.5-haiku-20241022 (FC),47.43%,40.62%,50.46%,68.00%,96.00%,56.00%,52.00%,92.00%,2.50%,0.00%,87.86%,100.00%,75.71%,90.00%,24.00%,0.00%,62.50%
+80,FireFunction-v1 (FC),47.12%,43.00%,44.57%,80.00%,92.00%,66.00%,82.00%,92.00%,0.00%,0.00%,88.29%,98.00%,78.57%,90.00%,0.00%,0.00%,73.75%
+81,GLM-4-9b-Chat (FC),46.56%,36.67%,46.00%,65.17%,86.50%,55.00%,54.00%,81.50%,0.00%,0.00%,94.00%,98.00%,90.00%,90.00%,0.00%,0.00%,88.33%
+82,o1-2024-12-17 (FC),44.46%,40.23%,38.66%,67.92%,93.75%,56.00%,54.00%,93.00%,0.00%,0.00%,60.64%,97.00%,24.29%,94.00%,0.00%,0.00%,84.58%
+83,Llama-3.1-8B-Instruct (FC),43.91%,48.21%,50.18%,55.83%,50.50%,57.00%,60.00%,54.00%,48.50%,34.50%,58.71%,66.00%,51.43%,58.00%,54.00%,30.00%,1.67%
+84,xLAM-1b-fc-r (FC),37.80%,41.17%,42.95%,71.67%,83.00%,62.00%,70.00%,86.00%,5.00%,2.00%,77.79%,97.00%,58.57%,90.00%,4.00%,0.00%,3.75%
+85,Mistral-Small-2402 (Prompt),34.26%,26.94%,30.36%,23.25%,69.75%,0.00%,0.00%,74.00%,8.50%,2.00%,52.93%,43.00%,62.86%,64.00%,2.00%,2.50%,79.17%
+86,Llama-3.1-70B-Instruct (FC),31.55%,25.29%,31.62%,49.17%,24.50%,59.00%,64.00%,24.50%,12.50%,15.00%,53.00%,36.00%,70.00%,36.00%,30.00%,7.50%,56.25%
+87,Llama-3.2-1B-Instruct (Prompt),30.40%,28.44%,25.27%,29.25%,52.75%,13.00%,22.00%,33.50%,36.00%,15.00%,34.07%,61.00%,7.14%,28.00%,34.00%,5.00%,58.75%
+88,DeepSeek-Coder-V2-Lite-Instruct (FC),27.75%,4.88%,33.18%,0.00%,0.00%,0.00%,0.00%,1.50%,3.50%,14.50%,17.71%,24.00%,11.43%,42.00%,28.00%,45.00%,97.50%
+89,Gemma-2-2b-it (Prompt),23.32%,17.10%,19.12%,15.42%,36.25%,4.00%,6.00%,52.00%,0.00%,1.00%,22.50%,45.00%,0.00%,54.00%,0.00%,0.00%,65.00%
\ No newline at end of file
diff --git a/data_overall.csv b/data_overall.csv
index 4326ddd7d..c007560bd 100644
--- a/data_overall.csv
+++ b/data_overall.csv
@@ -1,81 +1,90 @@
Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Relevance Detection,Irrelevance Detection,Organization,License
-1,67.54%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.22,2.52,6.3,5.13,84.56%,69.75%,91.00%,91.00%,86.50%,85.21%,87.36%,90.00%,86.00%,77.50%,79.56%,81.01%,77.59%,81.25%,66.67%,38.12%,54.00%,13.50%,35.50%,49.50%,70.59%,83.69%,OpenAI,Proprietary
-2,67.28%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,8.22,1.77,6.71,3.99,86.38%,75.00%,92.50%,92.50%,85.50%,78.91%,60.14%,92.00%,86.00%,77.50%,79.29%,76.36%,76.07%,81.25%,66.67%,39.12%,58.00%,10.00%,37.00%,51.50%,70.59%,87.03%,OpenAI,Proprietary
-3,66.29%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,12.8,1.45,9.21,2.57,80.88%,65.00%,85.50%,92.00%,81.00%,77.66%,61.14%,88.00%,84.00%,77.50%,80.84%,78.68%,72.46%,100.00%,75.00%,37.25%,44.00%,31.50%,29.50%,44.00%,52.94%,92.38%,OpenAI,Proprietary
-4,66.26%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,203.92,26.55,16.66,56.16,86.19%,76.75%,94.00%,90.00%,84.00%,88.70%,99.29%,94.00%,84.00%,77.50%,75.29%,82.17%,76.35%,81.25%,79.17%,36.88%,47.50%,38.50%,31.50%,30.00%,88.24%,75.77%,OpenAI,Proprietary
-5,63.62%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.55,9.91,3.08,84.58%,74.33%,90.00%,90.00%,84.00%,83.57%,83.29%,92.00%,84.00%,75.00%,73.24%,75.19%,75.12%,87.50%,70.83%,34.12%,47.50%,19.50%,29.00%,40.50%,82.35%,74.41%,OpenAI,Proprietary
-6,62.89%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.79,8.28,10.05,17.4,80.54%,70.67%,89.50%,82.00%,80.00%,82.70%,89.29%,86.00%,78.00%,77.50%,77.73%,72.87%,71.70%,75.00%,66.67%,28.25%,40.50%,5.00%,34.50%,33.00%,58.82%,89.16%,OpenAI,Proprietary
-7,62.53%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,12.14,55.42,33.2,89.52%,76.08%,96.50%,94.50%,91.00%,91.32%,99.29%,94.00%,92.00%,80.00%,76.45%,81.78%,82.62%,68.75%,75.00%,21.38%,31.50%,21.00%,26.50%,6.50%,72.22%,76.19%,MeetKai,MIT
-8,62.11%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.05,5.57,33.74,4.69,87.98%,78.92%,92.50%,91.50%,89.00%,91.27%,98.57%,94.00%,90.00%,82.50%,76.76%,81.01%,77.97%,93.75%,70.83%,20.75%,23.00%,19.50%,17.50%,23.00%,76.47%,78.21%,Google,Proprietary
-9,61.11%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.84,1.34,6.87,2.39,86.69%,79.25%,90.50%,89.00%,88.00%,80.84%,62.86%,96.00%,82.00%,82.50%,77.20%,79.84%,76.73%,93.75%,70.83%,22.00%,33.00%,12.00%,17.00%,26.00%,82.35%,81.92%,OpenAI,Proprietary
-10,61.04%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.39,2.17,2.59,4.12,87.40%,74.08%,94.00%,92.00%,89.50%,84.61%,75.93%,94.00%,86.00%,82.50%,76.44%,79.07%,75.50%,87.50%,75.00%,21.62%,31.00%,5.00%,21.00%,29.50%,76.47%,77.07%,Google,Proprietary
-11,60.89%,Qwen2.5-72B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,N/A,4.24,8.69,10.86,90.85%,80.92%,97.50%,93.50%,91.50%,92.07%,99.29%,94.00%,90.00%,85.00%,75.03%,84.11%,81.67%,62.50%,75.00%,17.25%,23.50%,20.00%,13.50%,12.00%,94.44%,72.98%,Qwen,apache-2.0
-12,60.17%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.0,1.54,4.69,2.39,84.06%,74.75%,90.50%,91.00%,80.00%,85.77%,91.57%,90.00%,84.00%,77.50%,76.49%,75.58%,71.98%,93.75%,75.00%,18.88%,26.00%,5.00%,21.50%,23.00%,52.94%,84.70%,Google,Proprietary
-13,59.50%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,58.87,1.29,1.36,2.7,90.98%,81.92%,95.50%,94.00%,92.50%,89.45%,99.29%,96.00%,80.00%,82.50%,63.56%,86.05%,84.24%,100.00%,79.17%,30.25%,42.50%,25.00%,20.50%,33.00%,100.00%,35.46%,OpenAI,Proprietary
-14,59.00%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,5.1,1.44,1.85,2.49,83.23%,69.92%,92.00%,90.50%,80.50%,87.95%,91.79%,92.00%,88.00%,80.00%,75.47%,73.26%,70.18%,81.25%,58.33%,16.00%,24.50%,3.00%,15.50%,21.00%,58.82%,84.05%,Google,Proprietary
-15,58.43%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.68,3.12,10.75,6.19,86.98%,74.42%,93.00%,90.50%,90.00%,84.38%,75.00%,94.00%,86.00%,82.50%,69.73%,85.66%,78.16%,68.75%,75.00%,23.75%,33.50%,18.00%,23.50%,20.00%,76.47%,52.80%,Mistral AI,Proprietary
-16,58.31%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,4.6,15.67,9.25,87.29%,76.67%,94.00%,90.00%,88.50%,89.21%,97.36%,94.00%,88.00%,77.50%,78.37%,72.48%,76.54%,81.25%,70.83%,7.75%,7.50%,11.50%,5.00%,7.00%,77.78%,87.88%,Huawei Noah & USTC,Apache-2.0
-17,58.03%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,7.86,10.86,17.45,83.58%,77.33%,93.50%,88.00%,75.50%,87.88%,95.00%,94.00%,90.00%,72.50%,73.39%,83.33%,80.15%,62.50%,75.00%,16.25%,25.50%,16.00%,11.50%,12.00%,88.89%,68.21%,Salesforce,cc-by-nc-4.0
-18,56.93%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.48,0.7,1.04,1.08,85.44%,70.75%,90.00%,91.00%,90.00%,83.59%,80.36%,92.00%,82.00%,80.00%,68.53%,74.81%,75.78%,93.75%,79.17%,19.50%,27.50%,20.00%,12.00%,18.50%,82.35%,62.89%,Google,Proprietary
-19,56.52%,Claude-3-Opus-20240229 (FC),https://www.anthropic.com/news/claude-3-family,20.16,9.46,9.98,17.43,55.58%,67.83%,89.50%,37.00%,28.00%,59.46%,80.36%,88.00%,42.00%,27.50%,79.24%,79.84%,77.40%,18.75%,29.17%,30.25%,41.50%,14.00%,33.50%,32.00%,76.47%,82.10%,Anthropic,Proprietary
-20,56.20%,Claude-3.5-Sonnet-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,2.54,3.07,5.61,4.72,45.92%,77.67%,95.00%,6.50%,4.50%,47.89%,97.57%,90.00%,4.00%,0.00%,77.96%,82.17%,81.10%,31.25%,12.50%,41.00%,55.00%,19.00%,42.50%,47.50%,70.59%,73.70%,Anthropic,Proprietary
-21,56.16%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.3,0.74,1.05,1.3,81.21%,65.83%,91.50%,80.00%,87.50%,73.21%,68.86%,90.00%,54.00%,80.00%,77.96%,71.71%,70.47%,81.25%,75.00%,11.62%,19.00%,0.50%,10.50%,16.50%,58.82%,90.81%,Google,Proprietary
-22,56.10%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,35.22,60.13,142.21,86.38%,74.00%,94.00%,90.00%,87.50%,87.12%,89.50%,94.00%,90.00%,75.00%,72.99%,78.68%,77.49%,75.00%,58.33%,9.88%,17.00%,2.50%,14.00%,6.00%,83.33%,70.55%,MeetKai,MIT
-23,56.00%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.46,0.8,1.07,1.38,79.69%,74.25%,91.50%,86.00%,67.00%,80.64%,93.57%,92.00%,82.00%,55.00%,75.20%,77.13%,74.26%,93.75%,58.33%,12.50%,17.50%,6.00%,11.50%,15.00%,88.24%,78.44%,Google,Proprietary
-24,55.78%,palmyra-x-004 (FC),https://writer.com/engineering/actions-with-palmyra-x-004/,24.94,2.76,10.69,5.48,70.23%,71.42%,31.00%,90.50%,88.00%,87.54%,97.14%,88.00%,80.00%,85.00%,77.16%,75.19%,75.21%,50.00%,62.50%,11.37%,12.00%,2.50%,18.50%,12.50%,70.59%,79.70%,Writer,Proprietary
-25,55.45%,DeepSeek-Coder-V2 (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724,N/A,27.82,108.59,55.51,89.15%,78.08%,95.00%,93.50%,90.00%,91.23%,96.43%,94.00%,92.00%,82.50%,73.43%,80.62%,77.30%,50.00%,70.83%,4.50%,7.50%,3.00%,4.00%,3.50%,83.33%,70.59%,DeepSeek,DeepSeek License
-26,55.28%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.31,0.58,0.69,0.84,77.42%,65.17%,94.50%,73.50%,76.50%,74.80%,62.21%,88.00%,74.00%,75.00%,75.51%,72.09%,73.31%,62.50%,58.33%,13.87%,19.00%,3.50%,14.00%,19.00%,58.82%,79.66%,Google,Proprietary
-27,55.19%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,7.27,19.18,23.97,90.50%,80.50%,95.50%,94.00%,92.00%,88.62%,89.50%,94.00%,86.00%,85.00%,71.75%,75.97%,77.59%,81.25%,75.00%,5.50%,9.00%,2.00%,7.00%,4.00%,94.44%,70.57%,MadeAgents,cc-by-nc-4.0
-28,54.75%,GoGoAgent,https://gogoagent.ai,N/A,2.18,1.49,4.89,85.75%,74.50%,92.00%,89.50%,87.00%,89.86%,95.43%,96.00%,88.00%,80.00%,74.84%,74.81%,72.08%,81.25%,66.67%,1.00%,1.50%,2.00%,0.50%,0.00%,94.12%,85.61%,BitAgent,Proprietary
-29,54.10%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.87,1.53,3.63,82.98%,76.92%,93.50%,84.00%,77.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.24%,81.01%,73.98%,87.50%,58.33%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,66.24%,Anthropic,Proprietary
-30,54.05%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.46,14.68,14.06,89.85%,77.92%,96.50%,94.00%,91.00%,90.12%,94.00%,98.00%,86.00%,82.50%,62.02%,77.52%,75.97%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,94.44%,54.84%,Meta,Meta Llama 3 Community
-31,53.54%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.39,0.88,0.94,1.48,83.81%,74.25%,93.00%,88.50%,79.50%,83.79%,96.14%,88.00%,86.00%,65.00%,62.98%,77.91%,78.35%,50.00%,54.17%,19.50%,32.50%,11.50%,21.50%,12.50%,94.12%,36.53%,OpenAI,Proprietary
-32,53.53%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.03,11.55,9.68,86.48%,75.92%,95.00%,91.00%,84.00%,88.29%,92.14%,90.00%,86.00%,85.00%,66.95%,74.81%,74.45%,62.50%,66.67%,7.62%,9.50%,8.50%,7.00%,5.50%,83.33%,65.22%,Qwen,apache-2.0
-33,53.12%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.19,3.93,87.10%,79.92%,93.00%,90.50%,85.00%,87.54%,96.64%,92.00%,84.00%,77.50%,66.44%,76.74%,75.50%,56.25%,58.33%,8.62%,13.50%,7.00%,11.00%,3.00%,88.24%,56.30%,Fireworks,Apache 2.0
-34,52.75%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.8,25.61,24.81,80.81%,74.25%,95.50%,80.50%,73.00%,79.88%,74.00%,96.00%,82.00%,67.50%,69.35%,71.32%,74.45%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,65.10%,Salesforce,cc-by-nc-4.0
-35,52.54%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.5,4.66,9.49,67.33%,73.33%,90.00%,68.50%,37.50%,74.05%,89.21%,90.00%,72.00%,45.00%,71.08%,72.48%,78.16%,62.50%,66.67%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.88%,Salesforce,cc-by-nc-4.0
-36,52.05%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.91,17.8,9.97,73.04%,70.17%,88.50%,68.50%,65.00%,81.57%,93.29%,86.00%,72.00%,75.00%,77.20%,74.03%,73.69%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,64.71%,85.93%,Mistral AI,Proprietary
-37,51.89%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,2.31,2.61,6.8,89.10%,80.42%,93.00%,91.00%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,66.15%,83.33%,78.06%,68.75%,58.33%,2.38%,4.50%,2.00%,1.50%,1.50%,88.89%,59.13%,Google,gemma-terms-of-use
-38,51.79%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,2.88,3.13,8.77,87.77%,76.58%,94.50%,92.50%,87.50%,88.21%,95.86%,94.00%,78.00%,85.00%,65.04%,81.01%,78.54%,75.00%,70.83%,5.62%,10.00%,4.00%,6.00%,2.50%,94.44%,50.82%,Meta,Meta Llama 3 Community
-39,51.74%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.78,1.33,3.22,72.90%,80.58%,92.00%,73.00%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.96%,86.05%,80.44%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,76.47%,64.29%,Anthropic,Proprietary
-40,51.59%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,159.07,185.16,464.0,81.06%,69.75%,92.00%,83.00%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.66%,72.87%,63.63%,37.50%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,77.78%,72.22%,openbmb,Apache-2.0
-41,51.42%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.36,8.6,13.34,84.92%,75.67%,90.50%,88.00%,85.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.61%,73.26%,74.26%,56.25%,66.67%,1.62%,2.00%,4.00%,0.50%,0.00%,77.78%,66.73%,Google,gemma-terms-of-use
-42,51.26%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.41,8.27,10.61,85.02%,79.08%,95.00%,85.50%,80.50%,86.32%,99.29%,90.00%,86.00%,70.00%,66.80%,84.11%,78.73%,75.00%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,82.35%,40.36%,Anthropic,Proprietary
-43,51.01%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.75,1.94,3.61,82.44%,64.75%,93.00%,87.50%,84.50%,77.66%,56.14%,94.00%,88.00%,72.50%,65.16%,75.19%,68.28%,75.00%,70.83%,9.12%,15.00%,3.50%,9.00%,9.00%,64.71%,63.25%,Mistral AI,Proprietary
-44,50.58%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.59,52.72,24.5,84.02%,72.58%,93.50%,87.00%,83.00%,86.30%,83.71%,96.00%,88.00%,77.50%,60.68%,73.26%,72.36%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,72.22%,49.10%,Meta,Meta Llama 3 Community
-45,50.28%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.39,6.27,3.15,87.90%,78.58%,94.00%,89.50%,89.50%,87.77%,93.57%,96.00%,84.00%,77.50%,65.82%,80.62%,72.84%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,82.35%,55.09%,Mistral AI,Proprietary
-46,50.12%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,3.66,6.19,8.01,83.85%,74.42%,90.50%,87.50%,83.00%,87.70%,92.79%,92.00%,86.00%,80.00%,64.86%,74.03%,68.47%,56.25%,70.83%,1.75%,2.00%,1.00%,1.50%,2.50%,83.33%,62.49%,MadeAgents,cc-by-nc-4.0
-47,49.43%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,15.82,1.21,0.78,2.08,78.79%,71.67%,88.50%,82.00%,73.00%,84.68%,93.21%,92.00%,76.00%,77.50%,68.27%,75.58%,76.26%,81.25%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,82.35%,59.46%,Cohere For AI,cc-by-nc-4.0
-48,49.19%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.97,1.94,5.3,82.21%,72.83%,91.00%,84.00%,81.00%,86.59%,86.36%,92.00%,88.00%,80.00%,59.22%,67.44%,55.56%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,75.29%,IBM,Apache-2.0
-49,48.33%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.74,1.79,1.21,72.75%,77.50%,92.50%,66.50%,54.50%,70.39%,57.57%,90.00%,74.00%,60.00%,68.62%,78.29%,78.25%,75.00%,62.50%,5.62%,9.00%,2.00%,7.00%,4.50%,94.12%,59.01%,OpenAI,Proprietary
-50,47.22%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.39,3.96,6.93,90.60%,82.92%,97.00%,92.00%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.62%,86.05%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.18%,Mistral AI,Proprietary
-51,47.11%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,4.14,3.98,8.7,76.42%,64.17%,89.50%,79.50%,72.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.59%,69.77%,65.53%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.84%,NousResearch,apache-2.0
-52,46.75%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.85,45.67,14.72,79.98%,74.42%,92.00%,79.50%,74.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.53%,63.18%,64.39%,18.75%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,83.33%,51.75%,Meta,Meta Llama 3 Community
-53,46.54%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.55,6.34,4.62,73.60%,70.92%,86.50%,70.00%,67.00%,85.61%,80.43%,94.00%,88.00%,80.00%,60.46%,68.60%,58.50%,56.25%,50.00%,1.12%,1.50%,2.50%,0.50%,0.00%,77.78%,62.92%,Qwen,apache-2.0
-54,46.06%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,5.43,2.63,7.01,3.81,78.58%,68.83%,91.50%,83.50%,70.50%,80.71%,90.86%,90.00%,82.00%,60.00%,58.89%,68.60%,61.82%,50.00%,45.83%,2.00%,3.50%,0.00%,1.50%,3.00%,100.00%,55.30%,Cohere For AI,cc-by-nc-4.0
-55,44.69%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,2.01,4.03,3.66,57.77%,67.58%,94.00%,22.50%,47.00%,53.84%,87.36%,92.00%,16.00%,20.00%,72.49%,64.34%,72.17%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,82.35%,80.97%,Mistral AI,Proprietary
-56,44.24%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.39,2.81,3.41,56.19%,66.75%,92.50%,39.00%,26.50%,64.93%,87.21%,86.00%,64.00%,22.50%,68.00%,73.26%,65.53%,37.50%,37.50%,2.88%,4.50%,1.00%,3.50%,2.50%,76.47%,71.36%,Google,Proprietary
-57,43.02%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,11.27,32.66,24.55,72.67%,60.67%,87.00%,78.50%,64.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.49%,67.44%,60.11%,50.00%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.82%,NousResearch,apache-2.0
-58,43.01%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,6.92,2.72,10.47,5.41,61.42%,71.67%,94.00%,10.50%,69.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.71%,75.19%,73.41%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,82.35%,45.93%,Mistral AI,Proprietary
-59,42.75%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,2.05,4.47,3.63,63.33%,64.83%,86.00%,58.50%,44.00%,69.61%,77.93%,86.00%,62.00%,52.50%,60.53%,60.85%,65.05%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.24%,59.41%,Mistral AI,Proprietary
-60,42.32%,Claude-3-Haiku-20240307 (FC),https://www.anthropic.com/news/claude-3-family,0.23,1.63,2.5,2.52,42.40%,74.08%,93.50%,2.00%,0.00%,48.41%,91.64%,96.00%,6.00%,0.00%,59.51%,79.07%,77.87%,0.00%,0.00%,24.50%,35.50%,11.50%,22.00%,29.00%,100.00%,28.56%,Anthropic,Proprietary
-61,42.31%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.86,10.01,3.37,85.29%,77.17%,92.50%,87.00%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.67%,77.13%,73.31%,87.50%,70.83%,0.25%,0.50%,0.00%,0.00%,0.50%,94.12%,6.66%,Mistral AI,Proprietary
-62,42.17%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.43,10.0,8.0,75.85%,67.92%,88.00%,74.00%,73.50%,76.80%,80.21%,84.00%,78.00%,65.00%,50.56%,56.20%,61.73%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,83.33%,39.22%,Qwen,apache-2.0
-63,40.98%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.89,8.21,11.37,60.75%,73.50%,92.00%,40.00%,37.50%,69.14%,90.07%,88.00%,46.00%,52.50%,60.58%,77.52%,73.31%,75.00%,45.83%,0.00%,0.00%,0.00%,0.00%,0.00%,94.12%,40.79%,Databricks,Databricks Open Model
-64,40.78%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,3.67,12.44,6.92,67.19%,63.25%,80.50%,67.00%,58.00%,70.11%,53.93%,84.00%,80.00%,62.50%,53.22%,51.94%,44.25%,56.25%,41.67%,0.50%,0.50%,0.00%,0.50%,1.00%,72.22%,66.25%,MadeAgents,cc-by-nc-4.0
-65,40.02%,Claude-3-Haiku-20240307 (Prompt),https://www.anthropic.com/news/claude-3-family,0.21,1.12,2.15,2.13,57.52%,77.08%,91.50%,38.50%,23.00%,55.62%,94.00%,90.00%,6.00%,32.50%,64.22%,77.13%,74.17%,56.25%,54.17%,1.62%,3.50%,0.00%,0.00%,3.00%,70.59%,42.14%,Anthropic,Proprietary
-66,39.67%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.43,4.83,3.93,42.90%,80.08%,91.50%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,69.56%,68.99%,71.79%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.12%,71.74%,Fireworks,Apache 2.0
-67,39.21%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,7.89,5.38,18.44,70.33%,76.83%,94.00%,72.00%,38.50%,60.63%,84.50%,92.00%,56.00%,10.00%,54.02%,78.29%,57.36%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,46.20%,Salesforce,cc-by-nc-4.0
-68,38.85%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,5.03,13.94,11.28,36.65%,65.08%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.50%,71.32%,64.10%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.65%,THUDM,glm-4
-69,38.48%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.44,49.11,65.53,65.73%,63.42%,73.50%,63.00%,63.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.20%,45.35%,34.19%,43.75%,45.83%,2.00%,3.00%,3.50%,1.00%,0.50%,55.56%,74.49%,openbmb,Apache-2.0
-70,36.98%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.12,0.55,2.27,46.12%,57.50%,53.00%,34.50%,39.50%,59.11%,47.93%,86.00%,40.00%,62.50%,54.22%,41.47%,38.75%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,58.82%,78.59%,Nexusflow,Apache 2.0
-71,35.76%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.36,2.41,3.08,58.40%,47.58%,60.50%,66.50%,59.00%,56.32%,49.79%,68.00%,60.00%,47.50%,48.80%,48.06%,46.53%,62.50%,37.50%,1.38%,2.50%,1.50%,0.50%,1.00%,82.35%,53.07%,Google,Proprietary
-72,34.24%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.29,8.88,20.8,60.79%,62.67%,83.00%,49.00%,48.50%,58.93%,47.71%,86.00%,42.00%,60.00%,47.76%,59.30%,61.73%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.42%,Meta,Meta Llama 3 Community
-73,31.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.91,1.91,0.85,3.48,27.06%,23.25%,74.00%,8.50%,2.50%,30.36%,52.93%,64.00%,2.00%,2.50%,58.18%,34.50%,64.20%,0.00%,4.17%,0.75%,0.50%,0.00%,1.50%,1.00%,58.82%,69.80%,Mistral AI,Proprietary
-74,29.08%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,3.13,12.5,5.47,54.52%,51.08%,78.00%,46.50%,42.50%,52.39%,46.57%,76.00%,52.00%,35.00%,38.34%,47.67%,39.41%,18.75%,25.00%,0.50%,0.50%,1.00%,0.00%,0.50%,83.33%,21.02%,Qwen,apache-2.0
-75,27.45%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,6.23,16.77,14.39,47.92%,55.67%,54.00%,47.00%,35.00%,50.18%,58.71%,58.00%,54.00%,30.00%,33.19%,50.00%,48.62%,37.50%,37.50%,5.38%,5.00%,7.50%,5.00%,4.00%,94.44%,5.03%,Meta,Meta Llama 3 Community
-76,27.20%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,4.67,11.84,10.63,25.08%,48.83%,24.50%,12.50%,14.50%,31.62%,53.00%,36.00%,30.00%,7.50%,45.27%,51.94%,52.90%,31.25%,25.00%,4.88%,7.00%,4.00%,4.50%,4.00%,100.00%,45.08%,Meta,Meta Llama 3 Community
-77,25.12%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,6.65,14.46,13.9,40.96%,71.83%,85.00%,5.50%,1.50%,42.95%,77.79%,90.00%,4.00%,0.00%,37.54%,65.89%,53.56%,0.00%,0.00%,0.12%,0.50%,0.00%,0.00%,0.00%,100.00%,7.15%,Salesforce,cc-by-nc-4.0
-78,22.48%,DeepSeek-Coder-V2-Lite-Instruct (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,N/A,16.78,38.81,52.93,4.75%,0.00%,2.00%,3.50%,13.50%,33.18%,17.71%,42.00%,28.00%,45.00%,39.63%,1.94%,3.70%,6.25%,12.50%,0.12%,0.50%,0.00%,0.00%,0.00%,5.56%,96.54%,DeepSeek,DeepSeek License
-79,22.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,3.86,7.15,11.95,16.90%,15.08%,52.00%,0.00%,0.50%,19.12%,22.50%,54.00%,0.00%,0.00%,43.40%,26.74%,18.42%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,72.58%,Google,gemma-terms-of-use
-80,20.46%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.21,18.2,34.15,27.60%,29.42%,33.50%,32.50%,15.00%,25.27%,34.07%,28.00%,34.00%,5.00%,31.36%,30.62%,7.50%,12.50%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,33.33%,59.93%,Meta,Meta Llama 3 Community
\ No newline at end of file
+1,74.24%,watt-tool-70B (FC),https://huggingface.co/watt-ai/watt-tool-70B/,N/A,3.4,12.61,7.7,84.06%,78.75%,94.00%,85.50%,78.00%,89.39%,98.57%,94.00%,90.00%,75.00%,77.65%,84.88%,83.48%,81.25%,66.67%,58.62%,67.00%,57.50%,48.50%,61.50%,94.44%,76.32%,Watt AI Lab,Apache-2.0
+2,72.02%,gpt-4o-2024-11-20 (Prompt),https://openai.com/index/hello-gpt-4o/,N/A,N/A,N/A,N/A,88.10%,79.42%,95.50%,94.00%,83.50%,89.38%,100.00%,94.00%,86.00%,77.50%,79.65%,83.72%,79.77%,87.50%,70.83%,47.62%,59.00%,41.00%,35.50%,55.00%,83.33%,83.76%,OpenAI,Proprietary
+3,69.56%,gpt-4o-2024-11-20 (FC),https://openai.com/index/hello-gpt-4o/,N/A,N/A,N/A,N/A,87.42%,77.17%,93.50%,93.00%,86.00%,89.20%,88.29%,92.00%,94.00%,82.50%,79.61%,81.01%,78.82%,87.50%,75.00%,41.00%,62.50%,6.00%,37.50%,58.00%,83.33%,83.15%,OpenAI,Proprietary
+4,67.94%,watt-tool-8B (FC),https://huggingface.co/watt-ai/watt-tool-8B/,N/A,1.31,2.79,4.04,86.56%,76.75%,95.00%,94.00%,80.50%,89.34%,97.86%,94.00%,88.00%,77.50%,76.37%,75.97%,77.49%,87.50%,66.67%,39.13%,47.00%,41.50%,27.50%,40.50%,83.33%,83.15%,Watt AI Lab,Apache-2.0
+5,67.87%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.22,2.47,6.27,5.08,84.73%,70.42%,91.00%,90.00%,87.50%,85.21%,87.36%,90.00%,86.00%,77.50%,80.45%,83.33%,78.63%,81.25%,70.83%,38.12%,54.00%,13.50%,35.50%,49.50%,72.22%,83.81%,OpenAI,Proprietary
+6,66.68%,o1-2024-12-17 (Prompt),https://openai.com/o1/,N/A,N/A,N/A,N/A,85.67%,72.67%,93.50%,91.50%,85.00%,79.77%,58.57%,92.00%,86.00%,82.50%,80.45%,81.78%,76.54%,81.25%,70.83%,36.00%,50.50%,0.50%,48.50%,44.50%,72.22%,87.78%,OpenAI,Proprietary
+7,64.09%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.49,9.88,3.01,85.21%,74.83%,92.00%,90.00%,84.00%,83.57%,83.29%,92.00%,84.00%,75.00%,74.37%,78.29%,76.16%,87.50%,70.83%,34.12%,47.50%,19.50%,29.00%,40.50%,83.33%,74.75%,OpenAI,Proprietary
+8,62.76%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.76,8.44,10.06,17.57,78.92%,71.17%,89.00%,83.50%,72.00%,82.70%,89.29%,86.00%,78.00%,77.50%,78.05%,71.71%,71.60%,75.00%,79.17%,28.25%,40.50%,5.00%,34.50%,33.00%,61.11%,89.62%,OpenAI,Proprietary
+9,62.63%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,14.06,57.4,35.06,89.88%,76.00%,97.00%,95.00%,91.50%,91.32%,99.29%,94.00%,92.00%,80.00%,76.59%,81.01%,83.29%,68.75%,75.00%,21.38%,31.50%,21.00%,26.50%,6.50%,72.22%,76.08%,MeetKai,MIT
+10,62.13%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.05,5.94,33.79,6.47,88.58%,78.33%,93.50%,92.50%,90.00%,91.27%,98.57%,94.00%,90.00%,82.50%,76.54%,81.78%,77.40%,87.50%,79.17%,20.75%,23.00%,19.50%,17.50%,23.00%,72.22%,78.15%,Google,Proprietary
+11,61.80%,Hammer2.1-7b (FC),https://huggingface.co/MadeAgents/Hammer2.1-7b,N/A,2.08,4.12,5.38,88.65%,78.08%,95.00%,93.50%,88.00%,85.48%,86.43%,92.00%,86.00%,77.50%,75.02%,76.36%,77.40%,81.25%,66.67%,23.50%,35.50%,25.50%,19.00%,14.00%,82.35%,78.59%,MadeAgents,cc-by-nc-4.0
+12,61.28%,Qwen2.5-72B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,N/A,3.72,6.88,9.64,90.81%,80.25%,97.50%,93.50%,92.00%,92.70%,99.29%,94.00%,90.00%,87.50%,75.21%,84.50%,82.15%,62.50%,75.00%,18.00%,24.50%,20.00%,15.50%,12.00%,100.00%,72.81%,Qwen,qwen
+13,60.94%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.39,2.07,2.64,4.07,87.29%,73.17%,95.00%,91.50%,89.50%,84.61%,75.93%,94.00%,86.00%,82.50%,76.19%,79.46%,75.21%,87.50%,75.00%,21.62%,31.00%,5.00%,21.00%,29.50%,72.22%,76.90%,Google,Proprietary
+14,60.83%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.84,1.31,6.89,2.39,86.77%,80.08%,90.50%,89.50%,87.00%,80.84%,62.86%,96.00%,82.00%,82.50%,76.32%,80.23%,76.73%,93.75%,75.00%,22.00%,33.00%,12.00%,17.00%,26.00%,83.33%,80.67%,OpenAI,Proprietary
+15,60.44%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,7.0,1.54,4.69,2.38,85.56%,75.25%,91.50%,91.50%,84.00%,85.77%,91.57%,90.00%,84.00%,77.50%,76.63%,75.97%,71.98%,93.75%,75.00%,18.88%,26.00%,5.00%,21.50%,23.00%,55.56%,84.81%,Google,Proprietary
+16,59.64%,Qwen2.5-32B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-32B-Instruct,N/A,2.26,4.62,5.92,85.81%,70.25%,94.50%,90.50%,88.00%,89.79%,96.64%,90.00%,90.00%,82.50%,74.14%,82.17%,78.54%,62.50%,58.33%,17.75%,25.00%,20.00%,15.00%,11.00%,100.00%,73.75%,Qwen,apache-2.0
+17,59.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,58.87,1.24,1.33,2.58,90.88%,82.50%,95.50%,93.50%,92.00%,89.45%,99.29%,96.00%,80.00%,82.50%,63.71%,87.21%,84.14%,100.00%,75.00%,30.25%,42.50%,25.00%,20.50%,33.00%,100.00%,35.57%,OpenAI,Proprietary
+18,59.42%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,5.1,1.43,1.85,2.48,84.33%,69.83%,93.00%,92.00%,82.50%,87.95%,91.79%,92.00%,88.00%,80.00%,76.23%,75.58%,70.75%,81.25%,62.50%,16.00%,24.50%,3.00%,15.50%,21.00%,50.00%,84.39%,Google,Proprietary
+19,59.03%,Hammer2.1-3b (FC),https://huggingface.co/MadeAgents/Hammer2.1-3b,N/A,1.95,4.31,5.09,86.85%,81.42%,95.00%,89.50%,81.50%,84.09%,82.86%,92.00%,84.00%,77.50%,73.91%,72.48%,73.31%,62.50%,62.50%,17.38%,27.50%,17.50%,14.50%,10.00%,82.35%,81.87%,MadeAgents,qwen-research
+20,58.44%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.68,3.12,10.75,6.21,86.81%,74.25%,92.50%,90.00%,90.50%,84.38%,75.00%,94.00%,86.00%,82.50%,69.84%,84.88%,78.54%,62.50%,79.17%,23.75%,33.50%,18.00%,23.50%,20.00%,72.22%,52.85%,Mistral AI,Proprietary
+21,58.39%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,5.24,15.7,9.8,87.54%,76.67%,93.50%,90.50%,89.50%,89.21%,97.36%,94.00%,88.00%,77.50%,78.50%,72.48%,76.73%,81.25%,70.83%,7.75%,7.50%,11.50%,5.00%,7.00%,83.33%,87.88%,Huawei Noah & USTC,Apache-2.0
+22,57.76%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,9.26,11.66,21.27,83.69%,77.75%,94.50%,86.50%,76.00%,87.88%,95.00%,94.00%,90.00%,72.50%,72.55%,79.46%,79.68%,81.25%,75.00%,16.25%,25.50%,16.00%,11.50%,12.00%,88.89%,67.81%,Salesforce,cc-by-nc-4.0
+23,57.62%,Qwen2.5-14B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-14B-Instruct,N/A,2.02,4.99,5.0,85.69%,73.25%,92.50%,92.00%,85.00%,88.84%,92.36%,90.00%,88.00%,85.00%,74.10%,74.03%,75.78%,62.50%,66.67%,12.12%,18.50%,11.50%,12.00%,6.50%,77.78%,77.06%,Qwen,apache-2.0
+24,57.20%,DeepSeek-V3 (FC),https://api-docs.deepseek.com/news/news1226,N/A,2.58,5.84,4.29,89.17%,78.67%,95.50%,91.00%,91.50%,83.39%,62.57%,94.00%,92.00%,85.00%,68.33%,82.95%,82.15%,81.25%,62.50%,18.62%,21.00%,20.50%,19.00%,14.00%,88.89%,59.36%,DeepSeek,DeepSeek License
+25,57.08%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.48,0.71,1.04,1.09,85.69%,70.75%,90.00%,91.50%,90.50%,83.59%,80.36%,92.00%,82.00%,80.00%,68.86%,76.74%,76.16%,93.75%,79.17%,19.50%,27.50%,20.00%,12.00%,18.50%,83.33%,62.78%,Google,Proprietary
+26,56.73%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.46,0.81,1.07,1.38,81.65%,73.58%,91.50%,90.00%,71.50%,80.64%,93.57%,92.00%,82.00%,55.00%,76.54%,80.62%,76.16%,93.75%,62.50%,12.50%,17.50%,6.00%,11.50%,15.00%,83.33%,78.49%,Google,Proprietary
+27,56.43%,Claude-3.5-Sonnet-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,2.53,3.07,5.61,4.78,45.44%,78.75%,94.50%,3.50%,5.00%,47.89%,97.57%,90.00%,4.00%,0.00%,78.85%,83.33%,81.96%,25.00%,20.83%,41.00%,55.00%,19.00%,42.50%,47.50%,77.78%,74.04%,Anthropic,Proprietary
+28,56.43%,Claude-3-Opus-20240229 (FC),https://www.anthropic.com/news/claude-3-family,20.15,9.46,9.94,17.14,57.92%,67.17%,93.00%,39.50%,32.00%,59.46%,80.36%,88.00%,42.00%,27.50%,77.92%,77.91%,75.78%,31.25%,37.50%,30.25%,41.50%,14.00%,33.50%,32.00%,61.11%,81.59%,Anthropic,Proprietary
+29,56.38%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,18.44,35.32,51.23,86.75%,74.00%,94.50%,90.50%,88.00%,87.12%,89.50%,94.00%,90.00%,75.00%,73.66%,79.07%,78.16%,81.25%,62.50%,9.88%,17.00%,2.50%,14.00%,6.00%,77.78%,70.89%,MeetKai,MIT
+30,56.25%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.3,0.73,1.05,1.28,81.75%,65.50%,91.50%,80.50%,89.50%,73.21%,68.86%,90.00%,54.00%,80.00%,77.97%,72.09%,70.18%,81.25%,79.17%,11.62%,19.00%,0.50%,10.50%,16.50%,55.56%,90.92%,Google,Proprietary
+31,55.55%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.31,0.59,0.73,0.84,77.54%,65.17%,94.50%,73.00%,77.50%,74.80%,62.21%,88.00%,74.00%,75.00%,76.28%,75.19%,74.26%,62.50%,58.33%,13.87%,19.00%,3.50%,14.00%,19.00%,50.00%,79.72%,Google,Proprietary
+32,55.49%,DeepSeek-Coder-V2 (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724,N/A,29.53,108.9,59.61,89.44%,78.75%,94.50%,93.50%,91.00%,91.23%,96.43%,94.00%,92.00%,82.50%,73.43%,80.23%,77.02%,43.75%,70.83%,4.50%,7.50%,3.00%,4.00%,3.50%,88.89%,70.81%,DeepSeek,DeepSeek License
+33,54.86%,Hammer2.1-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-1.5b,N/A,2.73,3.86,7.45,82.79%,74.67%,92.00%,84.50%,80.00%,83.39%,86.57%,90.00%,82.00%,75.00%,70.59%,70.93%,69.80%,50.00%,62.50%,10.50%,14.50%,12.50%,9.00%,6.00%,77.78%,79.27%,MadeAgents,cc-by-nc-4.0
+34,54.70%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.74,25.13,24.76,81.06%,74.25%,95.50%,81.00%,73.50%,79.88%,74.00%,96.00%,82.00%,67.50%,75.08%,71.32%,74.93%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,77.11%,Salesforce,cc-by-nc-4.0
+35,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,N/A,N/A,N/A,N/A,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary
+36,54.26%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.84,5.21,3.57,83.19%,76.25%,93.00%,84.00%,79.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.64%,83.72%,75.02%,87.50%,54.17%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,65.78%,Anthropic,Proprietary
+37,54.09%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.95,14.92,14.75,89.98%,77.92%,96.00%,94.50%,91.50%,90.12%,94.00%,98.00%,86.00%,82.50%,62.06%,77.13%,76.16%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,100.00%,54.78%,Meta,Meta Llama 3 Community
+38,53.88%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.38,0.87,1.45,1.47,83.94%,74.25%,93.50%,89.00%,79.00%,83.79%,96.14%,88.00%,86.00%,65.00%,63.93%,80.62%,79.68%,43.75%,58.33%,19.50%,32.50%,11.50%,21.50%,12.50%,94.44%,36.53%,OpenAI,Proprietary
+39,53.66%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.54,11.64,11.02,86.46%,75.33%,94.50%,91.50%,84.50%,88.29%,92.14%,90.00%,86.00%,85.00%,67.35%,75.97%,74.93%,62.50%,70.83%,7.62%,9.50%,8.50%,7.00%,5.50%,88.89%,65.16%,Qwen,apache-2.0
+40,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,N/A,N/A,N/A,N/A,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary
+41,53.03%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.17,3.87,88.46%,80.33%,94.00%,91.50%,88.00%,87.54%,96.64%,92.00%,84.00%,77.50%,65.57%,78.29%,78.35%,56.25%,70.83%,8.62%,13.50%,7.00%,11.00%,3.00%,94.44%,53.02%,Fireworks,Apache 2.0
+42,52.55%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.96,5.57,10.33,67.65%,73.58%,90.00%,69.00%,38.00%,74.05%,89.21%,90.00%,72.00%,45.00%,70.99%,74.03%,79.30%,43.75%,58.33%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.15%,Salesforce,cc-by-nc-4.0
+43,52.17%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.57,18.11,9.97,73.12%,69.50%,88.50%,69.00%,65.50%,81.57%,93.29%,86.00%,72.00%,75.00%,77.52%,75.19%,74.07%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,66.67%,85.93%,Mistral AI,Proprietary
+44,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,N/A,N/A,N/A,N/A,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0
+45,52.17%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,4.97,8.07,13.9,88.94%,79.75%,92.50%,91.50%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,67.04%,84.50%,79.39%,68.75%,62.50%,2.38%,4.50%,2.00%,1.50%,1.50%,94.44%,59.19%,Google,gemma-terms-of-use
+46,51.75%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,3.65,3.82,10.68,87.81%,76.75%,95.00%,92.50%,87.00%,88.21%,95.86%,94.00%,78.00%,85.00%,64.90%,80.62%,78.25%,75.00%,66.67%,5.62%,10.00%,4.00%,6.00%,2.50%,100.00%,50.88%,Meta,Meta Llama 3 Community
+47,51.73%,Ministral-8B-Instruct-2410 (FC),https://huggingface.co/mistralai/Ministral-8B-Instruct-2410,N/A,12.79,45.03,47.12,83.83%,71.83%,91.50%,84.50%,87.50%,79.57%,71.29%,86.00%,86.00%,75.00%,64.93%,75.19%,72.27%,62.50%,66.67%,11.25%,21.00%,8.50%,10.00%,5.50%,70.59%,55.28%,Mistral AI,Mistral AI Research License
+48,51.66%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,160.19,184.1,464.0,80.83%,69.83%,91.50%,82.50%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.97%,74.42%,63.91%,43.75%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,72.22%,72.22%,openbmb,Apache-2.0
+49,51.66%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.81,1.35,3.3,72.48%,81.42%,92.00%,70.50%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.88%,86.05%,80.06%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,77.78%,64.40%,Anthropic,Proprietary
+50,51.55%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.23,8.66,13.19,85.29%,75.67%,90.50%,88.50%,86.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.84%,76.36%,74.26%,62.50%,62.50%,1.62%,2.00%,4.00%,0.50%,0.00%,83.33%,66.51%,Google,gemma-terms-of-use
+51,51.37%,Llama-3.3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.98,25.27,23.42,85.08%,74.83%,94.50%,84.00%,87.00%,90.68%,95.71%,98.00%,84.00%,85.00%,62.59%,80.62%,77.11%,93.75%,62.50%,6.87%,9.00%,8.00%,4.50%,6.00%,100.00%,48.71%,Meta,Meta Llama 3 Community
+52,51.32%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.6,8.24,10.54,85.31%,79.75%,95.00%,85.50%,81.00%,86.32%,99.29%,90.00%,86.00%,70.00%,66.86%,84.11%,79.11%,68.75%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,83.33%,40.25%,Anthropic,Proprietary
+53,51.22%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.55,1.96,3.42,82.10%,64.42%,93.50%,85.50%,85.00%,77.66%,56.14%,94.00%,88.00%,72.50%,65.93%,77.13%,69.61%,75.00%,66.67%,9.12%,15.00%,3.50%,9.00%,9.00%,66.67%,63.19%,Mistral AI,Proprietary
+54,50.70%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.3,52.75,24.53,84.21%,72.83%,93.50%,87.00%,83.50%,86.30%,83.71%,96.00%,88.00%,77.50%,60.95%,73.26%,73.31%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,77.78%,48.82%,Meta,Meta Llama 3 Community
+55,50.33%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.36,6.03,3.17,88.02%,78.58%,94.00%,89.50%,90.00%,87.77%,93.57%,96.00%,84.00%,77.50%,65.93%,82.17%,72.65%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,83.33%,55.09%,Mistral AI,Proprietary
+56,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,N/A,N/A,N/A,N/A,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0
+57,49.28%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.84,1.74,5.0,82.46%,72.83%,91.50%,84.00%,81.50%,86.36%,84.93%,92.00%,86.00%,82.50%,59.57%,67.83%,56.32%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,74.82%,IBM,Apache-2.0
+58,48.29%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.72,1.79,1.21,72.85%,77.92%,93.50%,67.00%,53.00%,70.39%,57.57%,90.00%,74.00%,60.00%,68.46%,79.84%,78.63%,75.00%,58.33%,5.62%,9.00%,2.00%,7.00%,4.50%,94.44%,58.39%,OpenAI,Proprietary
+59,47.27%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,3.97,3.91,8.21,76.79%,64.17%,89.50%,80.00%,73.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.90%,71.71%,65.81%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.78%,NousResearch,apache-2.0
+60,47.23%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.32,3.99,6.94,90.54%,82.17%,97.00%,92.50%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.69%,85.27%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.35%,Mistral AI,Proprietary
+61,47.06%,Qwen2.5-3B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-3B-Instruct,N/A,1.03,1.43,1.78,80.79%,74.17%,90.50%,79.50%,79.00%,81.71%,80.86%,86.00%,80.00%,80.00%,58.60%,68.99%,66.48%,56.25%,62.50%,3.38%,5.50%,3.50%,2.00%,2.50%,88.89%,54.19%,Qwen,qwen
+62,46.91%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.79,45.67,14.71,80.56%,73.75%,92.00%,80.50%,76.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.75%,63.57%,64.86%,12.50%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,88.89%,51.69%,Meta,Meta Llama 3 Community
+63,46.70%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.51,6.07,4.63,73.37%,71.00%,86.00%,70.00%,66.50%,85.61%,80.43%,94.00%,88.00%,80.00%,61.04%,70.16%,59.26%,56.25%,41.67%,1.12%,1.50%,2.50%,0.50%,0.00%,83.33%,63.04%,Qwen,apache-2.0
+64,45.27%,Hammer2.1-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-0.5b,N/A,1.29,3.16,2.85,69.12%,68.00%,83.00%,71.50%,54.00%,70.46%,68.36%,84.00%,82.00%,47.50%,62.86%,59.69%,58.02%,50.00%,45.83%,2.25%,4.00%,0.50%,3.00%,1.50%,77.78%,73.94%,MadeAgents,cc-by-nc-4.0
+65,44.83%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.38,2.9,3.4,56.65%,66.58%,95.00%,40.00%,25.00%,64.93%,87.21%,86.00%,64.00%,22.50%,69.57%,77.13%,67.62%,43.75%,41.67%,2.88%,4.50%,1.00%,3.50%,2.50%,66.67%,71.53%,Google,Proprietary
+66,44.76%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,1.73,4.04,3.5,59.15%,67.58%,94.00%,24.50%,50.50%,53.84%,87.36%,92.00%,16.00%,20.00%,72.10%,64.73%,71.51%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,77.78%,80.86%,Mistral AI,Proprietary
+67,43.12%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,10.63,32.72,23.76,73.06%,60.75%,87.50%,78.50%,65.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.62%,68.99%,60.02%,43.75%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.88%,NousResearch,apache-2.0
+68,43.07%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,1.73,4.5,3.51,63.58%,64.83%,86.00%,59.00%,44.50%,69.61%,77.93%,86.00%,62.00%,52.50%,61.39%,63.18%,66.10%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.89%,59.52%,Mistral AI,Proprietary
+69,42.99%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,7.0,2.63,15.88,5.36,61.67%,71.67%,94.00%,10.50%,70.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.55%,76.36%,73.12%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,83.33%,45.71%,Mistral AI,Proprietary
+70,42.53%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.65,10.01,3.26,86.12%,77.00%,93.50%,89.50%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.96%,77.13%,74.45%,87.50%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,88.89%,6.43%,Mistral AI,Proprietary
+71,42.30%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.99,10.26,9.78,76.65%,68.08%,88.00%,75.50%,75.00%,76.80%,80.21%,84.00%,78.00%,65.00%,50.60%,56.59%,62.01%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,88.89%,39.00%,Qwen,apache-2.0
+72,40.91%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.74,8.22,11.19,61.25%,73.50%,92.00%,42.50%,37.00%,69.14%,90.07%,88.00%,46.00%,52.50%,60.15%,77.13%,73.03%,75.00%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,40.50%,Databricks,Databricks Open Model
+73,39.97%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.27,3.77,3.71,43.00%,80.00%,92.00%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,70.41%,71.32%,72.93%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.44%,71.80%,Fireworks,Apache 2.0
+74,39.25%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,6.26,4.43,13.94,72.08%,76.83%,93.50%,77.00%,41.00%,60.63%,84.50%,92.00%,56.00%,10.00%,53.35%,78.29%,58.02%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,44.95%,Salesforce,cc-by-nc-4.0
+75,38.94%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,6.09,15.35,13.2,36.67%,65.17%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.77%,72.09%,64.39%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.71%,THUDM,glm-4
+76,38.59%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.78,49.16,64.58,65.88%,63.50%,72.50%,65.50%,62.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.46%,46.51%,34.76%,43.75%,41.67%,2.00%,3.00%,3.50%,1.00%,0.50%,50.00%,74.43%,openbmb,Apache-2.0
+77,36.92%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.13,0.55,2.27,45.88%,57.50%,53.00%,34.00%,39.00%,59.11%,47.93%,86.00%,40.00%,62.50%,54.15%,41.47%,38.65%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,61.11%,78.53%,Nexusflow,Apache 2.0
+78,35.69%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.33,2.38,2.97,57.31%,46.25%,56.50%,63.50%,63.00%,56.32%,49.79%,68.00%,60.00%,47.50%,49.09%,50.39%,47.01%,62.50%,29.17%,1.38%,2.50%,1.50%,0.50%,1.00%,77.78%,52.95%,Google,Proprietary
+79,34.30%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.05,8.85,20.61,60.79%,62.67%,82.50%,48.00%,50.00%,58.93%,47.71%,86.00%,42.00%,60.00%,47.93%,60.85%,61.44%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.59%,Meta,Meta Llama 3 Community
+80,31.78%,GoGoAgent,https://gogoagent.ai,N/A,2.8,1.97,6.23,10.92%,43.67%,0.00%,0.00%,0.00%,89.86%,95.43%,96.00%,88.00%,80.00%,39.18%,0.00%,0.00%,0.00%,0.00%,1.00%,1.50%,2.00%,0.50%,0.00%,0.00%,96.67%,BitAgent,Proprietary
+81,31.25%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.91,1.57,0.96,3.37,26.94%,23.25%,74.00%,8.50%,2.00%,30.36%,52.93%,64.00%,2.00%,2.50%,58.73%,36.05%,65.24%,0.00%,8.33%,0.75%,0.50%,0.00%,1.50%,1.00%,44.44%,69.74%,Mistral AI,Proprietary
+82,29.27%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,3.09,11.89,5.41,54.29%,51.17%,79.00%,46.50%,40.50%,52.39%,46.57%,76.00%,52.00%,35.00%,39.00%,48.45%,40.27%,12.50%,25.00%,0.50%,0.50%,1.00%,0.00%,0.50%,94.44%,21.19%,Qwen,apache-2.0
+83,28.06%,Qwen2.5-0.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct,N/A,0.95,1.25,1.47,53.19%,58.25%,68.00%,53.50%,33.00%,61.89%,63.07%,70.00%,62.00%,52.50%,31.59%,53.88%,34.76%,56.25%,16.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,16.44%,Qwen,apache-2.0
+84,27.58%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,5.79,16.83,13.08,48.21%,55.83%,54.00%,48.50%,34.50%,50.18%,58.71%,58.00%,54.00%,30.00%,33.45%,51.55%,49.00%,37.50%,41.67%,5.38%,5.00%,7.50%,5.00%,4.00%,94.44%,4.86%,Meta,Meta Llama 3 Community
+85,27.13%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,5.44,12.05,12.13,25.29%,49.17%,24.50%,12.50%,15.00%,31.62%,53.00%,36.00%,30.00%,7.50%,44.96%,51.94%,52.61%,31.25%,25.00%,4.88%,7.00%,4.00%,4.50%,4.00%,100.00%,44.85%,Meta,Meta Llama 3 Community
+86,24.95%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,6.26,14.51,13.84,41.17%,71.67%,86.00%,5.00%,2.00%,42.95%,77.79%,90.00%,4.00%,0.00%,36.92%,63.95%,53.37%,6.25%,0.00%,0.12%,0.50%,0.00%,0.00%,0.00%,100.00%,6.69%,Salesforce,cc-by-nc-4.0
+87,22.43%,DeepSeek-Coder-V2-Lite-Instruct (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,N/A,13.9,30.47,39.18,4.88%,0.00%,1.50%,3.50%,14.50%,33.18%,17.71%,42.00%,28.00%,45.00%,39.40%,2.33%,3.80%,0.00%,8.33%,0.12%,0.50%,0.00%,0.00%,0.00%,0.00%,96.31%,DeepSeek,DeepSeek License
+88,22.36%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,3.78,7.03,11.84,17.10%,15.42%,52.00%,0.00%,1.00%,19.12%,22.50%,54.00%,0.00%,0.00%,43.76%,26.36%,18.52%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,73.03%,Google,gemma-terms-of-use
+89,20.59%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.08,17.77,32.86,28.44%,29.25%,33.50%,36.00%,15.00%,25.27%,34.07%,28.00%,34.00%,5.00%,31.36%,31.40%,7.60%,12.50%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,38.89%,59.70%,Meta,Meta Llama 3 Community
\ No newline at end of file
From 580fb7019ae24baf926bd98b60063bf5f2faa4d7 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Tue, 31 Dec 2024 19:34:34 +0800
Subject: [PATCH 4/5] update data.csv for updated BitAgent/GoGoAgent result
---
data_live.csv | 114 +++++++++++++++++++-------------------
data_non_live.csv | 138 +++++++++++++++++++++++-----------------------
data_overall.csv | 92 +++++++++++++++----------------
3 files changed, 172 insertions(+), 172 deletions(-)
diff --git a/data_live.csv b/data_live.csv
index ae2467d19..057d78acc 100644
--- a/data_live.csv
+++ b/data_live.csv
@@ -26,63 +26,63 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Py
25,GPT-4o-mini-2024-07-18 (FC),74.37%,76.61%,78.29%,76.16%,87.50%,70.83%,70.75%,83.33%
26,Qwen2.5-32B-Instruct (Prompt),74.14%,78.68%,82.17%,78.54%,62.50%,58.33%,66.67%,100.00%
27,Qwen2.5-14B-Instruct (Prompt),74.10%,75.13%,74.03%,75.78%,62.50%,66.67%,72.45%,77.78%
-28,Hammer2.1-3b (FC),73.91%,72.83%,72.48%,73.31%,62.50%,62.50%,75.40%,82.35%
-29,Functionary-Small-v3.1 (FC),73.66%,78.09%,79.07%,78.16%,81.25%,62.50%,66.78%,77.78%
-30,DeepSeek-Coder-V2 (FC),73.43%,77.13%,80.23%,77.02%,43.75%,70.83%,67.46%,88.89%
-31,xLAM-8x22b-r (FC),72.55%,79.57%,79.46%,79.68%,81.25%,75.00%,61.45%,88.89%
-32,claude-3.5-haiku-20241022 (FC),72.28%,76.98%,82.17%,78.35%,18.75%,0.00%,64.85%,83.33%
-33,Mistral-small-2402 (FC),72.10%,68.47%,64.73%,71.51%,12.50%,12.50%,77.55%,77.78%
-34,Claude-3.5-Sonnet-20241022 (Prompt),71.88%,80.61%,86.05%,80.06%,81.25%,45.83%,58.39%,77.78%
-35,xLAM-8x7b-r (FC),70.99%,77.50%,74.03%,79.30%,43.75%,58.33%,60.54%,94.44%
-36,claude-3.5-haiku-20241022 (Prompt),70.64%,76.46%,83.72%,75.02%,87.50%,54.17%,61.56%,77.78%
-37,Hammer2.1-1.5b (FC),70.59%,69.65%,70.93%,69.80%,50.00%,62.50%,71.88%,77.78%
-38,FireFunction-v1 (FC),70.41%,70.47%,71.32%,72.93%,0.00%,0.00%,69.84%,94.44%
-39,MiniCPM3-4B-FC (FC),69.97%,65.66%,74.42%,63.91%,43.75%,62.50%,76.53%,72.22%
-40,mistral-large-2407 (FC),69.84%,79.57%,84.88%,78.54%,62.50%,79.17%,54.88%,72.22%
-41,Gemini-1.0-Pro-002 (FC),69.57%,68.69%,77.13%,67.62%,43.75%,41.67%,70.98%,66.67%
-42,Command R7B (FC),69.21%,59.66%,63.18%,58.69%,56.25%,66.67%,84.13%,55.56%
-43,Gemini-1.5-Flash-001 (Prompt),68.86%,76.54%,76.74%,76.16%,93.75%,79.17%,56.80%,83.33%
-44,Open-Mixtral-8x22b (FC),68.55%,72.46%,76.36%,73.12%,6.25%,45.83%,62.24%,83.33%
-45,GPT-3.5-Turbo-0125 (Prompt),68.46%,78.46%,79.84%,78.63%,75.00%,58.33%,52.61%,94.44%
-46,DeepSeek-V3 (FC),68.33%,81.94%,82.95%,82.15%,81.25%,62.50%,47.05%,88.89%
-47,Gemma-2-9b-it (Prompt),67.84%,74.32%,76.36%,74.26%,62.50%,62.50%,57.60%,83.33%
-48,Qwen2.5-7B-Instruct (Prompt),67.35%,74.91%,75.97%,74.93%,62.50%,70.83%,55.33%,88.89%
-49,Gemma-2-27b-it (Prompt),67.04%,79.94%,84.50%,79.39%,68.75%,62.50%,46.71%,94.44%
-50,Claude-3-Opus-20240229 (Prompt),66.86%,79.50%,84.11%,79.11%,68.75%,54.17%,47.17%,83.33%
-51,GLM-4-9b-Chat (FC),66.77%,63.95%,72.09%,64.39%,0.00%,0.00%,71.09%,66.67%
-52,Open-Mixtral-8x22b (Prompt),65.93%,74.61%,82.17%,72.65%,81.25%,75.00%,52.27%,83.33%
-53,Open-Mistral-Nemo-2407 (FC),65.93%,71.06%,77.13%,69.61%,75.00%,66.67%,58.05%,66.67%
-54,FireFunction-v2 (FC),65.57%,77.94%,78.29%,78.35%,56.25%,70.83%,46.03%,94.44%
-55,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.19%,72.27%,62.50%,66.67%,53.06%,70.59%
-56,Hermes-2-Pro-Llama-3-8B (FC),64.90%,66.54%,71.71%,65.81%,56.25%,50.00%,62.81%,44.44%
-57,Meta-Llama-3-70B-Instruct (Prompt),64.90%,78.46%,80.62%,78.25%,75.00%,66.67%,43.42%,100.00%
-58,GPT-3.5-Turbo-0125 (FC),63.93%,79.05%,80.62%,79.68%,43.75%,58.33%,40.14%,94.44%
-59,GPT-4-turbo-2024-04-09 (Prompt),63.71%,84.75%,87.21%,84.14%,100.00%,75.00%,30.73%,100.00%
-60,Hammer2.1-0.5b (FC),62.86%,58.03%,59.69%,58.02%,50.00%,45.83%,69.95%,77.78%
-61,Llama-3.3-70B-Instruct (Prompt),62.59%,77.72%,80.62%,77.11%,93.75%,62.50%,38.66%,100.00%
-62,Llama-3.1-70B-Instruct (Prompt),62.06%,76.24%,77.13%,76.16%,87.50%,62.50%,39.57%,100.00%
-63,Open-Mixtral-8x7b (Prompt),61.39%,65.28%,63.18%,66.10%,68.75%,50.00%,54.88%,88.89%
-64,Qwen2.5-1.5B-Instruct (Prompt),61.04%,60.99%,70.16%,59.26%,56.25%,41.67%,60.66%,83.33%
-65,Llama-3.1-8B-Instruct (Prompt),60.95%,72.69%,73.26%,73.31%,56.25%,50.00%,42.63%,77.78%
-66,DBRX-Instruct (Prompt),60.15%,73.28%,77.13%,73.03%,75.00%,41.67%,39.34%,94.44%
-67,Granite-20b-FunctionCalling (FC),59.57%,58.33%,67.83%,56.32%,43.75%,54.17%,60.88%,88.89%
-68,Command-R-Plus (FC),58.91%,60.70%,69.77%,58.78%,62.50%,45.83%,55.90%,72.22%
-69,Mistral-Small-2402 (Prompt),58.73%,57.88%,36.05%,65.24%,0.00%,8.33%,60.32%,44.44%
-70,Qwen2.5-3B-Instruct (Prompt),58.60%,66.77%,68.99%,66.48%,56.25%,62.50%,45.46%,88.89%
-71,Hermes-2-Pro-Mistral-7B (FC),57.62%,61.21%,68.99%,60.02%,43.75%,41.67%,51.93%,66.67%
-72,Llama-3.2-3B-Instruct (Prompt),55.75%,63.66%,63.57%,64.86%,12.50%,45.83%,42.97%,88.89%
-73,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
-74,Nexusflow-Raven-v2 (FC),54.15%,39.38%,41.47%,38.65%,56.25%,37.50%,76.64%,61.11%
-75,xLAM-7b-fc-r (FC),53.35%,60.99%,78.29%,58.02%,31.25%,25.00%,41.16%,77.78%
-76,mistral-large-2407 (Prompt),52.69%,82.68%,85.27%,81.96%,93.75%,79.17%,5.78%,100.00%
-77,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89%
-78,Gemini-1.0-Pro-002 (Prompt),49.09%,47.52%,50.39%,47.01%,62.50%,29.17%,50.91%,77.78%
-79,Open-Mistral-Nemo-2407 (Prompt),48.96%,74.98%,77.13%,74.45%,87.50%,66.67%,8.28%,88.89%
-80,Meta-Llama-3-8B-Instruct (Prompt),47.93%,60.55%,60.85%,61.44%,37.50%,33.33%,28.00%,77.78%
-81,Llama-3.1-70B-Instruct (FC),44.96%,51.74%,51.94%,52.61%,31.25%,25.00%,33.45%,100.00%
-82,Gemma-2-2b-it (Prompt),43.76%,19.47%,26.36%,18.52%,0.00%,0.00%,81.07%,38.89%
-83,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00%
-84,GoGoAgent,39.18%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%
+28,GoGoAgent,73.92%,74.54%,72.09%,75.40%,68.75%,66.67%,72.90%,77.78%
+29,Hammer2.1-3b (FC),73.91%,72.83%,72.48%,73.31%,62.50%,62.50%,75.40%,82.35%
+30,Functionary-Small-v3.1 (FC),73.66%,78.09%,79.07%,78.16%,81.25%,62.50%,66.78%,77.78%
+31,DeepSeek-Coder-V2 (FC),73.43%,77.13%,80.23%,77.02%,43.75%,70.83%,67.46%,88.89%
+32,xLAM-8x22b-r (FC),72.55%,79.57%,79.46%,79.68%,81.25%,75.00%,61.45%,88.89%
+33,claude-3.5-haiku-20241022 (FC),72.28%,76.98%,82.17%,78.35%,18.75%,0.00%,64.85%,83.33%
+34,Mistral-small-2402 (FC),72.10%,68.47%,64.73%,71.51%,12.50%,12.50%,77.55%,77.78%
+35,Claude-3.5-Sonnet-20241022 (Prompt),71.88%,80.61%,86.05%,80.06%,81.25%,45.83%,58.39%,77.78%
+36,xLAM-8x7b-r (FC),70.99%,77.50%,74.03%,79.30%,43.75%,58.33%,60.54%,94.44%
+37,claude-3.5-haiku-20241022 (Prompt),70.64%,76.46%,83.72%,75.02%,87.50%,54.17%,61.56%,77.78%
+38,Hammer2.1-1.5b (FC),70.59%,69.65%,70.93%,69.80%,50.00%,62.50%,71.88%,77.78%
+39,FireFunction-v1 (FC),70.41%,70.47%,71.32%,72.93%,0.00%,0.00%,69.84%,94.44%
+40,MiniCPM3-4B-FC (FC),69.97%,65.66%,74.42%,63.91%,43.75%,62.50%,76.53%,72.22%
+41,mistral-large-2407 (FC),69.84%,79.57%,84.88%,78.54%,62.50%,79.17%,54.88%,72.22%
+42,Gemini-1.0-Pro-002 (FC),69.57%,68.69%,77.13%,67.62%,43.75%,41.67%,70.98%,66.67%
+43,Command R7B (FC),69.21%,59.66%,63.18%,58.69%,56.25%,66.67%,84.13%,55.56%
+44,Gemini-1.5-Flash-001 (Prompt),68.86%,76.54%,76.74%,76.16%,93.75%,79.17%,56.80%,83.33%
+45,Open-Mixtral-8x22b (FC),68.55%,72.46%,76.36%,73.12%,6.25%,45.83%,62.24%,83.33%
+46,GPT-3.5-Turbo-0125 (Prompt),68.46%,78.46%,79.84%,78.63%,75.00%,58.33%,52.61%,94.44%
+47,DeepSeek-V3 (FC),68.33%,81.94%,82.95%,82.15%,81.25%,62.50%,47.05%,88.89%
+48,Gemma-2-9b-it (Prompt),67.84%,74.32%,76.36%,74.26%,62.50%,62.50%,57.60%,83.33%
+49,Qwen2.5-7B-Instruct (Prompt),67.35%,74.91%,75.97%,74.93%,62.50%,70.83%,55.33%,88.89%
+50,Gemma-2-27b-it (Prompt),67.04%,79.94%,84.50%,79.39%,68.75%,62.50%,46.71%,94.44%
+51,Claude-3-Opus-20240229 (Prompt),66.86%,79.50%,84.11%,79.11%,68.75%,54.17%,47.17%,83.33%
+52,GLM-4-9b-Chat (FC),66.77%,63.95%,72.09%,64.39%,0.00%,0.00%,71.09%,66.67%
+53,Open-Mixtral-8x22b (Prompt),65.93%,74.61%,82.17%,72.65%,81.25%,75.00%,52.27%,83.33%
+54,Open-Mistral-Nemo-2407 (FC),65.93%,71.06%,77.13%,69.61%,75.00%,66.67%,58.05%,66.67%
+55,FireFunction-v2 (FC),65.57%,77.94%,78.29%,78.35%,56.25%,70.83%,46.03%,94.44%
+56,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.19%,72.27%,62.50%,66.67%,53.06%,70.59%
+57,Hermes-2-Pro-Llama-3-8B (FC),64.90%,66.54%,71.71%,65.81%,56.25%,50.00%,62.81%,44.44%
+58,Meta-Llama-3-70B-Instruct (Prompt),64.90%,78.46%,80.62%,78.25%,75.00%,66.67%,43.42%,100.00%
+59,GPT-3.5-Turbo-0125 (FC),63.93%,79.05%,80.62%,79.68%,43.75%,58.33%,40.14%,94.44%
+60,GPT-4-turbo-2024-04-09 (Prompt),63.71%,84.75%,87.21%,84.14%,100.00%,75.00%,30.73%,100.00%
+61,Hammer2.1-0.5b (FC),62.86%,58.03%,59.69%,58.02%,50.00%,45.83%,69.95%,77.78%
+62,Llama-3.3-70B-Instruct (Prompt),62.59%,77.72%,80.62%,77.11%,93.75%,62.50%,38.66%,100.00%
+63,Llama-3.1-70B-Instruct (Prompt),62.06%,76.24%,77.13%,76.16%,87.50%,62.50%,39.57%,100.00%
+64,Open-Mixtral-8x7b (Prompt),61.39%,65.28%,63.18%,66.10%,68.75%,50.00%,54.88%,88.89%
+65,Qwen2.5-1.5B-Instruct (Prompt),61.04%,60.99%,70.16%,59.26%,56.25%,41.67%,60.66%,83.33%
+66,Llama-3.1-8B-Instruct (Prompt),60.95%,72.69%,73.26%,73.31%,56.25%,50.00%,42.63%,77.78%
+67,DBRX-Instruct (Prompt),60.15%,73.28%,77.13%,73.03%,75.00%,41.67%,39.34%,94.44%
+68,Granite-20b-FunctionCalling (FC),59.57%,58.33%,67.83%,56.32%,43.75%,54.17%,60.88%,88.89%
+69,Command-R-Plus (FC),58.91%,60.70%,69.77%,58.78%,62.50%,45.83%,55.90%,72.22%
+70,Mistral-Small-2402 (Prompt),58.73%,57.88%,36.05%,65.24%,0.00%,8.33%,60.32%,44.44%
+71,Qwen2.5-3B-Instruct (Prompt),58.60%,66.77%,68.99%,66.48%,56.25%,62.50%,45.46%,88.89%
+72,Hermes-2-Pro-Mistral-7B (FC),57.62%,61.21%,68.99%,60.02%,43.75%,41.67%,51.93%,66.67%
+73,Llama-3.2-3B-Instruct (Prompt),55.75%,63.66%,63.57%,64.86%,12.50%,45.83%,42.97%,88.89%
+74,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
+75,Nexusflow-Raven-v2 (FC),54.15%,39.38%,41.47%,38.65%,56.25%,37.50%,76.64%,61.11%
+76,xLAM-7b-fc-r (FC),53.35%,60.99%,78.29%,58.02%,31.25%,25.00%,41.16%,77.78%
+77,mistral-large-2407 (Prompt),52.69%,82.68%,85.27%,81.96%,93.75%,79.17%,5.78%,100.00%
+78,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89%
+79,Gemini-1.0-Pro-002 (Prompt),49.09%,47.52%,50.39%,47.01%,62.50%,29.17%,50.91%,77.78%
+80,Open-Mistral-Nemo-2407 (Prompt),48.96%,74.98%,77.13%,74.45%,87.50%,66.67%,8.28%,88.89%
+81,Meta-Llama-3-8B-Instruct (Prompt),47.93%,60.55%,60.85%,61.44%,37.50%,33.33%,28.00%,77.78%
+82,Llama-3.1-70B-Instruct (FC),44.96%,51.74%,51.94%,52.61%,31.25%,25.00%,33.45%,100.00%
+83,Gemma-2-2b-it (Prompt),43.76%,19.47%,26.36%,18.52%,0.00%,0.00%,81.07%,38.89%
+84,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00%
85,Qwen2-1.5B-Instruct (Prompt),39.00%,41.23%,48.45%,40.27%,12.50%,25.00%,34.47%,94.44%
86,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00%
87,Llama-3.1-8B-Instruct (FC),33.45%,49.22%,51.55%,49.00%,37.50%,41.67%,8.05%,94.44%
diff --git a/data_non_live.csv b/data_non_live.csv
index b0401370d..400c2b3fd 100644
--- a/data_non_live.csv
+++ b/data_non_live.csv
@@ -4,75 +4,75 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simpl
3,Gemini-1.5-Pro-002 (Prompt),89.10%,88.58%,91.27%,78.33%,95.00%,64.00%,76.00%,93.50%,92.50%,90.00%,98.57%,100.00%,97.14%,94.00%,90.00%,82.50%,82.50%
4,ToolACE-8B (FC),88.93%,87.54%,89.21%,76.67%,91.00%,65.00%,74.00%,93.50%,90.50%,89.50%,97.36%,99.00%,95.71%,94.00%,88.00%,77.50%,93.33%
5,gpt-4o-2024-11-20 (Prompt),88.79%,88.10%,89.38%,79.42%,96.25%,66.00%,76.00%,95.50%,94.00%,83.50%,100.00%,100.00%,100.00%,94.00%,86.00%,77.50%,89.17%
-6,DeepSeek-Coder-V2 (FC),88.54%,89.44%,91.23%,78.75%,96.25%,64.00%,76.00%,94.50%,93.50%,91.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17%
-7,watt-tool-8B (FC),88.32%,86.56%,89.34%,76.75%,93.25%,63.00%,74.00%,95.00%,94.00%,80.50%,97.86%,100.00%,95.71%,94.00%,88.00%,77.50%,91.25%
-8,gpt-4o-2024-11-20 (FC),88.08%,87.42%,89.20%,77.17%,91.50%,64.00%,76.00%,93.50%,93.00%,86.00%,88.29%,98.00%,78.57%,92.00%,94.00%,82.50%,86.25%
-9,Llama-3.1-70B-Instruct (Prompt),87.82%,89.98%,90.12%,77.92%,95.75%,62.00%,76.00%,96.00%,94.50%,91.50%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00%
-10,Gemma-2-27b-it (Prompt),87.09%,88.94%,89.09%,79.75%,94.25%,63.00%,82.00%,92.50%,91.50%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67%
-11,Qwen2.5-32B-Instruct (Prompt),87.03%,85.81%,89.79%,70.25%,96.75%,52.00%,62.00%,94.50%,90.50%,88.00%,96.64%,99.00%,94.29%,90.00%,90.00%,82.50%,80.83%
-12,Hammer2.1-7b (FC),86.88%,88.65%,85.48%,78.08%,96.25%,66.00%,72.00%,95.00%,93.50%,88.00%,86.43%,100.00%,72.86%,92.00%,86.00%,77.50%,85.42%
-13,Qwen2.5-14B-Instruct (Prompt),86.64%,85.69%,88.84%,73.25%,95.75%,56.00%,68.00%,92.50%,92.00%,85.00%,92.36%,99.00%,85.71%,90.00%,88.00%,85.00%,81.67%
-14,watt-tool-70B (FC),86.44%,84.06%,89.39%,78.75%,98.25%,64.00%,74.00%,94.00%,85.50%,78.00%,98.57%,100.00%,97.14%,94.00%,90.00%,75.00%,84.17%
-15,Gemini-1.5-Pro-001 (FC),86.01%,84.33%,87.95%,69.83%,92.50%,55.00%,62.00%,93.00%,92.00%,82.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00%
-16,Qwen2.5-7B-Instruct (Prompt),86.00%,86.46%,88.29%,75.33%,96.00%,60.00%,70.00%,94.50%,91.50%,84.50%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00%
-17,Gemini-1.5-Pro-001 (Prompt),85.82%,85.56%,85.77%,75.25%,93.75%,60.00%,72.00%,91.50%,91.50%,84.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08%
-18,Hammer2.1-3b (FC),85.79%,86.85%,84.09%,81.42%,95.25%,67.00%,82.00%,95.00%,89.50%,81.50%,82.86%,100.00%,65.71%,92.00%,84.00%,77.50%,88.33%
-19,Functionary-Small-v3.1 (FC),85.61%,86.75%,87.12%,74.00%,96.00%,62.00%,64.00%,94.50%,90.50%,88.00%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00%
-20,Gemma-2-9b-it (Prompt),85.18%,85.29%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.50%,86.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42%
-21,GPT-4-turbo-2024-04-09 (FC),85.02%,84.73%,85.21%,70.42%,92.25%,59.00%,60.00%,91.00%,90.00%,87.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42%
-22,Gemini-1.5-Pro-002 (FC),85.01%,87.29%,84.61%,73.17%,93.50%,58.00%,68.00%,95.00%,91.50%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50%
-23,Granite-20b-FunctionCalling (FC),84.89%,82.46%,86.36%,72.83%,90.50%,66.00%,62.00%,91.50%,84.00%,81.50%,84.93%,97.00%,72.86%,92.00%,86.00%,82.50%,88.75%
-24,FireFunction-v2 (FC),84.89%,88.46%,87.54%,80.33%,96.00%,65.00%,80.00%,94.00%,91.50%,88.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00%
-25,Meta-Llama-3-70B-Instruct (Prompt),84.72%,87.81%,88.21%,76.75%,95.25%,61.00%,74.00%,95.00%,92.50%,87.00%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33%
-26,DeepSeek-V3 (FC),84.66%,89.17%,83.39%,78.67%,97.00%,65.00%,74.00%,95.50%,91.00%,91.50%,62.57%,98.00%,27.14%,94.00%,92.00%,85.00%,71.67%
-27,Llama-3.3-70B-Instruct (Prompt),84.64%,85.08%,90.68%,74.83%,94.50%,60.00%,70.00%,94.50%,84.00%,87.00%,95.71%,100.00%,91.43%,98.00%,84.00%,85.00%,58.75%
-28,GPT-4-turbo-2024-04-09 (Prompt),84.63%,90.88%,89.45%,82.50%,96.50%,67.00%,84.00%,95.50%,93.50%,92.00%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42%
-29,Open-Mixtral-8x22b (Prompt),84.56%,88.02%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,90.00%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92%
-30,xLAM-8x22b-r (FC),84.49%,83.69%,87.88%,77.75%,95.25%,64.00%,74.00%,94.50%,86.50%,76.00%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17%
-31,GPT-4o-mini-2024-07-18 (Prompt),84.17%,86.77%,80.84%,80.08%,94.25%,66.00%,80.00%,90.50%,89.50%,87.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08%
-32,GPT-4o-mini-2024-07-18 (FC),83.76%,85.21%,83.57%,74.83%,90.50%,64.00%,70.00%,92.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75%
-33,o1-2024-12-17 (Prompt),83.57%,85.67%,79.77%,72.67%,92.00%,60.00%,66.00%,93.50%,91.50%,85.00%,58.57%,100.00%,17.14%,92.00%,86.00%,82.50%,90.42%
-34,Hammer2.1-1.5b (FC),83.49%,82.79%,83.39%,74.67%,90.00%,64.00%,70.00%,92.00%,84.50%,80.00%,86.57%,96.00%,77.14%,90.00%,82.00%,75.00%,86.67%
-35,Gemini-1.5-Flash-001 (Prompt),82.87%,85.69%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.50%,90.50%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75%
-36,claude-3.5-haiku-20241022 (Prompt),82.40%,83.19%,84.71%,76.25%,92.75%,64.00%,72.00%,93.00%,84.00%,79.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00%
-37,MiniCPM3-4B-FC (FC),82.39%,80.83%,87.57%,69.83%,90.50%,59.00%,60.00%,91.50%,82.50%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92%
-38,Command R7B (FC),82.29%,81.67%,84.02%,68.17%,92.50%,56.00%,56.00%,91.50%,85.50%,81.50%,87.07%,97.00%,77.14%,92.00%,82.00%,75.00%,77.92%
-39,o1-mini-2024-09-12 (Prompt),81.97%,78.92%,82.70%,71.17%,87.50%,62.00%,64.00%,89.00%,83.50%,72.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25%
-40,Llama-3.1-8B-Instruct (Prompt),81.89%,84.21%,86.30%,72.83%,93.50%,59.00%,66.00%,93.50%,87.00%,83.50%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00%
-41,mistral-large-2407 (FC),81.73%,86.81%,84.38%,74.25%,95.75%,61.00%,66.00%,92.50%,90.00%,90.50%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83%
-42,Gemini-1.5-Flash-002 (Prompt),81.16%,81.65%,80.64%,73.58%,94.75%,60.00%,66.00%,91.50%,90.00%,71.50%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25%
-43,mistral-large-2407 (Prompt),80.62%,90.54%,90.12%,82.17%,96.50%,66.00%,84.00%,97.00%,92.50%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92%
-44,Claude-3-Opus-20240229 (Prompt),79.99%,85.31%,86.32%,79.75%,96.25%,65.00%,78.00%,95.00%,85.50%,81.00%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33%
-45,Llama-3.2-3B-Instruct (Prompt),79.72%,80.56%,83.70%,73.75%,92.25%,57.00%,72.00%,92.00%,80.50%,76.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42%
-46,Qwen2.5-3B-Instruct (Prompt),79.22%,80.79%,81.71%,74.17%,91.50%,59.00%,72.00%,90.50%,79.50%,79.00%,80.86%,96.00%,65.71%,86.00%,80.00%,80.00%,62.92%
-47,Gemini-1.5-Flash-002 (FC),79.15%,81.75%,73.21%,65.50%,87.50%,57.00%,52.00%,91.50%,80.50%,89.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50%
-48,xLAM-7b-r (FC),79.03%,81.06%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,81.00%,73.50%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50%
-49,Ministral-8B-Instruct-2410 (FC),79.01%,83.83%,79.57%,71.83%,93.50%,60.00%,62.00%,91.50%,84.50%,87.50%,71.29%,94.00%,48.57%,86.00%,86.00%,75.00%,57.50%
-50,Mistral-Medium-2312 (Prompt),78.62%,73.12%,81.57%,69.50%,91.50%,57.00%,60.00%,88.50%,69.00%,65.50%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75%
-51,Open-Mistral-Nemo-2407 (FC),78.60%,82.10%,77.66%,64.42%,91.25%,34.00%,68.00%,93.50%,85.50%,85.00%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33%
-52,Open-Mistral-Nemo-2407 (Prompt),78.37%,86.12%,89.07%,77.00%,92.00%,59.00%,80.00%,93.50%,89.50%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58%
-53,GPT-3.5-Turbo-0125 (FC),78.20%,83.94%,83.79%,74.25%,94.75%,62.00%,66.00%,93.50%,89.00%,79.00%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92%
-54,Qwen2.5-1.5B-Instruct (Prompt),77.93%,73.37%,85.61%,71.00%,89.00%,54.00%,70.00%,86.00%,70.00%,66.50%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42%
-55,Gemini-1.5-Flash-001 (FC),76.51%,77.54%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.00%,77.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17%
-56,Command-R-Plus (FC),75.93%,77.02%,81.21%,72.08%,87.25%,59.00%,70.00%,89.50%,82.50%,64.00%,90.86%,96.00%,85.71%,90.00%,84.00%,60.00%,50.42%
-57,Claude-3.5-Sonnet-20241022 (Prompt),75.59%,72.48%,80.00%,81.42%,94.25%,68.00%,82.00%,92.00%,70.50%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42%
-58,Hermes-2-Pro-Llama-3-8B (FC),74.54%,76.79%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,80.00%,73.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75%
-59,Qwen2-7B-Instruct (Prompt),73.06%,76.65%,76.80%,68.08%,84.25%,58.00%,62.00%,88.00%,75.50%,75.00%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75%
-60,xLAM-8x7b-r (FC),71.17%,67.65%,74.05%,73.58%,91.75%,59.00%,70.00%,90.00%,69.00%,38.00%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75%
-61,GPT-3.5-Turbo-0125 (Prompt),70.79%,72.85%,70.39%,77.92%,96.75%,61.00%,76.00%,93.50%,67.00%,53.00%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17%
-62,Hammer2.1-0.5b (FC),70.70%,69.12%,70.46%,68.00%,84.00%,62.00%,58.00%,83.00%,71.50%,54.00%,68.36%,91.00%,45.71%,84.00%,82.00%,47.50%,77.92%
-63,Hermes-2-Pro-Mistral-7B (FC),69.12%,73.06%,76.00%,60.75%,86.25%,56.00%,40.00%,87.50%,78.50%,65.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83%
-64,Open-Mixtral-8x7b (Prompt),66.33%,63.58%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,59.00%,44.50%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17%
-65,xLAM-7b-fc-r (FC),64.40%,72.08%,60.63%,76.83%,93.50%,65.00%,72.00%,93.50%,77.00%,41.00%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75%
-66,DBRX-Instruct (Prompt),62.58%,61.25%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,42.50%,37.00%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67%
-67,Gemini-1.0-Pro-002 (FC),62.04%,56.65%,64.93%,66.58%,93.75%,52.00%,54.00%,95.00%,40.00%,25.00%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08%
-68,Claude-3-Opus-20240229 (FC),61.10%,57.92%,59.46%,67.17%,88.50%,59.00%,54.00%,93.00%,39.50%,32.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42%
-69,Mistral-small-2402 (FC),59.57%,59.15%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,24.50%,50.50%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17%
-70,MiniCPM3-4B (Prompt),59.31%,65.88%,50.59%,63.50%,84.50%,48.00%,58.00%,72.50%,65.50%,62.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92%
-71,Open-Mixtral-8x22b (FC),58.93%,61.67%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,70.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17%
-72,Gemini-1.0-Pro-002 (Prompt),56.62%,57.31%,56.32%,46.25%,58.75%,26.00%,54.00%,56.50%,63.50%,63.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00%
-73,Nexusflow-Raven-v2 (FC),55.59%,45.88%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.00%,39.00%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42%
-74,GoGoAgent,55.16%,10.92%,89.86%,43.67%,0.00%,63.00%,68.00%,0.00%,0.00%,0.00%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33%
+6,GoGoAgent,88.63%,86.23%,89.86%,75.42%,95.25%,63.00%,68.00%,93.00%,92.00%,84.50%,95.43%,98.00%,92.86%,96.00%,88.00%,80.00%,93.33%
+7,DeepSeek-Coder-V2 (FC),88.54%,89.44%,91.23%,78.75%,96.25%,64.00%,76.00%,94.50%,93.50%,91.00%,96.43%,100.00%,92.86%,94.00%,92.00%,82.50%,74.17%
+8,watt-tool-8B (FC),88.32%,86.56%,89.34%,76.75%,93.25%,63.00%,74.00%,95.00%,94.00%,80.50%,97.86%,100.00%,95.71%,94.00%,88.00%,77.50%,91.25%
+9,gpt-4o-2024-11-20 (FC),88.08%,87.42%,89.20%,77.17%,91.50%,64.00%,76.00%,93.50%,93.00%,86.00%,88.29%,98.00%,78.57%,92.00%,94.00%,82.50%,86.25%
+10,Llama-3.1-70B-Instruct (Prompt),87.82%,89.98%,90.12%,77.92%,95.75%,62.00%,76.00%,96.00%,94.50%,91.50%,94.00%,98.00%,90.00%,98.00%,86.00%,82.50%,70.00%
+11,Gemma-2-27b-it (Prompt),87.09%,88.94%,89.09%,79.75%,94.25%,63.00%,82.00%,92.50%,91.50%,92.00%,87.86%,100.00%,75.71%,98.00%,88.00%,82.50%,71.67%
+12,Qwen2.5-32B-Instruct (Prompt),87.03%,85.81%,89.79%,70.25%,96.75%,52.00%,62.00%,94.50%,90.50%,88.00%,96.64%,99.00%,94.29%,90.00%,90.00%,82.50%,80.83%
+13,Hammer2.1-7b (FC),86.88%,88.65%,85.48%,78.08%,96.25%,66.00%,72.00%,95.00%,93.50%,88.00%,86.43%,100.00%,72.86%,92.00%,86.00%,77.50%,85.42%
+14,Qwen2.5-14B-Instruct (Prompt),86.64%,85.69%,88.84%,73.25%,95.75%,56.00%,68.00%,92.50%,92.00%,85.00%,92.36%,99.00%,85.71%,90.00%,88.00%,85.00%,81.67%
+15,watt-tool-70B (FC),86.44%,84.06%,89.39%,78.75%,98.25%,64.00%,74.00%,94.00%,85.50%,78.00%,98.57%,100.00%,97.14%,94.00%,90.00%,75.00%,84.17%
+16,Gemini-1.5-Pro-001 (FC),86.01%,84.33%,87.95%,69.83%,92.50%,55.00%,62.00%,93.00%,92.00%,82.50%,91.79%,95.00%,88.57%,92.00%,88.00%,80.00%,85.00%
+17,Qwen2.5-7B-Instruct (Prompt),86.00%,86.46%,88.29%,75.33%,96.00%,60.00%,70.00%,94.50%,91.50%,84.50%,92.14%,100.00%,84.29%,90.00%,86.00%,85.00%,75.00%
+18,Gemini-1.5-Pro-001 (Prompt),85.82%,85.56%,85.77%,75.25%,93.75%,60.00%,72.00%,91.50%,91.50%,84.00%,91.57%,96.00%,87.14%,90.00%,84.00%,77.50%,87.08%
+19,Hammer2.1-3b (FC),85.79%,86.85%,84.09%,81.42%,95.25%,67.00%,82.00%,95.00%,89.50%,81.50%,82.86%,100.00%,65.71%,92.00%,84.00%,77.50%,88.33%
+20,Functionary-Small-v3.1 (FC),85.61%,86.75%,87.12%,74.00%,96.00%,62.00%,64.00%,94.50%,90.50%,88.00%,89.50%,99.00%,80.00%,94.00%,90.00%,75.00%,75.00%
+21,Gemma-2-9b-it (Prompt),85.18%,85.29%,87.52%,75.67%,93.00%,60.00%,74.00%,90.50%,88.50%,86.50%,88.07%,99.00%,77.14%,94.00%,88.00%,80.00%,75.42%
+22,GPT-4-turbo-2024-04-09 (FC),85.02%,84.73%,85.21%,70.42%,92.25%,59.00%,60.00%,91.00%,90.00%,87.50%,87.36%,99.00%,75.71%,90.00%,86.00%,77.50%,85.42%
+23,Gemini-1.5-Pro-002 (FC),85.01%,87.29%,84.61%,73.17%,93.50%,58.00%,68.00%,95.00%,91.50%,89.50%,75.93%,99.00%,52.86%,94.00%,86.00%,82.50%,77.50%
+24,Granite-20b-FunctionCalling (FC),84.89%,82.46%,86.36%,72.83%,90.50%,66.00%,62.00%,91.50%,84.00%,81.50%,84.93%,97.00%,72.86%,92.00%,86.00%,82.50%,88.75%
+25,FireFunction-v2 (FC),84.89%,88.46%,87.54%,80.33%,96.00%,65.00%,80.00%,94.00%,91.50%,88.00%,96.64%,99.00%,94.29%,92.00%,84.00%,77.50%,60.00%
+26,Meta-Llama-3-70B-Instruct (Prompt),84.72%,87.81%,88.21%,76.75%,95.25%,61.00%,74.00%,95.00%,92.50%,87.00%,95.86%,96.00%,95.71%,94.00%,78.00%,85.00%,58.33%
+27,DeepSeek-V3 (FC),84.66%,89.17%,83.39%,78.67%,97.00%,65.00%,74.00%,95.50%,91.00%,91.50%,62.57%,98.00%,27.14%,94.00%,92.00%,85.00%,71.67%
+28,Llama-3.3-70B-Instruct (Prompt),84.64%,85.08%,90.68%,74.83%,94.50%,60.00%,70.00%,94.50%,84.00%,87.00%,95.71%,100.00%,91.43%,98.00%,84.00%,85.00%,58.75%
+29,GPT-4-turbo-2024-04-09 (Prompt),84.63%,90.88%,89.45%,82.50%,96.50%,67.00%,84.00%,95.50%,93.50%,92.00%,99.29%,100.00%,98.57%,96.00%,80.00%,82.50%,40.42%
+30,Open-Mixtral-8x22b (Prompt),84.56%,88.02%,87.77%,78.58%,93.75%,60.00%,82.00%,94.00%,89.50%,90.00%,93.57%,100.00%,87.14%,96.00%,84.00%,77.50%,57.92%
+31,xLAM-8x22b-r (FC),84.49%,83.69%,87.88%,77.75%,95.25%,64.00%,74.00%,94.50%,86.50%,76.00%,95.00%,100.00%,90.00%,94.00%,90.00%,72.50%,74.17%
+32,GPT-4o-mini-2024-07-18 (Prompt),84.17%,86.77%,80.84%,80.08%,94.25%,66.00%,80.00%,90.50%,89.50%,87.00%,62.86%,100.00%,25.71%,96.00%,82.00%,82.50%,87.08%
+33,GPT-4o-mini-2024-07-18 (FC),83.76%,85.21%,83.57%,74.83%,90.50%,64.00%,70.00%,92.00%,90.00%,84.00%,83.29%,98.00%,68.57%,92.00%,84.00%,75.00%,78.75%
+34,o1-2024-12-17 (Prompt),83.57%,85.67%,79.77%,72.67%,92.00%,60.00%,66.00%,93.50%,91.50%,85.00%,58.57%,100.00%,17.14%,92.00%,86.00%,82.50%,90.42%
+35,Hammer2.1-1.5b (FC),83.49%,82.79%,83.39%,74.67%,90.00%,64.00%,70.00%,92.00%,84.50%,80.00%,86.57%,96.00%,77.14%,90.00%,82.00%,75.00%,86.67%
+36,Gemini-1.5-Flash-001 (Prompt),82.87%,85.69%,83.59%,70.75%,84.25%,64.00%,64.00%,90.00%,91.50%,90.50%,80.36%,85.00%,75.71%,92.00%,82.00%,80.00%,68.75%
+37,claude-3.5-haiku-20241022 (Prompt),82.40%,83.19%,84.71%,76.25%,92.75%,64.00%,72.00%,93.00%,84.00%,79.50%,97.86%,100.00%,95.71%,90.00%,76.00%,75.00%,70.00%
+38,MiniCPM3-4B-FC (FC),82.39%,80.83%,87.57%,69.83%,90.50%,59.00%,60.00%,91.50%,82.50%,79.50%,89.29%,100.00%,78.57%,90.00%,86.00%,85.00%,67.92%
+39,Command R7B (FC),82.29%,81.67%,84.02%,68.17%,92.50%,56.00%,56.00%,91.50%,85.50%,81.50%,87.07%,97.00%,77.14%,92.00%,82.00%,75.00%,77.92%
+40,o1-mini-2024-09-12 (Prompt),81.97%,78.92%,82.70%,71.17%,87.50%,62.00%,64.00%,89.00%,83.50%,72.00%,89.29%,100.00%,78.57%,86.00%,78.00%,77.50%,91.25%
+41,Llama-3.1-8B-Instruct (Prompt),81.89%,84.21%,86.30%,72.83%,93.50%,59.00%,66.00%,93.50%,87.00%,83.50%,83.71%,96.00%,71.43%,96.00%,88.00%,77.50%,55.00%
+42,mistral-large-2407 (FC),81.73%,86.81%,84.38%,74.25%,95.75%,61.00%,66.00%,92.50%,90.00%,90.50%,75.00%,100.00%,50.00%,94.00%,86.00%,82.50%,50.83%
+43,Gemini-1.5-Flash-002 (Prompt),81.16%,81.65%,80.64%,73.58%,94.75%,60.00%,66.00%,91.50%,90.00%,71.50%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25%
+44,mistral-large-2407 (Prompt),80.62%,90.54%,90.12%,82.17%,96.50%,66.00%,84.00%,97.00%,92.50%,90.50%,100.00%,100.00%,100.00%,94.00%,84.00%,82.50%,2.92%
+45,Claude-3-Opus-20240229 (Prompt),79.99%,85.31%,86.32%,79.75%,96.25%,65.00%,78.00%,95.00%,85.50%,81.00%,99.29%,100.00%,98.57%,90.00%,86.00%,70.00%,33.33%
+46,Llama-3.2-3B-Instruct (Prompt),79.72%,80.56%,83.70%,73.75%,92.25%,57.00%,72.00%,92.00%,80.50%,76.00%,87.29%,96.00%,78.57%,92.00%,78.00%,77.50%,60.42%
+47,Qwen2.5-3B-Instruct (Prompt),79.22%,80.79%,81.71%,74.17%,91.50%,59.00%,72.00%,90.50%,79.50%,79.00%,80.86%,96.00%,65.71%,86.00%,80.00%,80.00%,62.92%
+48,Gemini-1.5-Flash-002 (FC),79.15%,81.75%,73.21%,65.50%,87.50%,57.00%,52.00%,91.50%,80.50%,89.50%,68.86%,72.00%,65.71%,90.00%,54.00%,80.00%,92.50%
+49,xLAM-7b-r (FC),79.03%,81.06%,79.88%,74.25%,90.75%,62.00%,70.00%,95.50%,81.00%,73.50%,74.00%,98.00%,50.00%,96.00%,82.00%,67.50%,67.50%
+50,Ministral-8B-Instruct-2410 (FC),79.01%,83.83%,79.57%,71.83%,93.50%,60.00%,62.00%,91.50%,84.50%,87.50%,71.29%,94.00%,48.57%,86.00%,86.00%,75.00%,57.50%
+51,Mistral-Medium-2312 (Prompt),78.62%,73.12%,81.57%,69.50%,91.50%,57.00%,60.00%,88.50%,69.00%,65.50%,93.29%,98.00%,88.57%,86.00%,72.00%,75.00%,88.75%
+52,Open-Mistral-Nemo-2407 (FC),78.60%,82.10%,77.66%,64.42%,91.25%,34.00%,68.00%,93.50%,85.50%,85.00%,56.14%,98.00%,14.29%,94.00%,88.00%,72.50%,68.33%
+53,Open-Mistral-Nemo-2407 (Prompt),78.37%,86.12%,89.07%,77.00%,92.00%,59.00%,80.00%,93.50%,89.50%,84.50%,93.79%,99.00%,88.57%,92.00%,88.00%,82.50%,4.58%
+54,GPT-3.5-Turbo-0125 (FC),78.20%,83.94%,83.79%,74.25%,94.75%,62.00%,66.00%,93.50%,89.00%,79.00%,96.14%,98.00%,94.29%,88.00%,86.00%,65.00%,32.92%
+55,Qwen2.5-1.5B-Instruct (Prompt),77.93%,73.37%,85.61%,71.00%,89.00%,54.00%,70.00%,86.00%,70.00%,66.50%,80.43%,98.00%,62.86%,94.00%,88.00%,80.00%,65.42%
+56,Gemini-1.5-Flash-001 (FC),76.51%,77.54%,74.80%,65.17%,93.50%,56.00%,46.00%,94.50%,73.00%,77.50%,62.21%,93.00%,31.43%,88.00%,74.00%,75.00%,79.17%
+57,Command-R-Plus (FC),75.93%,77.02%,81.21%,72.08%,87.25%,59.00%,70.00%,89.50%,82.50%,64.00%,90.86%,96.00%,85.71%,90.00%,84.00%,60.00%,50.42%
+58,Claude-3.5-Sonnet-20241022 (Prompt),75.59%,72.48%,80.00%,81.42%,94.25%,68.00%,82.00%,92.00%,70.50%,46.00%,100.00%,100.00%,100.00%,92.00%,68.00%,60.00%,70.42%
+59,Hermes-2-Pro-Llama-3-8B (FC),74.54%,76.79%,76.23%,64.17%,90.50%,56.00%,46.00%,89.50%,80.00%,73.50%,70.43%,98.00%,42.86%,94.00%,78.00%,62.50%,58.75%
+60,Qwen2-7B-Instruct (Prompt),73.06%,76.65%,76.80%,68.08%,84.25%,58.00%,62.00%,88.00%,75.50%,75.00%,80.21%,89.00%,71.43%,84.00%,78.00%,65.00%,43.75%
+61,xLAM-8x7b-r (FC),71.17%,67.65%,74.05%,73.58%,91.75%,59.00%,70.00%,90.00%,69.00%,38.00%,89.21%,97.00%,81.43%,90.00%,72.00%,45.00%,73.75%
+62,GPT-3.5-Turbo-0125 (Prompt),70.79%,72.85%,70.39%,77.92%,96.75%,61.00%,76.00%,93.50%,67.00%,53.00%,57.57%,98.00%,17.14%,90.00%,74.00%,60.00%,64.17%
+63,Hammer2.1-0.5b (FC),70.70%,69.12%,70.46%,68.00%,84.00%,62.00%,58.00%,83.00%,71.50%,54.00%,68.36%,91.00%,45.71%,84.00%,82.00%,47.50%,77.92%
+64,Hermes-2-Pro-Mistral-7B (FC),69.12%,73.06%,76.00%,60.75%,86.25%,56.00%,40.00%,87.50%,78.50%,65.50%,61.00%,92.00%,30.00%,94.00%,84.00%,65.00%,25.83%
+65,Open-Mixtral-8x7b (Prompt),66.33%,63.58%,69.61%,64.83%,89.50%,51.00%,54.00%,86.00%,59.00%,44.50%,77.93%,93.00%,62.86%,86.00%,62.00%,52.50%,64.17%
+66,xLAM-7b-fc-r (FC),64.40%,72.08%,60.63%,76.83%,93.50%,65.00%,72.00%,93.50%,77.00%,41.00%,84.50%,99.00%,70.00%,92.00%,56.00%,10.00%,48.75%
+67,DBRX-Instruct (Prompt),62.58%,61.25%,69.14%,73.50%,92.50%,56.00%,72.00%,92.00%,42.50%,37.00%,90.07%,93.00%,87.14%,88.00%,46.00%,52.50%,41.67%
+68,Gemini-1.0-Pro-002 (FC),62.04%,56.65%,64.93%,66.58%,93.75%,52.00%,54.00%,95.00%,40.00%,25.00%,87.21%,93.00%,81.43%,86.00%,64.00%,22.50%,72.08%
+69,Claude-3-Opus-20240229 (FC),61.10%,57.92%,59.46%,67.17%,88.50%,59.00%,54.00%,93.00%,39.50%,32.00%,80.36%,95.00%,65.71%,88.00%,42.00%,27.50%,80.42%
+70,Mistral-small-2402 (FC),59.57%,59.15%,53.84%,67.58%,91.75%,59.00%,52.00%,94.00%,24.50%,50.50%,87.36%,99.00%,75.71%,92.00%,16.00%,20.00%,84.17%
+71,MiniCPM3-4B (Prompt),59.31%,65.88%,50.59%,63.50%,84.50%,48.00%,58.00%,72.50%,65.50%,62.00%,40.36%,35.00%,45.71%,34.00%,48.00%,80.00%,67.92%
+72,Open-Mixtral-8x22b (FC),58.93%,61.67%,63.64%,71.67%,93.00%,66.00%,56.00%,94.00%,10.50%,70.50%,83.57%,100.00%,67.14%,94.00%,22.00%,55.00%,29.17%
+73,Gemini-1.0-Pro-002 (Prompt),56.62%,57.31%,56.32%,46.25%,58.75%,26.00%,54.00%,56.50%,63.50%,63.00%,49.79%,61.00%,38.57%,68.00%,60.00%,47.50%,55.00%
+74,Nexusflow-Raven-v2 (FC),55.59%,45.88%,59.11%,57.50%,37.50%,63.00%,72.00%,53.00%,34.00%,39.00%,47.93%,83.00%,12.86%,86.00%,40.00%,62.50%,80.42%
75,Meta-Llama-3-8B-Instruct (Prompt),54.23%,60.79%,58.93%,62.67%,87.00%,47.00%,54.00%,82.50%,48.00%,50.00%,47.71%,84.00%,11.43%,86.00%,42.00%,60.00%,9.17%
76,Qwen2.5-0.5B-Instruct (Prompt),52.58%,53.19%,61.89%,58.25%,76.75%,44.00%,54.00%,68.00%,53.50%,33.00%,63.07%,89.00%,37.14%,70.00%,62.00%,52.50%,12.92%
77,Claude-3.5-Sonnet-20241022 (FC),49.44%,45.44%,47.89%,78.75%,95.25%,65.00%,76.00%,94.50%,3.50%,5.00%,97.57%,98.00%,97.14%,90.00%,4.00%,0.00%,71.67%
diff --git a/data_overall.csv b/data_overall.csv
index c007560bd..d3e23d25e 100644
--- a/data_overall.csv
+++ b/data_overall.csv
@@ -33,52 +33,52 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s
32,55.49%,DeepSeek-Coder-V2 (FC),https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct-0724,N/A,29.53,108.9,59.61,89.44%,78.75%,94.50%,93.50%,91.00%,91.23%,96.43%,94.00%,92.00%,82.50%,73.43%,80.23%,77.02%,43.75%,70.83%,4.50%,7.50%,3.00%,4.00%,3.50%,88.89%,70.81%,DeepSeek,DeepSeek License
33,54.86%,Hammer2.1-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-1.5b,N/A,2.73,3.86,7.45,82.79%,74.67%,92.00%,84.50%,80.00%,83.39%,86.57%,90.00%,82.00%,75.00%,70.59%,70.93%,69.80%,50.00%,62.50%,10.50%,14.50%,12.50%,9.00%,6.00%,77.78%,79.27%,MadeAgents,cc-by-nc-4.0
34,54.70%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.74,25.13,24.76,81.06%,74.25%,95.50%,81.00%,73.50%,79.88%,74.00%,96.00%,82.00%,67.50%,75.08%,71.32%,74.93%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,77.11%,Salesforce,cc-by-nc-4.0
-35,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,N/A,N/A,N/A,N/A,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary
-36,54.26%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.84,5.21,3.57,83.19%,76.25%,93.00%,84.00%,79.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.64%,83.72%,75.02%,87.50%,54.17%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,65.78%,Anthropic,Proprietary
-37,54.09%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.95,14.92,14.75,89.98%,77.92%,96.00%,94.50%,91.50%,90.12%,94.00%,98.00%,86.00%,82.50%,62.06%,77.13%,76.16%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,100.00%,54.78%,Meta,Meta Llama 3 Community
-38,53.88%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.38,0.87,1.45,1.47,83.94%,74.25%,93.50%,89.00%,79.00%,83.79%,96.14%,88.00%,86.00%,65.00%,63.93%,80.62%,79.68%,43.75%,58.33%,19.50%,32.50%,11.50%,21.50%,12.50%,94.44%,36.53%,OpenAI,Proprietary
-39,53.66%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.54,11.64,11.02,86.46%,75.33%,94.50%,91.50%,84.50%,88.29%,92.14%,90.00%,86.00%,85.00%,67.35%,75.97%,74.93%,62.50%,70.83%,7.62%,9.50%,8.50%,7.00%,5.50%,88.89%,65.16%,Qwen,apache-2.0
-40,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,N/A,N/A,N/A,N/A,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary
-41,53.03%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.17,3.87,88.46%,80.33%,94.00%,91.50%,88.00%,87.54%,96.64%,92.00%,84.00%,77.50%,65.57%,78.29%,78.35%,56.25%,70.83%,8.62%,13.50%,7.00%,11.00%,3.00%,94.44%,53.02%,Fireworks,Apache 2.0
-42,52.55%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.96,5.57,10.33,67.65%,73.58%,90.00%,69.00%,38.00%,74.05%,89.21%,90.00%,72.00%,45.00%,70.99%,74.03%,79.30%,43.75%,58.33%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.15%,Salesforce,cc-by-nc-4.0
-43,52.17%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.57,18.11,9.97,73.12%,69.50%,88.50%,69.00%,65.50%,81.57%,93.29%,86.00%,72.00%,75.00%,77.52%,75.19%,74.07%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,66.67%,85.93%,Mistral AI,Proprietary
-44,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,N/A,N/A,N/A,N/A,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0
-45,52.17%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,4.97,8.07,13.9,88.94%,79.75%,92.50%,91.50%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,67.04%,84.50%,79.39%,68.75%,62.50%,2.38%,4.50%,2.00%,1.50%,1.50%,94.44%,59.19%,Google,gemma-terms-of-use
-46,51.75%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,3.65,3.82,10.68,87.81%,76.75%,95.00%,92.50%,87.00%,88.21%,95.86%,94.00%,78.00%,85.00%,64.90%,80.62%,78.25%,75.00%,66.67%,5.62%,10.00%,4.00%,6.00%,2.50%,100.00%,50.88%,Meta,Meta Llama 3 Community
-47,51.73%,Ministral-8B-Instruct-2410 (FC),https://huggingface.co/mistralai/Ministral-8B-Instruct-2410,N/A,12.79,45.03,47.12,83.83%,71.83%,91.50%,84.50%,87.50%,79.57%,71.29%,86.00%,86.00%,75.00%,64.93%,75.19%,72.27%,62.50%,66.67%,11.25%,21.00%,8.50%,10.00%,5.50%,70.59%,55.28%,Mistral AI,Mistral AI Research License
-48,51.66%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,160.19,184.1,464.0,80.83%,69.83%,91.50%,82.50%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.97%,74.42%,63.91%,43.75%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,72.22%,72.22%,openbmb,Apache-2.0
-49,51.66%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.81,1.35,3.3,72.48%,81.42%,92.00%,70.50%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.88%,86.05%,80.06%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,77.78%,64.40%,Anthropic,Proprietary
-50,51.55%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.23,8.66,13.19,85.29%,75.67%,90.50%,88.50%,86.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.84%,76.36%,74.26%,62.50%,62.50%,1.62%,2.00%,4.00%,0.50%,0.00%,83.33%,66.51%,Google,gemma-terms-of-use
-51,51.37%,Llama-3.3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.98,25.27,23.42,85.08%,74.83%,94.50%,84.00%,87.00%,90.68%,95.71%,98.00%,84.00%,85.00%,62.59%,80.62%,77.11%,93.75%,62.50%,6.87%,9.00%,8.00%,4.50%,6.00%,100.00%,48.71%,Meta,Meta Llama 3 Community
-52,51.32%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.6,8.24,10.54,85.31%,79.75%,95.00%,85.50%,81.00%,86.32%,99.29%,90.00%,86.00%,70.00%,66.86%,84.11%,79.11%,68.75%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,83.33%,40.25%,Anthropic,Proprietary
-53,51.22%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.55,1.96,3.42,82.10%,64.42%,93.50%,85.50%,85.00%,77.66%,56.14%,94.00%,88.00%,72.50%,65.93%,77.13%,69.61%,75.00%,66.67%,9.12%,15.00%,3.50%,9.00%,9.00%,66.67%,63.19%,Mistral AI,Proprietary
-54,50.70%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.3,52.75,24.53,84.21%,72.83%,93.50%,87.00%,83.50%,86.30%,83.71%,96.00%,88.00%,77.50%,60.95%,73.26%,73.31%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,77.78%,48.82%,Meta,Meta Llama 3 Community
-55,50.33%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.36,6.03,3.17,88.02%,78.58%,94.00%,89.50%,90.00%,87.77%,93.57%,96.00%,84.00%,77.50%,65.93%,82.17%,72.65%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,83.33%,55.09%,Mistral AI,Proprietary
-56,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,N/A,N/A,N/A,N/A,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0
-57,49.28%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.84,1.74,5.0,82.46%,72.83%,91.50%,84.00%,81.50%,86.36%,84.93%,92.00%,86.00%,82.50%,59.57%,67.83%,56.32%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,74.82%,IBM,Apache-2.0
-58,48.29%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.72,1.79,1.21,72.85%,77.92%,93.50%,67.00%,53.00%,70.39%,57.57%,90.00%,74.00%,60.00%,68.46%,79.84%,78.63%,75.00%,58.33%,5.62%,9.00%,2.00%,7.00%,4.50%,94.44%,58.39%,OpenAI,Proprietary
-59,47.27%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,3.97,3.91,8.21,76.79%,64.17%,89.50%,80.00%,73.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.90%,71.71%,65.81%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.78%,NousResearch,apache-2.0
-60,47.23%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.32,3.99,6.94,90.54%,82.17%,97.00%,92.50%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.69%,85.27%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.35%,Mistral AI,Proprietary
-61,47.06%,Qwen2.5-3B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-3B-Instruct,N/A,1.03,1.43,1.78,80.79%,74.17%,90.50%,79.50%,79.00%,81.71%,80.86%,86.00%,80.00%,80.00%,58.60%,68.99%,66.48%,56.25%,62.50%,3.38%,5.50%,3.50%,2.00%,2.50%,88.89%,54.19%,Qwen,qwen
-62,46.91%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.79,45.67,14.71,80.56%,73.75%,92.00%,80.50%,76.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.75%,63.57%,64.86%,12.50%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,88.89%,51.69%,Meta,Meta Llama 3 Community
-63,46.70%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.51,6.07,4.63,73.37%,71.00%,86.00%,70.00%,66.50%,85.61%,80.43%,94.00%,88.00%,80.00%,61.04%,70.16%,59.26%,56.25%,41.67%,1.12%,1.50%,2.50%,0.50%,0.00%,83.33%,63.04%,Qwen,apache-2.0
-64,45.27%,Hammer2.1-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-0.5b,N/A,1.29,3.16,2.85,69.12%,68.00%,83.00%,71.50%,54.00%,70.46%,68.36%,84.00%,82.00%,47.50%,62.86%,59.69%,58.02%,50.00%,45.83%,2.25%,4.00%,0.50%,3.00%,1.50%,77.78%,73.94%,MadeAgents,cc-by-nc-4.0
-65,44.83%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.38,2.9,3.4,56.65%,66.58%,95.00%,40.00%,25.00%,64.93%,87.21%,86.00%,64.00%,22.50%,69.57%,77.13%,67.62%,43.75%,41.67%,2.88%,4.50%,1.00%,3.50%,2.50%,66.67%,71.53%,Google,Proprietary
-66,44.76%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,1.73,4.04,3.5,59.15%,67.58%,94.00%,24.50%,50.50%,53.84%,87.36%,92.00%,16.00%,20.00%,72.10%,64.73%,71.51%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,77.78%,80.86%,Mistral AI,Proprietary
-67,43.12%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,10.63,32.72,23.76,73.06%,60.75%,87.50%,78.50%,65.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.62%,68.99%,60.02%,43.75%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.88%,NousResearch,apache-2.0
-68,43.07%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,1.73,4.5,3.51,63.58%,64.83%,86.00%,59.00%,44.50%,69.61%,77.93%,86.00%,62.00%,52.50%,61.39%,63.18%,66.10%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.89%,59.52%,Mistral AI,Proprietary
-69,42.99%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,7.0,2.63,15.88,5.36,61.67%,71.67%,94.00%,10.50%,70.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.55%,76.36%,73.12%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,83.33%,45.71%,Mistral AI,Proprietary
-70,42.53%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.65,10.01,3.26,86.12%,77.00%,93.50%,89.50%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.96%,77.13%,74.45%,87.50%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,88.89%,6.43%,Mistral AI,Proprietary
-71,42.30%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.99,10.26,9.78,76.65%,68.08%,88.00%,75.50%,75.00%,76.80%,80.21%,84.00%,78.00%,65.00%,50.60%,56.59%,62.01%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,88.89%,39.00%,Qwen,apache-2.0
-72,40.91%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.74,8.22,11.19,61.25%,73.50%,92.00%,42.50%,37.00%,69.14%,90.07%,88.00%,46.00%,52.50%,60.15%,77.13%,73.03%,75.00%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,40.50%,Databricks,Databricks Open Model
-73,39.97%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.27,3.77,3.71,43.00%,80.00%,92.00%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,70.41%,71.32%,72.93%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.44%,71.80%,Fireworks,Apache 2.0
-74,39.25%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,6.26,4.43,13.94,72.08%,76.83%,93.50%,77.00%,41.00%,60.63%,84.50%,92.00%,56.00%,10.00%,53.35%,78.29%,58.02%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,44.95%,Salesforce,cc-by-nc-4.0
-75,38.94%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,6.09,15.35,13.2,36.67%,65.17%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.77%,72.09%,64.39%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.71%,THUDM,glm-4
-76,38.59%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.78,49.16,64.58,65.88%,63.50%,72.50%,65.50%,62.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.46%,46.51%,34.76%,43.75%,41.67%,2.00%,3.00%,3.50%,1.00%,0.50%,50.00%,74.43%,openbmb,Apache-2.0
-77,36.92%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.13,0.55,2.27,45.88%,57.50%,53.00%,34.00%,39.00%,59.11%,47.93%,86.00%,40.00%,62.50%,54.15%,41.47%,38.65%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,61.11%,78.53%,Nexusflow,Apache 2.0
-78,35.69%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.33,2.38,2.97,57.31%,46.25%,56.50%,63.50%,63.00%,56.32%,49.79%,68.00%,60.00%,47.50%,49.09%,50.39%,47.01%,62.50%,29.17%,1.38%,2.50%,1.50%,0.50%,1.00%,77.78%,52.95%,Google,Proprietary
-79,34.30%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.05,8.85,20.61,60.79%,62.67%,82.50%,48.00%,50.00%,58.93%,47.71%,86.00%,42.00%,60.00%,47.93%,60.85%,61.44%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.59%,Meta,Meta Llama 3 Community
-80,31.78%,GoGoAgent,https://gogoagent.ai,N/A,2.8,1.97,6.23,10.92%,43.67%,0.00%,0.00%,0.00%,89.86%,95.43%,96.00%,88.00%,80.00%,39.18%,0.00%,0.00%,0.00%,0.00%,1.00%,1.50%,2.00%,0.50%,0.00%,0.00%,96.67%,BitAgent,Proprietary
+35,54.52%,GoGoAgent,https://gogoagent.ai,N/A,2.66,3.08,5.56,86.23%,75.42%,93.00%,92.00%,84.50%,89.86%,95.43%,96.00%,88.00%,80.00%,73.92%,72.09%,75.40%,68.75%,66.67%,1.00%,1.50%,2.00%,0.50%,0.00%,77.78%,83.12%,BitAgent,Proprietary
+36,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,N/A,N/A,N/A,N/A,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary
+37,54.26%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.84,5.21,3.57,83.19%,76.25%,93.00%,84.00%,79.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.64%,83.72%,75.02%,87.50%,54.17%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,65.78%,Anthropic,Proprietary
+38,54.09%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.95,14.92,14.75,89.98%,77.92%,96.00%,94.50%,91.50%,90.12%,94.00%,98.00%,86.00%,82.50%,62.06%,77.13%,76.16%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,100.00%,54.78%,Meta,Meta Llama 3 Community
+39,53.88%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.38,0.87,1.45,1.47,83.94%,74.25%,93.50%,89.00%,79.00%,83.79%,96.14%,88.00%,86.00%,65.00%,63.93%,80.62%,79.68%,43.75%,58.33%,19.50%,32.50%,11.50%,21.50%,12.50%,94.44%,36.53%,OpenAI,Proprietary
+40,53.66%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.54,11.64,11.02,86.46%,75.33%,94.50%,91.50%,84.50%,88.29%,92.14%,90.00%,86.00%,85.00%,67.35%,75.97%,74.93%,62.50%,70.83%,7.62%,9.50%,8.50%,7.00%,5.50%,88.89%,65.16%,Qwen,apache-2.0
+41,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,N/A,N/A,N/A,N/A,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary
+42,53.03%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.17,3.87,88.46%,80.33%,94.00%,91.50%,88.00%,87.54%,96.64%,92.00%,84.00%,77.50%,65.57%,78.29%,78.35%,56.25%,70.83%,8.62%,13.50%,7.00%,11.00%,3.00%,94.44%,53.02%,Fireworks,Apache 2.0
+43,52.55%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.96,5.57,10.33,67.65%,73.58%,90.00%,69.00%,38.00%,74.05%,89.21%,90.00%,72.00%,45.00%,70.99%,74.03%,79.30%,43.75%,58.33%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.15%,Salesforce,cc-by-nc-4.0
+44,52.17%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.57,18.11,9.97,73.12%,69.50%,88.50%,69.00%,65.50%,81.57%,93.29%,86.00%,72.00%,75.00%,77.52%,75.19%,74.07%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,66.67%,85.93%,Mistral AI,Proprietary
+45,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,N/A,N/A,N/A,N/A,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0
+46,52.17%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,4.97,8.07,13.9,88.94%,79.75%,92.50%,91.50%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,67.04%,84.50%,79.39%,68.75%,62.50%,2.38%,4.50%,2.00%,1.50%,1.50%,94.44%,59.19%,Google,gemma-terms-of-use
+47,51.75%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,3.65,3.82,10.68,87.81%,76.75%,95.00%,92.50%,87.00%,88.21%,95.86%,94.00%,78.00%,85.00%,64.90%,80.62%,78.25%,75.00%,66.67%,5.62%,10.00%,4.00%,6.00%,2.50%,100.00%,50.88%,Meta,Meta Llama 3 Community
+48,51.73%,Ministral-8B-Instruct-2410 (FC),https://huggingface.co/mistralai/Ministral-8B-Instruct-2410,N/A,12.79,45.03,47.12,83.83%,71.83%,91.50%,84.50%,87.50%,79.57%,71.29%,86.00%,86.00%,75.00%,64.93%,75.19%,72.27%,62.50%,66.67%,11.25%,21.00%,8.50%,10.00%,5.50%,70.59%,55.28%,Mistral AI,Mistral AI Research License
+49,51.66%,MiniCPM3-4B-FC (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,160.19,184.1,464.0,80.83%,69.83%,91.50%,82.50%,79.50%,87.57%,89.29%,90.00%,86.00%,85.00%,69.97%,74.42%,63.91%,43.75%,62.50%,2.62%,5.00%,1.00%,3.00%,1.50%,72.22%,72.22%,openbmb,Apache-2.0
+50,51.66%,Claude-3.5-Sonnet-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,1.23,1.81,1.35,3.3,72.48%,81.42%,92.00%,70.50%,46.00%,80.00%,100.00%,92.00%,68.00%,60.00%,71.88%,86.05%,80.06%,81.25%,45.83%,7.50%,9.00%,5.50%,5.00%,10.50%,77.78%,64.40%,Anthropic,Proprietary
+51,51.55%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,5.23,8.66,13.19,85.29%,75.67%,90.50%,88.50%,86.50%,87.52%,88.07%,94.00%,88.00%,80.00%,67.84%,76.36%,74.26%,62.50%,62.50%,1.62%,2.00%,4.00%,0.50%,0.00%,83.33%,66.51%,Google,gemma-terms-of-use
+52,51.37%,Llama-3.3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.98,25.27,23.42,85.08%,74.83%,94.50%,84.00%,87.00%,90.68%,95.71%,98.00%,84.00%,85.00%,62.59%,80.62%,77.11%,93.75%,62.50%,6.87%,9.00%,8.00%,4.50%,6.00%,100.00%,48.71%,Meta,Meta Llama 3 Community
+53,51.32%,Claude-3-Opus-20240229 (Prompt),https://www.anthropic.com/news/claude-3-family,10.48,4.6,8.24,10.54,85.31%,79.75%,95.00%,85.50%,81.00%,86.32%,99.29%,90.00%,86.00%,70.00%,66.86%,84.11%,79.11%,68.75%,54.17%,7.13%,11.50%,2.50%,6.00%,8.50%,83.33%,40.25%,Anthropic,Proprietary
+54,51.22%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.55,1.96,3.42,82.10%,64.42%,93.50%,85.50%,85.00%,77.66%,56.14%,94.00%,88.00%,72.50%,65.93%,77.13%,69.61%,75.00%,66.67%,9.12%,15.00%,3.50%,9.00%,9.00%,66.67%,63.19%,Mistral AI,Proprietary
+55,50.70%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.3,52.75,24.53,84.21%,72.83%,93.50%,87.00%,83.50%,86.30%,83.71%,96.00%,88.00%,77.50%,60.95%,73.26%,73.31%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,77.78%,48.82%,Meta,Meta Llama 3 Community
+56,50.33%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.36,6.03,3.17,88.02%,78.58%,94.00%,89.50%,90.00%,87.77%,93.57%,96.00%,84.00%,77.50%,65.93%,82.17%,72.65%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,83.33%,55.09%,Mistral AI,Proprietary
+57,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,N/A,N/A,N/A,N/A,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0
+58,49.28%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.84,1.74,5.0,82.46%,72.83%,91.50%,84.00%,81.50%,86.36%,84.93%,92.00%,86.00%,82.50%,59.57%,67.83%,56.32%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,74.82%,IBM,Apache-2.0
+59,48.29%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.72,1.79,1.21,72.85%,77.92%,93.50%,67.00%,53.00%,70.39%,57.57%,90.00%,74.00%,60.00%,68.46%,79.84%,78.63%,75.00%,58.33%,5.62%,9.00%,2.00%,7.00%,4.50%,94.44%,58.39%,OpenAI,Proprietary
+60,47.27%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,3.97,3.91,8.21,76.79%,64.17%,89.50%,80.00%,73.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.90%,71.71%,65.81%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.78%,NousResearch,apache-2.0
+61,47.23%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,24.91,3.32,3.99,6.94,90.54%,82.17%,97.00%,92.50%,90.50%,90.12%,100.00%,94.00%,84.00%,82.50%,52.69%,85.27%,81.96%,93.75%,79.17%,8.38%,15.00%,6.00%,6.00%,6.50%,100.00%,4.35%,Mistral AI,Proprietary
+62,47.06%,Qwen2.5-3B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-3B-Instruct,N/A,1.03,1.43,1.78,80.79%,74.17%,90.50%,79.50%,79.00%,81.71%,80.86%,86.00%,80.00%,80.00%,58.60%,68.99%,66.48%,56.25%,62.50%,3.38%,5.50%,3.50%,2.00%,2.50%,88.89%,54.19%,Qwen,qwen
+63,46.91%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,7.79,45.67,14.71,80.56%,73.75%,92.00%,80.50%,76.00%,83.70%,87.29%,92.00%,78.00%,77.50%,55.75%,63.57%,64.86%,12.50%,45.83%,5.25%,8.50%,2.50%,4.50%,5.50%,88.89%,51.69%,Meta,Meta Llama 3 Community
+64,46.70%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,2.51,6.07,4.63,73.37%,71.00%,86.00%,70.00%,66.50%,85.61%,80.43%,94.00%,88.00%,80.00%,61.04%,70.16%,59.26%,56.25%,41.67%,1.12%,1.50%,2.50%,0.50%,0.00%,83.33%,63.04%,Qwen,apache-2.0
+65,45.27%,Hammer2.1-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-0.5b,N/A,1.29,3.16,2.85,69.12%,68.00%,83.00%,71.50%,54.00%,70.46%,68.36%,84.00%,82.00%,47.50%,62.86%,59.69%,58.02%,50.00%,45.83%,2.25%,4.00%,0.50%,3.00%,1.50%,77.78%,73.94%,MadeAgents,cc-by-nc-4.0
+66,44.83%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.76,1.38,2.9,3.4,56.65%,66.58%,95.00%,40.00%,25.00%,64.93%,87.21%,86.00%,64.00%,22.50%,69.57%,77.13%,67.62%,43.75%,41.67%,2.88%,4.50%,1.00%,3.50%,2.50%,66.67%,71.53%,Google,Proprietary
+67,44.76%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.36,1.73,4.04,3.5,59.15%,67.58%,94.00%,24.50%,50.50%,53.84%,87.36%,92.00%,16.00%,20.00%,72.10%,64.73%,71.51%,12.50%,12.50%,2.62%,4.50%,0.00%,3.00%,3.00%,77.78%,80.86%,Mistral AI,Proprietary
+68,43.12%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,10.63,32.72,23.76,73.06%,60.75%,87.50%,78.50%,65.50%,76.00%,61.00%,94.00%,84.00%,65.00%,57.62%,68.99%,60.02%,43.75%,41.67%,2.63%,3.50%,4.00%,2.50%,0.50%,66.67%,38.88%,NousResearch,apache-2.0
+69,43.07%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.74,1.73,4.5,3.51,63.58%,64.83%,86.00%,59.00%,44.50%,69.61%,77.93%,86.00%,62.00%,52.50%,61.39%,63.18%,66.10%,68.75%,50.00%,1.50%,2.50%,0.00%,1.50%,2.00%,88.89%,59.52%,Mistral AI,Proprietary
+70,42.99%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,7.0,2.63,15.88,5.36,61.67%,71.67%,94.00%,10.50%,70.50%,63.64%,83.57%,94.00%,22.00%,55.00%,68.55%,76.36%,73.12%,6.25%,45.83%,1.50%,3.50%,0.00%,1.00%,1.50%,83.33%,45.71%,Mistral AI,Proprietary
+71,42.53%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.79,1.65,10.01,3.26,86.12%,77.00%,93.50%,89.50%,84.50%,89.07%,93.79%,92.00%,88.00%,82.50%,48.96%,77.13%,74.45%,87.50%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,88.89%,6.43%,Mistral AI,Proprietary
+72,42.30%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,3.99,10.26,9.78,76.65%,68.08%,88.00%,75.50%,75.00%,76.80%,80.21%,84.00%,78.00%,65.00%,50.60%,56.59%,62.01%,37.50%,66.67%,3.25%,4.00%,4.50%,2.50%,2.00%,88.89%,39.00%,Qwen,apache-2.0
+73,40.91%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.49,3.74,8.22,11.19,61.25%,73.50%,92.00%,42.50%,37.00%,69.14%,90.07%,88.00%,46.00%,52.50%,60.15%,77.13%,73.03%,75.00%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,40.50%,Databricks,Databricks Open Model
+74,39.97%,FireFunction-v1 (FC),https://huggingface.co/fireworks-ai/firefunction-v1,N/A,2.27,3.77,3.71,43.00%,80.00%,92.00%,0.00%,0.00%,44.57%,88.29%,90.00%,0.00%,0.00%,70.41%,71.32%,72.93%,0.00%,0.00%,2.38%,5.00%,0.00%,2.00%,2.50%,94.44%,71.80%,Fireworks,Apache 2.0
+75,39.25%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,6.26,4.43,13.94,72.08%,76.83%,93.50%,77.00%,41.00%,60.63%,84.50%,92.00%,56.00%,10.00%,53.35%,78.29%,58.02%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,77.78%,44.95%,Salesforce,cc-by-nc-4.0
+76,38.94%,GLM-4-9b-Chat (FC),https://huggingface.co/THUDM/glm-4-9b-chat,N/A,6.09,15.35,13.2,36.67%,65.17%,81.50%,0.00%,0.00%,46.00%,94.00%,90.00%,0.00%,0.00%,66.77%,72.09%,64.39%,0.00%,0.00%,3.50%,3.50%,4.00%,2.50%,4.00%,66.67%,79.71%,THUDM,glm-4
+77,38.59%,MiniCPM3-4B (Prompt),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,20.78,49.16,64.58,65.88%,63.50%,72.50%,65.50%,62.00%,50.59%,40.36%,34.00%,48.00%,80.00%,54.46%,46.51%,34.76%,43.75%,41.67%,2.00%,3.00%,3.50%,1.00%,0.50%,50.00%,74.43%,openbmb,Apache-2.0
+78,36.92%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,1.13,0.55,2.27,45.88%,57.50%,53.00%,34.00%,39.00%,59.11%,47.93%,86.00%,40.00%,62.50%,54.15%,41.47%,38.65%,56.25%,37.50%,1.00%,1.50%,0.50%,1.00%,1.00%,61.11%,78.53%,Nexusflow,Apache 2.0
+79,35.69%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,1.33,2.38,2.97,57.31%,46.25%,56.50%,63.50%,63.00%,56.32%,49.79%,68.00%,60.00%,47.50%,49.09%,50.39%,47.01%,62.50%,29.17%,1.38%,2.50%,1.50%,0.50%,1.00%,77.78%,52.95%,Google,Proprietary
+80,34.30%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,6.05,8.85,20.61,60.79%,62.67%,82.50%,48.00%,50.00%,58.93%,47.71%,86.00%,42.00%,60.00%,47.93%,60.85%,61.44%,37.50%,33.33%,0.75%,1.50%,0.00%,1.00%,0.50%,77.78%,18.59%,Meta,Meta Llama 3 Community
81,31.25%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.91,1.57,0.96,3.37,26.94%,23.25%,74.00%,8.50%,2.00%,30.36%,52.93%,64.00%,2.00%,2.50%,58.73%,36.05%,65.24%,0.00%,8.33%,0.75%,0.50%,0.00%,1.50%,1.00%,44.44%,69.74%,Mistral AI,Proprietary
82,29.27%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,3.09,11.89,5.41,54.29%,51.17%,79.00%,46.50%,40.50%,52.39%,46.57%,76.00%,52.00%,35.00%,39.00%,48.45%,40.27%,12.50%,25.00%,0.50%,0.50%,1.00%,0.00%,0.50%,94.44%,21.19%,Qwen,apache-2.0
83,28.06%,Qwen2.5-0.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct,N/A,0.95,1.25,1.47,53.19%,58.25%,68.00%,53.50%,33.00%,61.89%,63.07%,70.00%,62.00%,52.50%,31.59%,53.88%,34.76%,56.25%,16.67%,0.00%,0.00%,0.00%,0.00%,0.00%,94.44%,16.44%,Qwen,apache-2.0
From 5a44385ce642e0c8f3119c405e372c2f300f4268 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Tue, 31 Dec 2024 21:01:31 +0800
Subject: [PATCH 5/5] update cost & latency info
---
data_overall.csv | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/data_overall.csv b/data_overall.csv
index d3e23d25e..3c56a5c90 100644
--- a/data_overall.csv
+++ b/data_overall.csv
@@ -1,10 +1,10 @@
Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Relevance Detection,Irrelevance Detection,Organization,License
1,74.24%,watt-tool-70B (FC),https://huggingface.co/watt-ai/watt-tool-70B/,N/A,3.4,12.61,7.7,84.06%,78.75%,94.00%,85.50%,78.00%,89.39%,98.57%,94.00%,90.00%,75.00%,77.65%,84.88%,83.48%,81.25%,66.67%,58.62%,67.00%,57.50%,48.50%,61.50%,94.44%,76.32%,Watt AI Lab,Apache-2.0
-2,72.02%,gpt-4o-2024-11-20 (Prompt),https://openai.com/index/hello-gpt-4o/,N/A,N/A,N/A,N/A,88.10%,79.42%,95.50%,94.00%,83.50%,89.38%,100.00%,94.00%,86.00%,77.50%,79.65%,83.72%,79.77%,87.50%,70.83%,47.62%,59.00%,41.00%,35.50%,55.00%,83.33%,83.76%,OpenAI,Proprietary
-3,69.56%,gpt-4o-2024-11-20 (FC),https://openai.com/index/hello-gpt-4o/,N/A,N/A,N/A,N/A,87.42%,77.17%,93.50%,93.00%,86.00%,89.20%,88.29%,92.00%,94.00%,82.50%,79.61%,81.01%,78.82%,87.50%,75.00%,41.00%,62.50%,6.00%,37.50%,58.00%,83.33%,83.15%,OpenAI,Proprietary
+2,72.02%,gpt-4o-2024-11-20 (Prompt),https://openai.com/index/hello-gpt-4o/,13.54,0.78,0.93,1.48,88.10%,79.42%,95.50%,94.00%,83.50%,89.38%,100.00%,94.00%,86.00%,77.50%,79.65%,83.72%,79.77%,87.50%,70.83%,47.62%,59.00%,41.00%,35.50%,55.00%,83.33%,83.76%,OpenAI,Proprietary
+3,69.56%,gpt-4o-2024-11-20 (FC),https://openai.com/index/hello-gpt-4o/,8.23,1.11,1.73,2.29,87.42%,77.17%,93.50%,93.00%,86.00%,89.20%,88.29%,92.00%,94.00%,82.50%,79.61%,81.01%,78.82%,87.50%,75.00%,41.00%,62.50%,6.00%,37.50%,58.00%,83.33%,83.15%,OpenAI,Proprietary
4,67.94%,watt-tool-8B (FC),https://huggingface.co/watt-ai/watt-tool-8B/,N/A,1.31,2.79,4.04,86.56%,76.75%,95.00%,94.00%,80.50%,89.34%,97.86%,94.00%,88.00%,77.50%,76.37%,75.97%,77.49%,87.50%,66.67%,39.13%,47.00%,41.50%,27.50%,40.50%,83.33%,83.15%,Watt AI Lab,Apache-2.0
5,67.87%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.22,2.47,6.27,5.08,84.73%,70.42%,91.00%,90.00%,87.50%,85.21%,87.36%,90.00%,86.00%,77.50%,80.45%,83.33%,78.63%,81.25%,70.83%,38.12%,54.00%,13.50%,35.50%,49.50%,72.22%,83.81%,OpenAI,Proprietary
-6,66.68%,o1-2024-12-17 (Prompt),https://openai.com/o1/,N/A,N/A,N/A,N/A,85.67%,72.67%,93.50%,91.50%,85.00%,79.77%,58.57%,92.00%,86.00%,82.50%,80.45%,81.78%,76.54%,81.25%,70.83%,36.00%,50.50%,0.50%,48.50%,44.50%,72.22%,87.78%,OpenAI,Proprietary
+6,66.68%,o1-2024-12-17 (Prompt),https://openai.com/o1/,102.47,5.3,4.29,13.0,85.67%,72.67%,93.50%,91.50%,85.00%,79.77%,58.57%,92.00%,86.00%,82.50%,80.45%,81.78%,76.54%,81.25%,70.83%,36.00%,50.50%,0.50%,48.50%,44.50%,72.22%,87.78%,OpenAI,Proprietary
7,64.09%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.49,9.88,3.01,85.21%,74.83%,92.00%,90.00%,84.00%,83.57%,83.29%,92.00%,84.00%,75.00%,74.37%,78.29%,76.16%,87.50%,70.83%,34.12%,47.50%,19.50%,29.00%,40.50%,83.33%,74.75%,OpenAI,Proprietary
8,62.76%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.76,8.44,10.06,17.57,78.92%,71.17%,89.00%,83.50%,72.00%,82.70%,89.29%,86.00%,78.00%,77.50%,78.05%,71.71%,71.60%,75.00%,79.17%,28.25%,40.50%,5.00%,34.50%,33.00%,61.11%,89.62%,OpenAI,Proprietary
9,62.63%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,14.06,57.4,35.06,89.88%,76.00%,97.00%,95.00%,91.50%,91.32%,99.29%,94.00%,92.00%,80.00%,76.59%,81.01%,83.29%,68.75%,75.00%,21.38%,31.50%,21.00%,26.50%,6.50%,72.22%,76.08%,MeetKai,MIT
@@ -34,16 +34,16 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s
33,54.86%,Hammer2.1-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.1-1.5b,N/A,2.73,3.86,7.45,82.79%,74.67%,92.00%,84.50%,80.00%,83.39%,86.57%,90.00%,82.00%,75.00%,70.59%,70.93%,69.80%,50.00%,62.50%,10.50%,14.50%,12.50%,9.00%,6.00%,77.78%,79.27%,MadeAgents,cc-by-nc-4.0
34,54.70%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,12.74,25.13,24.76,81.06%,74.25%,95.50%,81.00%,73.50%,79.88%,74.00%,96.00%,82.00%,67.50%,75.08%,71.32%,74.93%,50.00%,62.50%,10.00%,16.50%,8.50%,7.50%,7.50%,94.44%,77.11%,Salesforce,cc-by-nc-4.0
35,54.52%,GoGoAgent,https://gogoagent.ai,N/A,2.66,3.08,5.56,86.23%,75.42%,93.00%,92.00%,84.50%,89.86%,95.43%,96.00%,88.00%,80.00%,73.92%,72.09%,75.40%,68.75%,66.67%,1.00%,1.50%,2.00%,0.50%,0.00%,77.78%,83.12%,BitAgent,Proprietary
-36,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,N/A,N/A,N/A,N/A,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary
+36,54.46%,o1-2024-12-17 (FC),https://openai.com/o1/,68.63,4.86,5.1,13.75,40.23%,67.92%,93.00%,0.00%,0.00%,38.66%,60.64%,94.00%,0.00%,0.00%,77.92%,81.01%,79.01%,0.00%,0.00%,41.00%,52.50%,38.00%,30.50%,43.00%,72.22%,81.97%,OpenAI,Proprietary
37,54.26%,claude-3.5-haiku-20241022 (Prompt),https://www.anthropic.com/news/3-5-models-and-computer-use,0.48,1.84,5.21,3.57,83.19%,76.25%,93.00%,84.00%,79.50%,84.71%,97.86%,90.00%,76.00%,75.00%,70.64%,83.72%,75.02%,87.50%,54.17%,9.75%,16.00%,0.50%,8.00%,14.50%,77.78%,65.78%,Anthropic,Proprietary
38,54.09%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,4.95,14.92,14.75,89.98%,77.92%,96.00%,94.50%,91.50%,90.12%,94.00%,98.00%,86.00%,82.50%,62.06%,77.13%,76.16%,87.50%,62.50%,12.38%,16.50%,13.00%,10.50%,9.50%,100.00%,54.78%,Meta,Meta Llama 3 Community
39,53.88%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.38,0.87,1.45,1.47,83.94%,74.25%,93.50%,89.00%,79.00%,83.79%,96.14%,88.00%,86.00%,65.00%,63.93%,80.62%,79.68%,43.75%,58.33%,19.50%,32.50%,11.50%,21.50%,12.50%,94.44%,36.53%,OpenAI,Proprietary
40,53.66%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,4.54,11.64,11.02,86.46%,75.33%,94.50%,91.50%,84.50%,88.29%,92.14%,90.00%,86.00%,85.00%,67.35%,75.97%,74.93%,62.50%,70.83%,7.62%,9.50%,8.50%,7.00%,5.50%,88.89%,65.16%,Qwen,apache-2.0
-41,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,N/A,N/A,N/A,N/A,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary
+41,53.24%,claude-3.5-haiku-20241022 (FC),https://www.anthropic.com/news/3-5-models-and-computer-use,0.83,2.83,1.94,5.18,40.62%,68.00%,92.00%,2.50%,0.00%,50.46%,87.86%,90.00%,24.00%,0.00%,72.28%,82.17%,78.35%,18.75%,0.00%,40.00%,54.50%,26.50%,35.00%,44.00%,83.33%,63.68%,Anthropic,Proprietary
42,53.03%,FireFunction-v2 (FC),https://huggingface.co/fireworks-ai/firefunction-v2,N/A,2.13,1.17,3.87,88.46%,80.33%,94.00%,91.50%,88.00%,87.54%,96.64%,92.00%,84.00%,77.50%,65.57%,78.29%,78.35%,56.25%,70.83%,8.62%,13.50%,7.00%,11.00%,3.00%,94.44%,53.02%,Fireworks,Apache 2.0
43,52.55%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,4.96,5.57,10.33,67.65%,73.58%,90.00%,69.00%,38.00%,74.05%,89.21%,90.00%,72.00%,45.00%,70.99%,74.03%,79.30%,43.75%,58.33%,15.50%,26.00%,13.00%,11.50%,11.50%,94.44%,67.15%,Salesforce,cc-by-nc-4.0
44,52.17%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,11.07,4.57,18.11,9.97,73.12%,69.50%,88.50%,69.00%,65.50%,81.57%,93.29%,86.00%,72.00%,75.00%,77.52%,75.19%,74.07%,81.25%,54.17%,0.38%,1.00%,0.00%,0.00%,0.50%,66.67%,85.93%,Mistral AI,Proprietary
-45,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,N/A,N/A,N/A,N/A,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0
+45,52.17%,Command R7B (FC),https://cohere.com/blog/command-r7b,0.1,1.35,4.86,2.47,81.67%,68.17%,91.50%,85.50%,81.50%,84.02%,87.07%,92.00%,82.00%,75.00%,69.21%,63.18%,58.69%,56.25%,66.67%,5.00%,6.50%,1.50%,6.50%,5.50%,55.56%,81.02%,Cohere,cc-by-nc-4.0
46,52.17%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,4.97,8.07,13.9,88.94%,79.75%,92.50%,91.50%,92.00%,89.09%,87.86%,98.00%,88.00%,82.50%,67.04%,84.50%,79.39%,68.75%,62.50%,2.38%,4.50%,2.00%,1.50%,1.50%,94.44%,59.19%,Google,gemma-terms-of-use
47,51.75%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,3.65,3.82,10.68,87.81%,76.75%,95.00%,92.50%,87.00%,88.21%,95.86%,94.00%,78.00%,85.00%,64.90%,80.62%,78.25%,75.00%,66.67%,5.62%,10.00%,4.00%,6.00%,2.50%,100.00%,50.88%,Meta,Meta Llama 3 Community
48,51.73%,Ministral-8B-Instruct-2410 (FC),https://huggingface.co/mistralai/Ministral-8B-Instruct-2410,N/A,12.79,45.03,47.12,83.83%,71.83%,91.50%,84.50%,87.50%,79.57%,71.29%,86.00%,86.00%,75.00%,64.93%,75.19%,72.27%,62.50%,66.67%,11.25%,21.00%,8.50%,10.00%,5.50%,70.59%,55.28%,Mistral AI,Mistral AI Research License
@@ -55,7 +55,7 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s
54,51.22%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.18,1.55,1.96,3.42,82.10%,64.42%,93.50%,85.50%,85.00%,77.66%,56.14%,94.00%,88.00%,72.50%,65.93%,77.13%,69.61%,75.00%,66.67%,9.12%,15.00%,3.50%,9.00%,9.00%,66.67%,63.19%,Mistral AI,Proprietary
55,50.70%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,10.3,52.75,24.53,84.21%,72.83%,93.50%,87.00%,83.50%,86.30%,83.71%,96.00%,88.00%,77.50%,60.95%,73.26%,73.31%,56.25%,50.00%,9.25%,12.00%,10.00%,7.00%,8.00%,77.78%,48.82%,Meta,Meta Llama 3 Community
56,50.33%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,12.83,1.36,6.03,3.17,88.02%,78.58%,94.00%,89.50%,90.00%,87.77%,93.57%,96.00%,84.00%,77.50%,65.93%,82.17%,72.65%,81.25%,75.00%,0.50%,1.00%,0.00%,0.00%,1.00%,83.33%,55.09%,Mistral AI,Proprietary
-57,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,N/A,N/A,N/A,N/A,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0
+57,49.32%,Command-R-Plus (FC),https://txt.cohere.com/command-r-plus-microsoft-azure,7.8,2.58,9.12,3.87,77.02%,72.08%,89.50%,82.50%,64.00%,81.21%,90.86%,90.00%,84.00%,60.00%,58.91%,69.77%,58.78%,62.50%,45.83%,13.12%,16.50%,10.00%,9.00%,17.00%,72.22%,53.16%,Cohere For AI,cc-by-nc-4.0
58,49.28%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,1.84,1.74,5.0,82.46%,72.83%,91.50%,84.00%,81.50%,86.36%,84.93%,92.00%,86.00%,82.50%,59.57%,67.83%,56.32%,43.75%,54.17%,3.38%,6.00%,1.50%,4.50%,1.50%,88.89%,74.82%,IBM,Apache-2.0
59,48.29%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.16,0.72,1.79,1.21,72.85%,77.92%,93.50%,67.00%,53.00%,70.39%,57.57%,90.00%,74.00%,60.00%,68.46%,79.84%,78.63%,75.00%,58.33%,5.62%,9.00%,2.00%,7.00%,4.50%,94.44%,58.39%,OpenAI,Proprietary
60,47.27%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,3.97,3.91,8.21,76.79%,64.17%,89.50%,80.00%,73.50%,76.23%,70.43%,94.00%,78.00%,62.50%,64.90%,71.71%,65.81%,56.25%,50.00%,2.38%,4.50%,1.50%,2.00%,1.50%,44.44%,60.78%,NousResearch,apache-2.0