-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Copilot Arena leaderboard (#3618)
Co-authored-by: Wayne Chi <[email protected]>
- Loading branch information
1 parent
1cd4b74
commit 7a912c7
Showing
2 changed files
with
105 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import gradio as gr | ||
import pandas as pd | ||
import requests | ||
import os | ||
|
||
from fastchat.serve.monitor.monitor import recompute_final_ranking | ||
|
||
copilot_arena_leaderboard_url = os.getenv("COPILOT_ARENA_LEADERBOARD_URL") | ||
|
||
if not copilot_arena_leaderboard_url: | ||
raise ValueError( | ||
"COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. " | ||
"Please configure it to a valid URL." | ||
) | ||
|
||
|
||
def process_copilot_arena_leaderboard(leaderboard): | ||
leaderboard = leaderboard.copy().loc[leaderboard["visibility"] == "public"] | ||
leaderboard["score"] = leaderboard["score"].round().astype(int) | ||
leaderboard["rating_q975"] = leaderboard["upper"].round().astype(int) | ||
leaderboard["rating_q025"] = leaderboard["lower"].round().astype(int) | ||
|
||
leaderboard["upper_diff"] = leaderboard["rating_q975"] - leaderboard["score"] | ||
leaderboard["lower_diff"] = leaderboard["score"] - leaderboard["rating_q025"] | ||
|
||
leaderboard["confidence_interval"] = ( | ||
"+" | ||
+ leaderboard["upper_diff"].astype(str) | ||
+ " / -" | ||
+ leaderboard["lower_diff"].astype(str) | ||
) | ||
|
||
rankings_ub = recompute_final_ranking(leaderboard) | ||
leaderboard.insert(loc=0, column="Rank* (UB)", value=rankings_ub) | ||
|
||
leaderboard = leaderboard.sort_values( | ||
by=["Rank* (UB)", "score"], ascending=[True, False] | ||
) | ||
|
||
return leaderboard | ||
|
||
|
||
def build_copilot_arena_tab(): | ||
if copilot_arena_leaderboard_url is None: | ||
print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.") | ||
return | ||
response = requests.get(copilot_arena_leaderboard_url) | ||
if response.status_code == 200: | ||
leaderboard = pd.DataFrame(response.json()["elo_data"]) | ||
leaderboard = process_copilot_arena_leaderboard(leaderboard) | ||
leaderboard = leaderboard.rename( | ||
columns={ | ||
"name": "Model", | ||
"confidence_interval": "Confidence Interval", | ||
"score": "Arena Score", | ||
"organization": "Organization", | ||
"votes": "Votes", | ||
} | ||
) | ||
|
||
column_order = [ | ||
"Rank* (UB)", | ||
"Model", | ||
"Arena Score", | ||
"Confidence Interval", | ||
"Votes", | ||
"Organization", | ||
] | ||
leaderboard = leaderboard[column_order] | ||
num_models = len(leaderboard) | ||
total_battles = int(leaderboard["Votes"].sum()) // 2 | ||
md = f""" | ||
[Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles. | ||
""" | ||
|
||
gr.Markdown(md, elem_id="leaderboard_markdown") | ||
gr.DataFrame( | ||
leaderboard, | ||
datatype=["str" for _ in leaderboard.columns], | ||
elem_id="arena_hard_leaderboard", | ||
height=600, | ||
wrap=True, | ||
interactive=False, | ||
column_widths=[70, 130, 60, 80, 50, 80], | ||
) | ||
|
||
gr.Markdown( | ||
""" | ||
***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model. | ||
Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n | ||
**Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound. | ||
""", | ||
elem_id="leaderboard_markdown", | ||
) | ||
else: | ||
gr.Markdown("Error with fetching Copilot Arena data. Check back in later.") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters