Skip to content

Commit

Permalink
Add Copilot Arena leaderboard (#3618)
Browse files Browse the repository at this point in the history
Co-authored-by: Wayne Chi <[email protected]>
  • Loading branch information
adityamittal13 and waynchi authored Dec 19, 2024
1 parent 1cd4b74 commit 7a912c7
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 0 deletions.
96 changes: 96 additions & 0 deletions fastchat/serve/monitor/copilot_arena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import gradio as gr
import pandas as pd
import requests
import os

from fastchat.serve.monitor.monitor import recompute_final_ranking

# Endpoint that serves the Copilot Arena leaderboard JSON. It is taken from
# the environment once, when this module is imported.
copilot_arena_leaderboard_url = os.environ.get("COPILOT_ARENA_LEADERBOARD_URL")

# A missing or empty value aborts the import with a ValueError, so importers
# that want the leaderboard to be optional must guard the import themselves.
if copilot_arena_leaderboard_url in (None, ""):
    raise ValueError(
        "COPILOT_ARENA_LEADERBOARD_URL environment variable is not set. "
        "Please configure it to a valid URL."
    )


def process_copilot_arena_leaderboard(leaderboard):
    """Prepare the raw Copilot Arena leaderboard frame for display.

    Keeps only rows whose ``visibility`` is ``"public"``, rounds the score and
    its bounds to integers, derives a ``"+X / -Y"`` confidence-interval string,
    and prepends a ``"Rank* (UB)"`` column computed by
    ``recompute_final_ranking``.

    Args:
        leaderboard: DataFrame with at least the columns ``visibility``,
            ``score``, ``upper`` and ``lower``. It is not modified.

    Returns:
        A new DataFrame sorted by ``"Rank* (UB)"`` ascending, ties broken by
        ``score`` descending. The helper columns ``rating_q975``,
        ``rating_q025``, ``upper_diff`` and ``lower_diff`` are left in place.
    """
    is_public = leaderboard["visibility"] == "public"
    table = leaderboard.copy().loc[is_public]

    # Round the score and both bounds to whole numbers. The bounds are stored
    # under the rating_q975 / rating_q025 names that are subtracted below and
    # that are present when the frame is handed to recompute_final_ranking.
    rounded_columns = (
        ("score", "score"),
        ("rating_q975", "upper"),
        ("rating_q025", "lower"),
    )
    for target, source in rounded_columns:
        table[target] = table[source].round().astype(int)

    # Distance from the rounded score to each rounded bound.
    table["upper_diff"] = table["rating_q975"] - table["score"]
    table["lower_diff"] = table["score"] - table["rating_q025"]

    upper_text = table["upper_diff"].astype(str)
    lower_text = table["lower_diff"].astype(str)
    table["confidence_interval"] = "+" + upper_text + " / -" + lower_text

    # The rank goes first so it is the leading column of the result.
    table.insert(loc=0, column="Rank* (UB)", value=recompute_final_ranking(table))

    return table.sort_values(by=["Rank* (UB)", "score"], ascending=[True, False])


def build_copilot_arena_tab(timeout=10):
    """Render the Copilot Arena leaderboard into the active Gradio context.

    Downloads the leaderboard JSON from ``copilot_arena_leaderboard_url``,
    processes the rows under its ``"elo_data"`` key with
    ``process_copilot_arena_leaderboard``, and emits an intro paragraph, the
    leaderboard table, and an explanatory footnote.

    If the download fails (a transport error, a timeout, or a non-200 status),
    a short error message is rendered instead of the table. Errors raised
    while decoding or processing the payload are not caught here.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. This
            keeps an unresponsive endpoint from blocking the caller
            indefinitely.

    Returns:
        None. The function only creates Gradio components.
    """
    if copilot_arena_leaderboard_url is None:
        print("Copilot Arena Leaderboard URL is not set. Skipping this leaderboard.")
        return

    try:
        response = requests.get(copilot_arena_leaderboard_url, timeout=timeout)
    except requests.RequestException as e:
        # Treat transport failures like a bad status: log the cause and fall
        # through to the same user-facing message.
        print(f"Unable to fetch Copilot Arena data. Error: {e}")
        response = None

    if response is None or response.status_code != 200:
        gr.Markdown("Error with fetching Copilot Arena data. Check back in later.")
        return

    leaderboard = pd.DataFrame(response.json()["elo_data"])
    leaderboard = process_copilot_arena_leaderboard(leaderboard)
    leaderboard = leaderboard.rename(
        columns={
            "name": "Model",
            "confidence_interval": "Confidence Interval",
            "score": "Arena Score",
            "organization": "Organization",
            "votes": "Votes",
        }
    )

    # Selecting by name also drops the helper columns added during processing.
    column_order = [
        "Rank* (UB)",
        "Model",
        "Arena Score",
        "Confidence Interval",
        "Votes",
        "Organization",
    ]
    leaderboard = leaderboard[column_order]
    num_models = len(leaderboard)
    # NOTE(review): halving presumably reflects that each battle is counted
    # once for each of the two models involved -- confirm against the API.
    total_battles = int(leaderboard["Votes"].sum()) // 2
    md = f"""
    [Copilot Arena](https://blog.lmarena.ai/blog/2024/copilot-arena/) is a free AI coding assistant that provides paired responses from different state-of-the-art LLMs. This leaderboard contains the relative performance and ranking of {num_models} models over {total_battles} battles.
    """

    gr.Markdown(md, elem_id="leaderboard_markdown")
    gr.DataFrame(
        leaderboard,
        datatype=["str"] * len(leaderboard.columns),
        elem_id="arena_hard_leaderboard",
        height=600,
        wrap=True,
        interactive=False,
        column_widths=[70, 130, 60, 80, 50, 80],
    )

    gr.Markdown(
        """
    ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
    Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval). \n
    **Confidence Interval**: represents the range of uncertainty around the Arena Score. It's displayed as +X / -Y, where X is the difference between the upper bound and the score, and Y is the difference between the score and the lower bound.
    """,
        elem_id="leaderboard_markdown",
    )
9 changes: 9 additions & 0 deletions fastchat/serve/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,15 @@ def build_leaderboard_tab(
build_full_leaderboard_tab(
elo_results_text, model_table_df, model_to_score
)
try:
with gr.Tab("Copilot Arena Leaderboard", id=5):
from fastchat.serve.monitor.copilot_arena import (
build_copilot_arena_tab,
)

build_copilot_arena_tab()
except Exception as e:
print(f"Unable to build Copilot Arena's Leaderboard. Error: {e}")

if not show_plot:
gr.Markdown(
Expand Down

0 comments on commit 7a912c7

Please sign in to comment.