diff --git a/fastchat/constants.py b/fastchat/constants.py
index 5be0b17b6..9605b03ed 100644
--- a/fastchat/constants.py
+++ b/fastchat/constants.py
@@ -7,6 +7,13 @@
REPO_PATH = os.path.dirname(os.path.dirname(__file__))
+# Survey Link URL (to be removed)
+SURVEY_LINK = """
+
+
We would love your feedback! Fill out this short survey to tell us what you like about the arena, what you don't like, and what you want to see in the future.
+
+
"""
+
##### For the gradio web server
SERVER_ERROR_MSG = (
"**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py
index 7b06fcc8d..b1c5de6cb 100644
--- a/fastchat/serve/gradio_block_arena_anony.py
+++ b/fastchat/serve/gradio_block_arena_anony.py
@@ -15,6 +15,7 @@
    SLOW_MODEL_MSG,
    BLIND_MODE_INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import flash_buttons
@@ -412,10 +413,12 @@ def bot_response_multi(
def build_side_by_side_ui_anony(models):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
+
## 📣 News
- Chatbot Arena now supports images in beta. Check it out [here](https://chat.lmsys.org/?vision).
diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py
index f8b381522..7ee19b041 100644
--- a/fastchat/serve/gradio_block_arena_named.py
+++ b/fastchat/serve/gradio_block_arena_named.py
@@ -14,6 +14,7 @@
    CONVERSATION_LIMIT_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_web_server import (
@@ -306,10 +307,11 @@ def flash_buttons():
def build_side_by_side_ui_named(models):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
## 📜 Rules
- Ask any question to two chosen models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one!
diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py
index 1d1d8b9e2..25ff78c08 100644
--- a/fastchat/serve/gradio_block_arena_vision.py
+++ b/fastchat/serve/gradio_block_arena_vision.py
@@ -22,6 +22,7 @@
    CONVERSATION_LIMIT_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import (
    get_conversation_template,
@@ -255,9 +256,11 @@ def build_single_vision_language_model_ui(
    models, add_promotion_links=False, random_questions=None
):
    promotion = (
-        """
+        f"""
- [GitHub](https://github.com/lm-sys/FastChat) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx)
+{SURVEY_LINK}
+
**❗️ For research purposes, we log user prompts and images, and may release this data to the public in the future. Please do not upload any confidential or personal information.**
Note: You can only chat with one image per conversation. You can upload images less than 15MB. Click the "Random Example" button to chat with a random image."""
diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py
index 2c11957c6..7da3f5405 100644
--- a/fastchat/serve/gradio_block_arena_vision_anony.py
+++ b/fastchat/serve/gradio_block_arena_vision_anony.py
@@ -17,6 +17,7 @@
    SLOW_MODEL_MSG,
    BLIND_MODE_INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import flash_buttons
@@ -377,6 +378,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
## 📜 Rules
- Ask any question to two anonymous models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one!
diff --git a/fastchat/serve/gradio_block_arena_vision_named.py b/fastchat/serve/gradio_block_arena_vision_named.py
index 524c38b8a..ecca169ca 100644
--- a/fastchat/serve/gradio_block_arena_vision_named.py
+++ b/fastchat/serve/gradio_block_arena_vision_named.py
@@ -18,6 +18,7 @@
    SLOW_MODEL_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import (
@@ -247,10 +248,12 @@ def add_text(
def build_side_by_side_vision_ui_named(models, random_questions=None):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx)
+{SURVEY_LINK}
+
## 📜 Rules
- Chat with any two models side-by-side and vote!
- You can continue chatting for multiple rounds.
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 878f562bf..f47883beb 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -18,12 +18,26 @@
import gradio as gr
import numpy as np
+from fastchat.constants import SURVEY_LINK
from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files
from fastchat.serve.monitor.clean_battle_data import clean_battle_data
from fastchat.serve.monitor.elo_analysis import report_elo_analysis_results
from fastchat.utils import build_logger, get_window_url_params_js
+from fastchat.serve.monitor.monitor_md import (
+    cat_name_to_baseline,
+    key_to_category_name,
+    arena_hard_title,
+    make_default_md_1,
+    make_default_md_2,
+    make_arena_leaderboard_md,
+    make_category_arena_leaderboard_md,
+    make_full_leaderboard_md,
+    make_leaderboard_md_live,
+)
+
+
notebook_url = (
"https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
)
@@ -32,92 +46,20 @@
leader_component_values = [None] * 5
-def make_default_md_1(mirror=False):
- link_color = "#1976D2" # This color should be clear in both light and dark mode
- leaderboard_md = f"""
- # 🏆 LMSYS Chatbot Arena Leaderboard
- [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
- """
-
- return leaderboard_md
-
-
-def make_default_md_2(mirror=False):
- mirror_str = "This is a mirror of the live leaderboard created and maintained by the LMSYS Organization. Please link to leaderboard.lmsys.org for citation purposes."
- leaderboard_md = f"""
- {mirror_str if mirror else ""}
-
- LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 1,000,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings in Elo-scale.
- You can find more details in our paper. **Chatbot arena is dependent on community participation, please contribute by casting your vote!**
- """
-
- return leaderboard_md
-
-
-def make_arena_leaderboard_md(arena_df, last_updated_time, vision=False):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-    space = " "
-
-    leaderboard_md = f"""
-Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
-"""
-    if not vision:
-        leaderboard_md += """
-📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)! This is still in preview and subject to change.
-"""
-
-    leaderboard_md += f"""
-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!
-"""
-    return leaderboard_md
-
-
-def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-    space = " "
-    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
-    total_subset_models = len(arena_subset_df)
-    leaderboard_md = f"""### {cat_name_to_explanation[name]}
-#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
-"""
-    return leaderboard_md
-
-
-def make_full_leaderboard_md():
- leaderboard_md = """
-Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute model strength.
-- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
-- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
-
-💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
-The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
-Higher values are better for all benchmarks. Empty cells mean not available.
-"""
-    return leaderboard_md
-
-
-def make_leaderboard_md_live(elo_results):
- leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
-def arena_hard_title(date):
- arena_hard_title = f"""
-Last Updated: {date}
-
-**Arena-Hard-Auto v0.1** - an automatic evaluation tool for instruction-tuned LLMs with 500 challenging user queries curated from Chatbot Arena.
-
-We prompt GPT-4-Turbo as judge to compare the models' responses against a baseline model (default: GPT-4-0314). If you are curious to see how well your model might perform on Chatbot Arena, we recommend trying Arena-Hard-Auto. Check out our paper for more details about how Arena-Hard-Auto works as an fully automated data pipeline converting crowdsourced data into high-quality benchmarks ->
-[[Paper](https://arxiv.org/abs/2406.11939) | [Repo](https://github.com/lm-sys/arena-hard-auto)]
- """
- return arena_hard_title
+def recompute_final_ranking(arena_df):
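+    # Rank (UB) = 1 + the number of models whose lower bound (rating_q025) exceeds this model's upper bound (rating_q975).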
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if (
+                arena_df.loc[model_b]["rating_q025"]
+                > arena_df.loc[model_a]["rating_q975"]
+            ):
+                ranking[model_a] += 1
+    return list(ranking.values())
def update_elo_components(
@@ -316,35 +258,6 @@ def create_ranking_str(ranking, ranking_difference):
return f"{int(ranking)}"
-def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
-    ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
-                continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
-    return list(ranking.values())
-
-
-def highlight_top_models(df):
-    def highlight_max_rank(s):
-        # Pastel Yellow with transparency, rgba(red, green, blue, alpha)
-        highlight_color = "rgba(255, 255, 128, 0.2)"  # 50% transparent
-        if int(s["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1:
-            return [f"background-color: {highlight_color}" for _ in s]
-        else:
-            return ["" for _ in s]
-
-    # Apply and return the styled DataFrame
-    return df.apply(highlight_max_rank, axis=1)
-
-
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
@@ -436,59 +349,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    return values
-key_to_category_name = {
- "full": "Overall",
- "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
- "math": "Math",
- "if": "Instruction Following",
- "multiturn": "Multi-Turn",
- "coding": "Coding",
- "hard_6": "Hard Prompts (Overall)",
- "hard_english_6": "Hard Prompts (English)",
- "long_user": "Longer Query",
- "english": "English",
- "chinese": "Chinese",
- "french": "French",
- "german": "German",
- "spanish": "Spanish",
- "russian": "Russian",
- "japanese": "Japanese",
- "korean": "Korean",
- "no_tie": "Exclude Ties",
- "no_short": "Exclude Short Query (< 5 tokens)",
- "no_refusal": "Exclude Refusal",
- "overall_limit_5_user_vote": "overall_limit_5_user_vote",
- "full_old": "Overall (Deprecated)",
-}
-cat_name_to_explanation = {
- "Overall": "Overall Questions",
- "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
- "Math": "Math",
- "Instruction Following": "Instruction Following",
- "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
- "Coding": "Coding: whether conversation contains code snippets",
- "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
- "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
- "Longer Query": "Longer Query (>= 500 tokens)",
- "English": "English Prompts",
- "Chinese": "Chinese Prompts",
- "French": "French Prompts",
- "German": "German Prompts",
- "Spanish": "Spanish Prompts",
- "Russian": "Russian Prompts",
- "Japanese": "Japanese Prompts",
- "Korean": "Korean Prompts",
- "Exclude Ties": "Exclude Ties and Bothbad",
- "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
- "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
- "overall_limit_5_user_vote": "overall_limit_5_user_vote",
- "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
-}
-cat_name_to_baseline = {
- "Hard Prompts (English)": "English",
-}
-
-
def update_leaderboard_df(arena_table_vals):
    elo_datarame = pd.DataFrame(
        arena_table_vals,
@@ -609,10 +469,10 @@ def update_leaderboard_and_plots(category):
        arena_values = gr.Dataframe(
            headers=[
                "Rank* (UB)",
-                "🤖 Model",
-                "⭐ Arena Score",
-                "📊 95% CI",
-                "🗳️ Votes",
+                "Model",
+                "Arena Score",
+                "95% CI",
+                "Votes",
                "Organization",
                "License",
                "Knowledge Cutoff",
@@ -688,10 +548,10 @@ def update_leaderboard_and_plots(category):
    elo_display_df = gr.Dataframe(
        headers=[
            "Rank* (UB)",
-            "🤖 Model",
-            "⭐ Arena Elo",
-            "📊 95% CI",
-            "🗳️ Votes",
+            "Model",
+            "Arena Elo",
+            "95% CI",
+            "Votes",
            "Organization",
            "License",
            "Knowledge Cutoff",
@@ -800,6 +660,108 @@ def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score):
)
+def get_arena_category_table(elo_results, model_table_df, categories):
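+    # Build a model-by-category table of Rank (UB) values and style the top three ranks in each category column.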
+    new_category_dfs = []
+    for i, category in enumerate(categories):
+        category_df = elo_results[category]["leaderboard_table_df"]
+        category_df[key_to_category_name[category]] = recompute_final_ranking(
+            category_df
+        )
+        if i == 0:
+            new_category_dfs.append(
+                category_df[[key_to_category_name[category], "rating"]]
+            )
+        else:
+            new_category_dfs.append(category_df[[key_to_category_name[category]]])
+    category_df = pd.concat(new_category_dfs, axis=1)
+    category_df = category_df.sort_values(
+        by=[category_df.columns[0], "rating"], ascending=[True, False]
+    )
+    category_df = category_df.drop(columns=["rating"])
+
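+    # Map internal model keys to display names; keys missing from model_table_df return None and are dropped below.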
+    def get_model_name(model_key):
+        try:
+            model_name = model_table_df[model_table_df["key"] == model_key][
+                "Model"
+            ].values[0]
+            return model_name
+        except:
+            return None
+
+    category_df["Model"] = category_df.index
+    category_df["Model"] = category_df["Model"].apply(get_model_name)
+    # remove models that are not in the model table
+    category_df = category_df[category_df["Model"].notnull()]
+    # Move Model column to the front
+    cols = category_df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    category_df = category_df[cols]
+
+    category_df = pd.DataFrame(
+        category_df.values,
+        columns=["Model"] + [key_to_category_name[k] for k in categories],
+    )
+
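+    # Shade rank 1/2/3 cells gold, silver, and bronze; center-align and slightly enlarge all rank cells.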
+    def highlight_top_3(s):
+        return [
+            (
+                "background-color: rgba(255, 215, 0, 0.3); text-align: center; font-size: 110%"
+                if int(v) == 1
+                else "background-color: rgba(211, 211, 211, 0.4); text-align: center; font-size: 110%"
+                if int(v) == 2
+                else "background-color: rgba(209, 139, 71, 0.2); text-align: center; font-size: 110%"
+                if int(v) == 3
+                else "text-align: center; font-size: 110%"
+            )
+            for v in s
+        ]
+
+    return category_df.style.apply(
+        highlight_top_3, subset=[key_to_category_name[k] for k in categories]
+    )
+
+
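+# Render a gr.Dataframe showing each model's Rank (UB) in every given category.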
+def build_category_leaderboard_tab(
+    elo_results, model_table_df, categories, categories_width
+):
+    full_table_vals = get_arena_category_table(elo_results, model_table_df, categories)
+    gr.Dataframe(
+        headers=["Model"] + [key_to_category_name[k] for k in categories],
+        datatype=["markdown"] + ["str" for k in categories],
+        value=full_table_vals,
+        elem_id="full_leaderboard_dataframe",
+        column_widths=[250]
+        + categories_width,  # IMPORTANT: THIS IS HARDCODED WITH THE CURRENT CATEGORIES
+        height=800,
+        wrap=True,
+    )
+
+
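+# Categories shown in the "Ranking Breakdown" tab; the width lists below are hardcoded to match these entries.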
+selected_categories = [
+ "full",
+ "coding",
+ "if",
+ "math",
+ "hard_6",
+ "multiturn",
+ "long_user",
+ "no_refusal",
+]
+selected_categories_width = [85, 75, 120, 75, 125, 110, 100, 100]
+
+language_categories = [
+ "english",
+ "chinese",
+ "german",
+ "french",
+ "spanish",
+ "russian",
+ "japanese",
+ "korean",
+]
+language_categories_width = [100] * len(language_categories)
+
+
def build_leaderboard_tab(
    elo_results_file,
    leaderboard_table_file,
@@ -834,14 +796,60 @@ def build_leaderboard_tab(
    model_table_df = pd.DataFrame(data)
    with gr.Tabs() as tabs:
- with gr.Tab("Arena", id=0):
+ with gr.Tab("Ranking Breakdown", id=0):
+            gr.Markdown(
+                f"""
+
+                We've updated the leaderboard to show model rank (UB) across categories to provide a more holistic comparison. Check out the Arena tab for more categories, statistics, and model info.
+
+                """,
+            )
+            last_updated_time = elo_results_text["full"][
+                "last_updated_datetime"
+            ].split(" ")[0]
+            gr.Markdown(
+                make_arena_leaderboard_md(
+                    elo_results_text["full"]["leaderboard_table_df"],
+                    last_updated_time,
+                ),
+                elem_id="leaderboard_markdown",
+            )
+            gr.Markdown(
+                """Task Leaderboard"""
+            )
+            gr_plots = build_category_leaderboard_tab(
+                elo_results_text,
+                model_table_df,
+                selected_categories,
+                selected_categories_width,
+            )
+            gr.Markdown(
+                """Language Leaderboard"""
+            )
+            build_category_leaderboard_tab(
+                elo_results_text,
+                model_table_df,
+                language_categories,
+                language_categories_width,
+            )
+            gr.Markdown(
+                f"""
+                ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
+                Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
+                See Figure 1 below for visualization of the confidence intervals of model scores.
+
+                Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large.
+                """,
+                elem_id="leaderboard_markdown",
+            )
+ with gr.Tab("Arena", id=1):
gr_plots = build_arena_tab(
elo_results_text,
model_table_df,
default_md,
show_plot=show_plot,
)
- with gr.Tab("📣 NEW: Arena (Vision)", id=1):
+ with gr.Tab("Arena (Vision)", id=2):
build_arena_tab(
elo_results_vision,
model_table_df,
@@ -850,7 +858,7 @@ def build_leaderboard_tab(
                show_plot=show_plot,
            )
        if arena_hard_leaderboard is not None:
-            with gr.Tab("Arena-Hard-Auto", id=2):
+            with gr.Tab("Arena-Hard-Auto", id=3):
                dataFrame = arena_hard_process(
                    leaderboard_table_file, arena_hard_leaderboard
                )
@@ -887,7 +895,7 @@ def build_leaderboard_tab(
                    column_widths=[70, 190, 80, 80, 90, 150],
                )
- with gr.Tab("Full Leaderboard", id=3):
+ with gr.Tab("Full Leaderboard", id=4):
build_full_leaderboard_tab(
elo_results_text, model_table_df, model_to_score
)
diff --git a/fastchat/serve/monitor/monitor_md.py b/fastchat/serve/monitor/monitor_md.py
new file mode 100644
index 000000000..5bbde6225
--- /dev/null
+++ b/fastchat/serve/monitor/monitor_md.py
@@ -0,0 +1,150 @@
+import pandas as pd
+import pickle
+import gradio as gr
+
+from fastchat.constants import SURVEY_LINK
+
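+# Category keys, display names, and markdown builders for the leaderboard UI, moved here from monitor.py.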
+key_to_category_name = {
+ "full": "Overall",
+ "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
+ "math": "Math",
+ "if": "Instruction Following",
+ "multiturn": "Multi-Turn",
+ "coding": "Coding",
+ "hard_6": "Hard Prompts (Overall)",
+ "hard_english_6": "Hard Prompts (English)",
+ "long_user": "Longer Query",
+ "english": "English",
+ "chinese": "Chinese",
+ "french": "French",
+ "german": "German",
+ "spanish": "Spanish",
+ "russian": "Russian",
+ "japanese": "Japanese",
+ "korean": "Korean",
+ "no_tie": "Exclude Ties",
+ "no_short": "Exclude Short Query (< 5 tokens)",
+ "no_refusal": "Exclude Refusal",
+ "overall_limit_5_user_vote": "overall_limit_5_user_vote",
+ "full_old": "Overall (Deprecated)",
+}
+cat_name_to_explanation = {
+ "Overall": "Overall Questions",
+ "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
+ "Math": "Math",
+ "Instruction Following": "Instruction Following",
+ "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
+ "Coding": "Coding: whether conversation contains code snippets",
+ "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+ "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+ "Longer Query": "Longer Query (>= 500 tokens)",
+ "English": "English Prompts",
+ "Chinese": "Chinese Prompts",
+ "French": "French Prompts",
+ "German": "German Prompts",
+ "Spanish": "Spanish Prompts",
+ "Russian": "Russian Prompts",
+ "Japanese": "Japanese Prompts",
+ "Korean": "Korean Prompts",
+ "Exclude Ties": "Exclude Ties and Bothbad",
+ "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
+ "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
+ "overall_limit_5_user_vote": "overall_limit_5_user_vote",
+ "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
+}
+cat_name_to_baseline = {
+ "Hard Prompts (English)": "English",
+}
+
+notebook_url = (
+ "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
+)
+
+basic_component_values = [None] * 6
+leader_component_values = [None] * 5
+
+
+def make_default_md_1(mirror=False):
+ link_color = "#1976D2" # This color should be clear in both light and dark mode
+ leaderboard_md = f"""
+ # 🏆 LMSYS Chatbot Arena Leaderboard
+ [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+ """
+
+ return leaderboard_md
+
+
+def make_default_md_2(mirror=False):
+ mirror_str = "This is a mirror of the live leaderboard created and maintained by the LMSYS Organization. Please link to leaderboard.lmsys.org for citation purposes."
+ leaderboard_md = f"""
+{mirror_str if mirror else ""}
+
+LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 1,000,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings on the Elo scale.
+You can find more details in our paper. **Chatbot Arena depends on community participation, so please contribute by casting your vote!**
+
+{SURVEY_LINK}
+"""
+
+    return leaderboard_md
+
+
+def make_arena_leaderboard_md(arena_df, last_updated_time, vision=False):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = " "
+
+    leaderboard_md = f"""
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
+"""
+
+    leaderboard_md += f"""
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!
+"""
+    return leaderboard_md
+
+
+def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = " "
+    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
+    total_subset_models = len(arena_subset_df)
+    leaderboard_md = f"""### {cat_name_to_explanation[name]}
+#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
+"""
+    return leaderboard_md
+
+
+def make_full_leaderboard_md():
+ leaderboard_md = """
+Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute model strength.
+- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
+- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
+
+💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
+The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
+Higher values are better for all benchmarks. Empty cells mean not available.
+"""
+    return leaderboard_md
+
+
+def make_leaderboard_md_live(elo_results):
+ leaderboard_md = f"""
+# Leaderboard
+Last updated: {elo_results["last_updated_datetime"]}
+{elo_results["leaderboard_table"]}
+"""
+    return leaderboard_md
+
+
+def arena_hard_title(date):
+ arena_hard_title = f"""
+Last Updated: {date}
+
+**Arena-Hard-Auto v0.1** - an automatic evaluation tool for instruction-tuned LLMs with 500 challenging user queries curated from Chatbot Arena.
+
+We prompt GPT-4-Turbo as judge to compare the models' responses against a baseline model (default: GPT-4-0314). If you are curious to see how well your model might perform on Chatbot Arena, we recommend trying Arena-Hard-Auto. Check out our paper for more details about how Arena-Hard-Auto works as an fully automated data pipeline converting crowdsourced data into high-quality benchmarks ->
+[[Paper](https://arxiv.org/abs/2406.11939) | [Repo](https://github.com/lm-sys/arena-hard-auto)]
+ """
+ return arena_hard_title