Leaderboard 2.0: added performance x n_parameters plot + more benchmark info (#1437)

* Added elementary speed/performance plot

* Refactored table formatting code

* Bumped Gradio version

* Added more general info to benchmark description markdown block

* Adjusted margin and range on plot

* Made hover information easier to read on plot

* Made range scaling dynamic in plot

* Moved citation next to benchmark description

* Made titles in benchmark info bold
x-tabdeveloping authored Nov 12, 2024
1 parent 19aefa3 commit 76c2112
Showing 4 changed files with 124 additions and 32 deletions.
25 changes: 22 additions & 3 deletions mteb/leaderboard/app.py
@@ -9,6 +9,7 @@

import mteb
from mteb.caching import json_cache
from mteb.leaderboard.figures import performance_size_plot
from mteb.leaderboard.table import scores_to_tables


@@ -32,11 +33,22 @@ def update_citation(benchmark_name: str) -> str:
    return citation


def update_description(benchmark_name: str) -> str:
def update_description(
    benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str]
) -> str:
    benchmark = mteb.get_benchmark(benchmark_name)
    description = f"## {benchmark.name}\n{benchmark.description}\n"
    n_languages = len(languages)
    n_task_types = len(task_types)
    n_tasks = len(benchmark.tasks)
    n_domains = len(domains)
    description += f" - **Number of languages**: {n_languages}\n"
    description += f" - **Number of datasets**: {n_tasks}\n"
    description += f" - **Number of task types**: {n_task_types}\n"
    description += f" - **Number of domains**: {n_domains}\n"
    if str(benchmark.reference) != "None":
        description += f"\n[Click for More Info]({benchmark.reference})"

    return description
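For reference, a rough usage sketch of the updated `update_description` (not part of the diff): the benchmark name and filter values below are illustrative, and the counts simply reflect whatever is currently selected in the sidebar.

```python
from mteb.leaderboard.app import update_description  # module path per this PR

# Illustrative arguments; in the app these come from the sidebar widgets.
md = update_description(
    "MTEB(Multilingual)",                        # hypothetical benchmark name
    languages=["eng", "fra"],                    # selected languages
    task_types=["Classification", "Retrieval"],  # selected task types
    domains=["Legal", "Medical"],                # selected domains
)
print(md)  # "## <benchmark>\n<description>\n - **Number of languages**: 2\n ..."
```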


@@ -194,14 +206,21 @@ def update_task_info(task_names: str) -> str:
                interactive=True,
            )
    scores = gr.State(default_scores)
    description = gr.Markdown(update_description, inputs=[benchmark_select])
    with gr.Row():
        with gr.Column():
            description = gr.Markdown(
                update_description,
                inputs=[benchmark_select, lang_select, type_select, domain_select],
            )
            citation = gr.Markdown(update_citation, inputs=[benchmark_select])
        with gr.Column():
            plot = gr.Plot(performance_size_plot, inputs=[summary_table])
    with gr.Tab("Summary"):
        summary_table.render()
    with gr.Tab("Performance per task"):
        per_task_table.render()
    with gr.Tab("Task information"):
        task_info_table = gr.DataFrame(update_task_info, inputs=[task_select])
    citation = gr.Markdown(update_citation, inputs=[benchmark_select])

    @gr.on(inputs=[scores, searchbar], outputs=[summary_table, per_task_table])
    def update_tables(scores, search_query: str):
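The layout above leans on Gradio's reactive-value pattern, the same one already used for `update_citation`: constructing a component with a callable plus `inputs=` makes it re-render whenever any of those inputs change. A minimal standalone sketch of that pattern (the `greet` demo is illustrative, not part of this PR):

```python
import gradio as gr


def greet(name: str) -> str:
    return f"## Hello, {name}!"


with gr.Blocks() as demo:
    name_box = gr.Textbox("world", label="Name")
    # Same pattern as gr.Markdown(update_description, inputs=[...]) above:
    # the callable is re-run whenever name_box changes.
    gr.Markdown(greet, inputs=[name_box])

if __name__ == "__main__":
    demo.launch()
```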
82 changes: 82 additions & 0 deletions mteb/leaderboard/figures.py
@@ -0,0 +1,82 @@
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


def parse_n_params(text: str) -> int:
    if text.endswith("M"):
        return float(text[:-1]) * 1e6
    if text.endswith("B"):
        return float(text[:-1]) * 1e9


def parse_model_name(name: str) -> str:
    name, _ = name.split("]")
    return name[1:]


models_to_annotate = [
    "all-MiniLM-L6-v2",
    "GritLM-7B",
    "LaBSE",
    "multilingual-e5-large-instruct",
]


def performance_size_plot(df: pd.DataFrame) -> go.Figure:
    df = df.copy()
    df["Number of Parameters"] = df["Number of Parameters"].map(parse_n_params)
    df["Model"] = df["Model"].map(parse_model_name)
    df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
    df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int)
    df["Max Tokens"] = df["Max Tokens"].map(int)
    df["Log(Tokens)"] = np.log10(df["Max Tokens"])
    min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max()
    fig = px.scatter(
        df,
        x="Number of Parameters",
        y="Mean (Task)",
        log_x=True,
        template="plotly_white",
        text="model_text",
        size="Embedding Dimensions",
        color="Log(Tokens)",
        range_color=[2, 5],
        range_x=[8 * 1e6, 11 * 1e9],
        range_y=[min(0, min_score * 1.25), max_score * 1.25],
        hover_data={
            "Max Tokens": True,
            "Embedding Dimensions": True,
            "Number of Parameters": True,
            "Mean (Task)": True,
            "Rank (Borda)": True,
            "Log(Tokens)": False,
            "model_text": False,
        },
        hover_name="Model",
    )
    fig.update_layout(
        coloraxis_colorbar=dict(
            title="Max Tokens",
            tickvals=[2, 3, 4, 5],
            ticktext=[
                "100",
                "1K",
                "10K",
                "100K",
            ],
        ),
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
        ),
    )
    fig.update_traces(
        textposition="top center",
    )
    fig.update_layout(
        font=dict(size=16, color="black"),
        margin=dict(b=20, t=10, l=20, r=10),
    )
    return fig
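A hypothetical smoke test for `performance_size_plot` (not part of the commit): the column names mirror the summary table that `gr.Plot(performance_size_plot, inputs=[summary_table])` feeds in, but the two rows below are made up.

```python
import pandas as pd

from mteb.leaderboard.figures import performance_size_plot

# Made-up rows; only the column names are taken from the summary table.
df = pd.DataFrame(
    {
        "Model": [
            "[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)",
            "[GritLM-7B](https://huggingface.co/GritLM/GritLM-7B)",
        ],
        "Number of Parameters": ["23M", "7B"],  # parse_n_params only handles "M"/"B" suffixes
        "Embedding Dimensions": [384, 4096],
        "Max Tokens": [512, 4096],
        "Mean (Task)": [56.1, 66.8],
        "Rank (Borda)": [2, 1],
    }
)

performance_size_plot(df).show()
```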
47 changes: 19 additions & 28 deletions mteb/leaderboard/table.py
@@ -117,11 +117,6 @@ def scores_to_tables(
    joint_table = joint_table.drop(columns=["model_revision"])
    model_metas = joint_table["model_name"].map(get_model_meta)
    joint_table["model_link"] = model_metas.map(lambda m: m.reference)
    # joint_table.insert(
    #     1,
    #     "Rank (Mean)",
    #     joint_table["mean"].rank(ascending=False, method="min").astype(int),
    # )
    joint_table.insert(
        1,
        "Max Tokens",
@@ -163,36 +158,32 @@ def scores_to_tables(
        }
    )
    joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))
    to_format = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
    joint_table[to_format] = joint_table[to_format].map(format_scores)
    joint_table = joint_table.style.highlight_max(
        subset=to_format,
        props="font-weight: bold",
    )
    joint_table = joint_table.format(
        "{:.2f}", subset=joint_table.data.select_dtypes("number").columns
    )
    joint_table = joint_table.format("{:,}", subset=["Rank (Borda)"])
    joint_table = joint_table.highlight_min(
        subset=["Rank (Borda)"], props="font-weight: bold"
    )
    numerics = per_task.select_dtypes("number").columns
    per_task[numerics] = per_task[numerics].map(format_scores)
    per_task = per_task.style.highlight_max(
        subset=numerics, props="font-weight: bold"
    ).format("{:.2f}", subset=numerics)
    column_widths = get_column_widths(joint_table.data)
    column_widths = get_column_widths(joint_table)
    # overriding for model name
    column_widths[1] = "250px"
    column_types = get_column_types(joint_table.data)
    column_types = get_column_types(joint_table)
    # setting model name column to markdown
    column_types[1] = "markdown"
    score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
    joint_table[score_columns] *= 100
    joint_table_style = (
        joint_table.style.format(
            {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
        )
        .highlight_min("Rank (Borda)", props="font-weight: bold")
        .highlight_max(subset=score_columns, props="font-weight: bold")
    )
    task_score_columns = per_task.select_dtypes("number").columns
    per_task[task_score_columns] *= 100
    per_task_style = per_task.style.format(
        "{:.2f}", subset=task_score_columns
    ).highlight_max(subset=task_score_columns, props="font-weight: bold")
    return (
        gr.DataFrame(
            joint_table,
            column_widths=column_widths,
            joint_table_style,
            # column_widths=column_widths,
            datatype=column_types,
            wrap=True,
        ),
        gr.DataFrame(per_task),
        gr.DataFrame(per_task_style),
    )
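The refactor keeps the score columns numeric (scaled to 0–100) and moves all display concerns into a pandas Styler, which `gr.DataFrame` can render directly. A minimal sketch of that pattern with illustrative data:

```python
import pandas as pd

# Illustrative frame; in table.py the numeric score columns are scaled by 100 first.
df = pd.DataFrame({"Rank (Borda)": [1, 2], "Mean (Task)": [66.84, 55.21]})

styled = (
    df.style.format({"Mean (Task)": "{:.2f}", "Rank (Borda)": "{:.0f}"})
    .highlight_min(subset=["Rank (Borda)"], props="font-weight: bold")
    .highlight_max(subset=["Mean (Task)"], props="font-weight: bold")
)
# `styled` can be passed straight to gr.DataFrame, which is what scores_to_tables
# now does with joint_table_style and per_task_style.
```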
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint
codecarbon = ["codecarbon"]
speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"]
peft = ["peft>=0.11.0"]
leaderboard = ["gradio>=4.44.0", "gradio_rangeslider>=0.0.6"]
leaderboard = ["gradio>=5.5.0", "gradio_rangeslider>=0.0.8"]
flagembedding = ["FlagEmbedding"]
jina = ["einops>=0.8.0"]
flash_attention = ["flash-attn>=2.6.3"]
