diff --git a/fastchat/constants.py b/fastchat/constants.py
index 5be0b17b6..9605b03ed 100644
--- a/fastchat/constants.py
+++ b/fastchat/constants.py
@@ -7,6 +7,13 @@
REPO_PATH = os.path.dirname(os.path.dirname(__file__))
+# Survey Link URL (to be removed)
+SURVEY_LINK = """
+
+
We would love your feedback! Fill out this short survey to tell us what you like about the arena, what you don't like, and what you want to see in the future.
+
+
"""
+
##### For the gradio web server
SERVER_ERROR_MSG = (
"**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py
index 7b06fcc8d..b1c5de6cb 100644
--- a/fastchat/serve/gradio_block_arena_anony.py
+++ b/fastchat/serve/gradio_block_arena_anony.py
@@ -15,6 +15,7 @@
    SLOW_MODEL_MSG,
    BLIND_MODE_INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import flash_buttons
@@ -412,10 +413,12 @@ def bot_response_multi(
def build_side_by_side_ui_anony(models):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
+
## 📣 News
- Chatbot Arena now supports images in beta. Check it out [here](https://chat.lmsys.org/?vision).
diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py
index f8b381522..7ee19b041 100644
--- a/fastchat/serve/gradio_block_arena_named.py
+++ b/fastchat/serve/gradio_block_arena_named.py
@@ -14,6 +14,7 @@
    CONVERSATION_LIMIT_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_web_server import (
@@ -306,10 +307,11 @@ def flash_buttons():
def build_side_by_side_ui_named(models):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
## 📜 Rules
- Ask any question to two chosen models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one!
diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py
index 1d1d8b9e2..25ff78c08 100644
--- a/fastchat/serve/gradio_block_arena_vision.py
+++ b/fastchat/serve/gradio_block_arena_vision.py
@@ -22,6 +22,7 @@
    CONVERSATION_LIMIT_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import (
    get_conversation_template,
@@ -255,9 +256,11 @@ def build_single_vision_language_model_ui(
    models, add_promotion_links=False, random_questions=None
):
    promotion = (
-        """
+        f"""
- [GitHub](https://github.com/lm-sys/FastChat) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx)
+{SURVEY_LINK}
+
**❗️ For research purposes, we log user prompts and images, and may release this data to the public in the future. Please do not upload any confidential or personal information.**
Note: You can only chat with one image per conversation. You can upload images less than 15MB. Click the "Random Example" button to chat with a random image."""
diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py
index 2c11957c6..7da3f5405 100644
--- a/fastchat/serve/gradio_block_arena_vision_anony.py
+++ b/fastchat/serve/gradio_block_arena_vision_anony.py
@@ -17,6 +17,7 @@
    SLOW_MODEL_MSG,
    BLIND_MODE_INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import flash_buttons
@@ -377,6 +378,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+{SURVEY_LINK}
## 📜 Rules
- Ask any question to two anonymous models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one!
diff --git a/fastchat/serve/gradio_block_arena_vision_named.py b/fastchat/serve/gradio_block_arena_vision_named.py
index 524c38b8a..ecca169ca 100644
--- a/fastchat/serve/gradio_block_arena_vision_named.py
+++ b/fastchat/serve/gradio_block_arena_vision_named.py
@@ -18,6 +18,7 @@
    SLOW_MODEL_MSG,
    INPUT_CHAR_LEN_LIMIT,
    CONVERSATION_TURN_LIMIT,
+    SURVEY_LINK,
)
from fastchat.model.model_adapter import get_conversation_template
from fastchat.serve.gradio_block_arena_named import (
@@ -247,10 +248,12 @@ def add_text(
def build_side_by_side_vision_ui_named(models, random_questions=None):
- notice_markdown = """
+ notice_markdown = f"""
# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild
[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx)
+{SURVEY_LINK}
+
## 📜 Rules
- Chat with any two models side-by-side and vote!
- You can continue chatting for multiple rounds.
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 878f562bf..f47883beb 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -18,12 +18,26 @@
import gradio as gr
import numpy as np
+from fastchat.constants import SURVEY_LINK
from fastchat.serve.monitor.basic_stats import report_basic_stats, get_log_files
from fastchat.serve.monitor.clean_battle_data import clean_battle_data
from fastchat.serve.monitor.elo_analysis import report_elo_analysis_results
from fastchat.utils import build_logger, get_window_url_params_js
+from fastchat.serve.monitor.monitor_md import (
+    cat_name_to_baseline,
+    key_to_category_name,
+    arena_hard_title,
+    make_default_md_1,
+    make_default_md_2,
+    make_arena_leaderboard_md,
+    make_category_arena_leaderboard_md,
+    make_full_leaderboard_md,
+    make_leaderboard_md_live,
+)
+
+
notebook_url = (
"https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
)
@@ -32,92 +46,20 @@
leader_component_values = [None] * 5
-def make_default_md_1(mirror=False):
- link_color = "#1976D2" # This color should be clear in both light and dark mode
- leaderboard_md = f"""
- # 🏆 LMSYS Chatbot Arena Leaderboard
- [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
- """
-
- return leaderboard_md
-
-
-def make_default_md_2(mirror=False):
- mirror_str = "This is a mirror of the live leaderboard created and maintained by the LMSYS Organization. Please link to leaderboard.lmsys.org for citation purposes."
- leaderboard_md = f"""
- {mirror_str if mirror else ""}
-
- LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 1,000,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings in Elo-scale.
- You can find more details in our paper. **Chatbot arena is dependent on community participation, please contribute by casting your vote!**
- """
-
- return leaderboard_md
-
-
-def make_arena_leaderboard_md(arena_df, last_updated_time, vision=False):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-    space = " "
-
-    leaderboard_md = f"""
-Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
-"""
-    if not vision:
-        leaderboard_md += """
-📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)! This is still in preview and subject to change.
-"""
-
-    leaderboard_md += f"""
-Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!
-"""
-    return leaderboard_md
-
-
-def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
-    total_votes = sum(arena_df["num_battles"]) // 2
-    total_models = len(arena_df)
-    space = " "
-    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
-    total_subset_models = len(arena_subset_df)
-    leaderboard_md = f"""### {cat_name_to_explanation[name]}
-#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
-"""
-    return leaderboard_md
-
-
-def make_full_leaderboard_md():
- leaderboard_md = """
-Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute model strength.
-- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
-- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
-
-💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
-The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
-Higher values are better for all benchmarks. Empty cells mean not available.
-"""
-    return leaderboard_md
-
-
-def make_leaderboard_md_live(elo_results):
- leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
-def arena_hard_title(date):
- arena_hard_title = f"""
-Last Updated: {date}
-
-**Arena-Hard-Auto v0.1** - an automatic evaluation tool for instruction-tuned LLMs with 500 challenging user queries curated from Chatbot Arena.
-
-We prompt GPT-4-Turbo as judge to compare the models' responses against a baseline model (default: GPT-4-0314). If you are curious to see how well your model might perform on Chatbot Arena, we recommend trying Arena-Hard-Auto. Check out our paper for more details about how Arena-Hard-Auto works as an fully automated data pipeline converting crowdsourced data into high-quality benchmarks ->
-[[Paper](https://arxiv.org/abs/2406.11939) | [Repo](https://github.com/lm-sys/arena-hard-auto)]
- """
- return arena_hard_title
+def recompute_final_ranking(arena_df):
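+    # Rank (UB) = 1 + the number of models whose lower bound (rating_q025) exceeds this model's upper bound (rating_q975).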
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if (
+                arena_df.loc[model_b]["rating_q025"]
+                > arena_df.loc[model_a]["rating_q975"]
+            ):
+                ranking[model_a] += 1
+    return list(ranking.values())
def update_elo_components(
@@ -316,35 +258,6 @@ def create_ranking_str(ranking, ranking_difference):
return f"{int(ranking)}"
-def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
-    ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
-                continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
-    return list(ranking.values())
-
-
-def highlight_top_models(df):
-    def highlight_max_rank(s):
-        # Pastel Yellow with transparency, rgba(red, green, blue, alpha)
-        highlight_color = "rgba(255, 255, 128, 0.2)"  # 50% transparent
-        if int(s["Rank* (UB)"].replace("↑", "").replace("↓", "")) == 1:
-            return [f"background-color: {highlight_color}" for _ in s]
-        else:
-            return ["" for _ in s]
-
-    # Apply and return the styled DataFrame
-    return df.apply(highlight_max_rank, axis=1)
-
-
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
@@ -436,59 +349,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    return values
-key_to_category_name = {
- "full": "Overall",
- "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
- "math": "Math",
- "if": "Instruction Following",
- "multiturn": "Multi-Turn",
- "coding": "Coding",
- "hard_6": "Hard Prompts (Overall)",
- "hard_english_6": "Hard Prompts (English)",
- "long_user": "Longer Query",
- "english": "English",
- "chinese": "Chinese",
- "french": "French",
- "german": "German",
- "spanish": "Spanish",
- "russian": "Russian",
- "japanese": "Japanese",
- "korean": "Korean",
- "no_tie": "Exclude Ties",
- "no_short": "Exclude Short Query (< 5 tokens)",
- "no_refusal": "Exclude Refusal",
- "overall_limit_5_user_vote": "overall_limit_5_user_vote",
- "full_old": "Overall (Deprecated)",
-}
-cat_name_to_explanation = {
- "Overall": "Overall Questions",
- "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
- "Math": "Math",
- "Instruction Following": "Instruction Following",
- "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
- "Coding": "Coding: whether conversation contains code snippets",
- "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
- "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
- "Longer Query": "Longer Query (>= 500 tokens)",
- "English": "English Prompts",
- "Chinese": "Chinese Prompts",
- "French": "French Prompts",
- "German": "German Prompts",
- "Spanish": "Spanish Prompts",
- "Russian": "Russian Prompts",
- "Japanese": "Japanese Prompts",
- "Korean": "Korean Prompts",
- "Exclude Ties": "Exclude Ties and Bothbad",
- "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
- "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
- "overall_limit_5_user_vote": "overall_limit_5_user_vote",
- "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
-}
-cat_name_to_baseline = {
- "Hard Prompts (English)": "English",
-}
-
-
def update_leaderboard_df(arena_table_vals):
    elo_datarame = pd.DataFrame(
        arena_table_vals,
@@ -609,10 +469,10 @@ def update_leaderboard_and_plots(category):
        arena_values = gr.Dataframe(
            headers=[
                "Rank* (UB)",
-                "🤖 Model",
-                "⭐ Arena Score",
-                "📊 95% CI",
-                "🗳️ Votes",
+                "Model",
+                "Arena Score",
+                "95% CI",
+                "Votes",
                "Organization",
                "License",
                "Knowledge Cutoff",
@@ -688,10 +548,10 @@ def update_leaderboard_and_plots(category):
    elo_display_df = gr.Dataframe(
        headers=[
            "Rank* (UB)",
-            "🤖 Model",
-            "⭐ Arena Elo",
-            "📊 95% CI",
-            "🗳️ Votes",
+            "Model",
+            "Arena Elo",
+            "95% CI",
+            "Votes",
            "Organization",
            "License",
            "Knowledge Cutoff",
@@ -800,6 +660,108 @@ def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score):
)
+def get_arena_category_table(elo_results, model_table_df, categories):
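+    # Build a model-by-category table of Rank (UB) values and style the top three ranks in each category column.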
+    new_category_dfs = []
+    for i, category in enumerate(categories):
+        category_df = elo_results[category]["leaderboard_table_df"]
+        category_df[key_to_category_name[category]] = recompute_final_ranking(
+            category_df
+        )
+        if i == 0:
+            new_category_dfs.append(
+                category_df[[key_to_category_name[category], "rating"]]
+            )
+        else:
+            new_category_dfs.append(category_df[[key_to_category_name[category]]])
+    category_df = pd.concat(new_category_dfs, axis=1)
+    category_df = category_df.sort_values(
+        by=[category_df.columns[0], "rating"], ascending=[True, False]
+    )
+    category_df = category_df.drop(columns=["rating"])
+
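+    # Map internal model keys to display names; keys missing from model_table_df return None and are dropped below.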
+    def get_model_name(model_key):
+        try:
+            model_name = model_table_df[model_table_df["key"] == model_key][
+                "Model"
+            ].values[0]
+            return model_name
+        except:
+            return None
+
+    category_df["Model"] = category_df.index
+    category_df["Model"] = category_df["Model"].apply(get_model_name)
+    # remove models that are not in the model table
+    category_df = category_df[category_df["Model"].notnull()]
+    # Move Model column to the front
+    cols = category_df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    category_df = category_df[cols]
+
+    category_df = pd.DataFrame(
+        category_df.values,
+        columns=["Model"] + [key_to_category_name[k] for k in categories],
+    )
+
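+    # Shade rank 1/2/3 cells gold, silver, and bronze; center-align and slightly enlarge all rank cells.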
+    def highlight_top_3(s):
+        return [
+            (
+                "background-color: rgba(255, 215, 0, 0.3); text-align: center; font-size: 110%"
+                if int(v) == 1
+                else "background-color: rgba(211, 211, 211, 0.4); text-align: center; font-size: 110%"
+                if int(v) == 2
+                else "background-color: rgba(209, 139, 71, 0.2); text-align: center; font-size: 110%"
+                if int(v) == 3
+                else "text-align: center; font-size: 110%"
+            )
+            for v in s
+        ]
+
+    return category_df.style.apply(
+        highlight_top_3, subset=[key_to_category_name[k] for k in categories]
+    )
+
+
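+# Render a gr.Dataframe showing each model's Rank (UB) in every given category.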
+def build_category_leaderboard_tab(
+    elo_results, model_table_df, categories, categories_width
+):
+    full_table_vals = get_arena_category_table(elo_results, model_table_df, categories)
+    gr.Dataframe(
+        headers=["Model"] + [key_to_category_name[k] for k in categories],
+        datatype=["markdown"] + ["str" for k in categories],
+        value=full_table_vals,
+        elem_id="full_leaderboard_dataframe",
+        column_widths=[250]
+        + categories_width,  # IMPORTANT: THIS IS HARDCODED WITH THE CURRENT CATEGORIES
+        height=800,
+        wrap=True,
+    )
+
+
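+# Categories shown in the "Ranking Breakdown" tab; the width lists below are hardcoded to match these entries.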
+selected_categories = [
+ "full",
+ "coding",
+ "if",
+ "math",
+ "hard_6",
+ "multiturn",
+ "long_user",
+ "no_refusal",
+]
+selected_categories_width = [85, 75, 120, 75, 125, 110, 100, 100]
+
+language_categories = [
+ "english",
+ "chinese",
+ "german",
+ "french",
+ "spanish",
+ "russian",
+ "japanese",
+ "korean",
+]
+language_categories_width = [100] * len(language_categories)
+
+
def build_leaderboard_tab(
    elo_results_file,
    leaderboard_table_file,
@@ -834,14 +796,60 @@ def build_leaderboard_tab(
    model_table_df = pd.DataFrame(data)
    with gr.Tabs() as tabs:
- with gr.Tab("Arena", id=0):
+ with gr.Tab("Ranking Breakdown", id=0):
+            gr.Markdown(
+                f"""
+
+                We've updated the leaderboard to show model rank (UB) across categories to provide a more holistic comparison. Check out the Arena tab for more categories, statistics, and model info.
+
+                """,
+            )
+            last_updated_time = elo_results_text["full"][
+                "last_updated_datetime"
+            ].split(" ")[0]
+            gr.Markdown(
+                make_arena_leaderboard_md(
+                    elo_results_text["full"]["leaderboard_table_df"],
+                    last_updated_time,
+                ),
+                elem_id="leaderboard_markdown",
+            )
+            gr.Markdown(
+                """Task Leaderboard"""
+            )
+            gr_plots = build_category_leaderboard_tab(
+                elo_results_text,
+                model_table_df,
+                selected_categories,
+                selected_categories_width,
+            )
+            gr.Markdown(
+                """Language Leaderboard"""
+            )
+            build_category_leaderboard_tab(
+                elo_results_text,
+                model_table_df,
+                language_categories,
+                language_categories_width,
+            )
+            gr.Markdown(
+                f"""
+                ***Rank (UB)**: model's ranking (upper-bound), defined by one + the number of models that are statistically better than the target model.
+                Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
+                See Figure 1 below for visualization of the confidence intervals of model scores.
+
+                Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large.
+                """,
+                elem_id="leaderboard_markdown",
+            )
+ with gr.Tab("Arena", id=1):
gr_plots = build_arena_tab(
elo_results_text,
model_table_df,
default_md,
show_plot=show_plot,
)
- with gr.Tab("📣 NEW: Arena (Vision)", id=1):
+ with gr.Tab("Arena (Vision)", id=2):
build_arena_tab(
elo_results_vision,
model_table_df,
@@ -850,7 +858,7 @@ def build_leaderboard_tab(
                show_plot=show_plot,
            )
        if arena_hard_leaderboard is not None:
-            with gr.Tab("Arena-Hard-Auto", id=2):
+            with gr.Tab("Arena-Hard-Auto", id=3):
                dataFrame = arena_hard_process(
                    leaderboard_table_file, arena_hard_leaderboard
                )
@@ -887,7 +895,7 @@ def build_leaderboard_tab(
                    column_widths=[70, 190, 80, 80, 90, 150],
                )
- with gr.Tab("Full Leaderboard", id=3):
+ with gr.Tab("Full Leaderboard", id=4):
build_full_leaderboard_tab(
elo_results_text, model_table_df, model_to_score
)
diff --git a/fastchat/serve/monitor/monitor_md.py b/fastchat/serve/monitor/monitor_md.py
new file mode 100644
index 000000000..5bbde6225
--- /dev/null
+++ b/fastchat/serve/monitor/monitor_md.py
@@ -0,0 +1,150 @@
+import pandas as pd
+import pickle
+import gradio as gr
+
+from fastchat.constants import SURVEY_LINK
+
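+# Category keys, display names, and markdown builders for the leaderboard UI, moved here from monitor.py.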
+key_to_category_name = {
+ "full": "Overall",
+ "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
+ "math": "Math",
+ "if": "Instruction Following",
+ "multiturn": "Multi-Turn",
+ "coding": "Coding",
+ "hard_6": "Hard Prompts (Overall)",
+ "hard_english_6": "Hard Prompts (English)",
+ "long_user": "Longer Query",
+ "english": "English",
+ "chinese": "Chinese",
+ "french": "French",
+ "german": "German",
+ "spanish": "Spanish",
+ "russian": "Russian",
+ "japanese": "Japanese",
+ "korean": "Korean",
+ "no_tie": "Exclude Ties",
+ "no_short": "Exclude Short Query (< 5 tokens)",
+ "no_refusal": "Exclude Refusal",
+ "overall_limit_5_user_vote": "overall_limit_5_user_vote",
+ "full_old": "Overall (Deprecated)",
+}
+cat_name_to_explanation = {
+ "Overall": "Overall Questions",
+ "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
+ "Math": "Math",
+ "Instruction Following": "Instruction Following",
+ "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
+ "Coding": "Coding: whether conversation contains code snippets",
+ "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+ "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+ "Longer Query": "Longer Query (>= 500 tokens)",
+ "English": "English Prompts",
+ "Chinese": "Chinese Prompts",
+ "French": "French Prompts",
+ "German": "German Prompts",
+ "Spanish": "Spanish Prompts",
+ "Russian": "Russian Prompts",
+ "Japanese": "Japanese Prompts",
+ "Korean": "Korean Prompts",
+ "Exclude Ties": "Exclude Ties and Bothbad",
+ "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
+ "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
+ "overall_limit_5_user_vote": "overall_limit_5_user_vote",
+ "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
+}
+cat_name_to_baseline = {
+ "Hard Prompts (English)": "English",
+}
+
+notebook_url = (
+ "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
+)
+
+basic_component_values = [None] * 6
+leader_component_values = [None] * 5
+
+
+def make_default_md_1(mirror=False):
+ link_color = "#1976D2" # This color should be clear in both light and dark mode
+ leaderboard_md = f"""
+ # 🏆 LMSYS Chatbot Arena Leaderboard
+ [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
+ """
+
+ return leaderboard_md
+
+
+def make_default_md_2(mirror=False):
+ mirror_str = "This is a mirror of the live leaderboard created and maintained by the LMSYS Organization. Please link to leaderboard.lmsys.org for citation purposes."
+ leaderboard_md = f"""
+{mirror_str if mirror else ""}
+
+LMSYS Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 1,000,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings on the Elo scale.
+You can find more details in our paper. **Chatbot Arena depends on community participation, so please contribute by casting your vote!**
+
+{SURVEY_LINK}
+"""
+
+    return leaderboard_md
+
+
+def make_arena_leaderboard_md(arena_df, last_updated_time, vision=False):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = " "
+
+    leaderboard_md = f"""
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
+"""
+
+    leaderboard_md += f"""
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). You can contribute your vote at [chat.lmsys.org](https://chat.lmsys.org)!
+"""
+    return leaderboard_md
+
+
+def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = " "
+    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
+    total_subset_models = len(arena_subset_df)
+    leaderboard_md = f"""### {cat_name_to_explanation[name]}
+#### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
+"""
+    return leaderboard_md
+
+
+def make_full_leaderboard_md():
+ leaderboard_md = """
+Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
+- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 500K+ user votes to compute model strength.
+- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
+- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
+
+💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
+The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
+Higher values are better for all benchmarks. Empty cells mean not available.
+"""
+    return leaderboard_md
+
+
+def make_leaderboard_md_live(elo_results):
+ leaderboard_md = f"""
+# Leaderboard
+Last updated: {elo_results["last_updated_datetime"]}
+{elo_results["leaderboard_table"]}
+"""
+    return leaderboard_md
+
+
+def arena_hard_title(date):
+ arena_hard_title = f"""
+Last Updated: {date}
+
+**Arena-Hard-Auto v0.1** - an automatic evaluation tool for instruction-tuned LLMs with 500 challenging user queries curated from Chatbot Arena.
+
+We prompt GPT-4-Turbo as judge to compare the models' responses against a baseline model (default: GPT-4-0314). If you are curious to see how well your model might perform on Chatbot Arena, we recommend trying Arena-Hard-Auto. Check out our paper for more details about how Arena-Hard-Auto works as an fully automated data pipeline converting crowdsourced data into high-quality benchmarks ->
+[[Paper](https://arxiv.org/abs/2406.11939) | [Repo](https://github.com/lm-sys/arena-hard-auto)]
+ """
+ return arena_hard_title